From f34194c015612b3c5096af0dbf72ff2db8fe1689 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Sat, 31 Jul 2021 01:22:16 +0800
Subject: [PATCH] Add ncnn deployment examples (#145)

* Add ncnn deployment examples

* fix:add space to depth function (#146)

* Fixing lint

* Fixing C++ return bug

* Fixing lint and add more tests for space_to_depth

* Fixing TypeError of torch.Size

* Adding onnx export tools

* Refactor YOLODeployFriendly

* Move export_onnx.py to ncnn/tools

* Adapt to yolov5

* Remove tools

* Add yolort ncnn param examples and minor fixes

* Rename to yolort-opt.param

* Fixing lint

Co-authored-by: xiguadong <55774832+xiguadong@users.noreply.github.com>
---
 deployment/ncnn/CMakeLists.txt                |  25 +
 deployment/ncnn/README.md                     |  48 ++
 deployment/ncnn/export_onnx.py                |  62 +++
 deployment/ncnn/main.cpp                      | 509 ++++++++++++++++++
 deployment/ncnn/tools/__init__.py             |   0
 .../ncnn/tools/yolort_deploy_friendly.py      |  72 +++
 deployment/ncnn/yolort-opt.param              | 178 ++++++
 test/test_models_common.py                    |  11 +
 yolort/models/common.py                       |   9 +
 9 files changed, 914 insertions(+)
 create mode 100644 deployment/ncnn/CMakeLists.txt
 create mode 100644 deployment/ncnn/README.md
 create mode 100644 deployment/ncnn/export_onnx.py
 create mode 100644 deployment/ncnn/main.cpp
 create mode 100644 deployment/ncnn/tools/__init__.py
 create mode 100644 deployment/ncnn/tools/yolort_deploy_friendly.py
 create mode 100644 deployment/ncnn/yolort-opt.param
 create mode 100644 test/test_models_common.py

diff --git a/deployment/ncnn/CMakeLists.txt b/deployment/ncnn/CMakeLists.txt
new file mode 100644
index 00000000..839b141b
--- /dev/null
+++ b/deployment/ncnn/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.14)
+
+project(yolort_ncnn)
+
+find_package(OpenCV REQUIRED)
+
+# If the package has been found, several variables will
+# be set; you can find the full list with descriptions
+# in the OpenCVConfig.cmake file.
+# Print a status message showing some of them.
+message(STATUS "OpenCV library status:")
+message(STATUS "    config: ${OpenCV_DIR}")
+message(STATUS "    version: ${OpenCV_VERSION}")
+message(STATUS "    libraries: ${OpenCV_LIBS}")
+message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+find_package(ncnn REQUIRED)
+
+FILE(GLOB YOLO_SOURCE_FILES *.cpp)
+
+add_executable(yolort_ncnn ${YOLO_SOURCE_FILES})
+
+target_compile_features(yolort_ncnn PUBLIC cxx_range_for)
+
+target_link_libraries(yolort_ncnn ncnn ${OpenCV_LIBS})
diff --git a/deployment/ncnn/README.md b/deployment/ncnn/README.md
new file mode 100644
index 00000000..fcca9f08
--- /dev/null
+++ b/deployment/ncnn/README.md
@@ -0,0 +1,48 @@
+# Ncnn Inference
+
+The ncnn inference example for `yolort`. Both GPU and CPU are supported.
+
+## Dependencies
+
+- Ubuntu 18.04
+- ncnn
+- OpenCV 3.4+
+
+## Usage
+
+1. First, set up the environment variables.
+
+    ```bash
+    export TORCH_PATH=$(dirname $(python -c "import torch; print(torch.__file__)"))
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TORCH_PATH/lib/
+    ```
+
+1. Next, compile `ncnn` using the following commands.
+
+    ```bash
+    git clone --recursive git@github.com:Tencent/ncnn.git
+    cd ncnn
+    mkdir build && cd build
+    cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_SYSTEM_GLSLANG=ON -DNCNN_BUILD_EXAMPLES=ON .. # Set -DNCNN_VULKAN=ON if you're using VULKAN
+    make -j4
+    make install
+    ```
+
+    Or follow the [official instructions](https://github.com/Tencent/ncnn/wiki/how-to-build) to install ncnn.
+
+1. Then compile the source code.
+
+    ```bash
+    cd deployment/ncnn
+    mkdir build && cd build
+    cmake .. -Dncnn_DIR=<ncnn_install_dir>/lib/cmake/ncnn/
+    make
+    ```
+
+_Note: replace <ncnn_install_dir> with the ncnn install directory on your machine, i.e. the directory whose lib/cmake/ncnn/ subdirectory contains ncnnConfig.cmake. If you followed the steps above, set it to <./ncnn/build/install>._
+
+1. Now, you can infer your own images with ncnn.
+
+    ```bash
+    ./yolort_ncnn ../../../test/assets/zidane.jpg
+    ```
diff --git a/deployment/ncnn/export_onnx.py b/deployment/ncnn/export_onnx.py
new file mode 100644
index 00000000..8d290bac
--- /dev/null
+++ b/deployment/ncnn/export_onnx.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2021, Zhiqiang Wang. All Rights Reserved.
+import argparse
+import torch
+from tools.yolort_deploy_friendly import yolov5s_r40_deploy_ncnn
+
+
+def get_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--weights', type=str, default='./yolov5s.pt',
+                        help='weights path')
+    parser.add_argument('--output_path', type=str, default='./yolov5s.onnx',
+                        help='path of exported onnx')
+    parser.add_argument('--img_size', nargs='+', type=int, default=[640, 640],
+                        help='image (height, width)')
+    parser.add_argument('--num_classes', type=int, default=80,
+                        help='number of classes')
+    parser.add_argument('--batch_size', type=int, default=1,
+                        help='batch size')
+    parser.add_argument('--device', default='cpu',
+                        help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
+    parser.add_argument('--half', action='store_true',
+                        help='FP16 half-precision export')
+    parser.add_argument('--dynamic', action='store_true',
+                        help='ONNX: dynamic axes')
+    parser.add_argument('--simplify', action='store_true',
+                        help='ONNX: simplify model')
+    parser.add_argument('--opset', type=int, default=11,
+                        help='ONNX: opset version')
+    return parser
+
+
+def cli_main():
+    parser = get_parser()
+    args = parser.parse_args()
+    print(args)
+    export_onnx(args)
+
+
+def export_onnx(args):
+
+    model = yolov5s_r40_deploy_ncnn(
+        pretrained=True,
+        num_classes=args.num_classes,
+    )
+    img = torch.rand(args.batch_size, 3, 640, 640)
+    outputs = model(img)
+    assert len(outputs) == 3
+
+    torch.onnx.export(
+        model,
+        img,
+        args.output_path,
+        verbose=False,
+        opset_version=args.opset,
+        do_constant_folding=True,
+        input_names=['images'],
+        output_names=['h1', 'h2', 'h3'],
+    )
+
+
+if __name__ == "__main__":
+    cli_main()
diff --git a/deployment/ncnn/main.cpp b/deployment/ncnn/main.cpp
new file mode 100644
index 00000000..c72f2d77
--- /dev/null
+++ b/deployment/ncnn/main.cpp
@@ -0,0 +1,509 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+class YOLOv5Focus : public ncnn::Layer // rearranges a (w, h, c) blob into (w/2, h/2, 4c)
+{
+public:
+  YOLOv5Focus()
+  {
+    one_blob_only = true; // the layer implements the single-blob forward() below
+  }
+  // Fills top_blob from bottom_blob; returns 0, or -100 if top_blob is empty after create().
+  virtual int forward(
+      const ncnn::Mat& bottom_blob,
+      ncnn::Mat& top_blob,
+      const ncnn::Option& opt) const
+  {
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    // Output: half the width and height, four times the channels.
+    int outw = w / 2;
+    int outh = h / 2;
+    int outc = channels * 4;
+
+    top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
+    if (top_blob.empty())
+      return -100;
+    // Output channel p samples input channel p % channels from row (p / channels) % 2, column (p / channels) / 2.
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outc; p++)
+    {
+      const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
+      float* outptr = top_blob.channel(p);
+
+      for (int i = 0; i < outh; i++)
+      {
+        for (int j = 0; j < outw; j++)
+        {
+          *outptr = *ptr;
+
+          outptr += 1;
+          ptr += 2; // every second input column
+        }
+        // ptr has advanced 2 * outw elements; adding w skips the next input row (assumes w is even - TODO confirm).
+        ptr += w;
+      }
+    }
+
+    return 0;
+  }
+};
+
+DEFINE_LAYER_CREATOR(YOLOv5Focus)
+
+struct Object // one detection
+{
+  cv::Rect_<float> rect; // bounding box (x, y, width, height)
+  int label; // class index
+  float prob; // confidence score
+};
+
+static inline float intersection_area(const Object& a, const Object& b) // area shared by both boxes
+{
+  cv::Rect_<float> inter = a.rect & b.rect; // operator& on cv::Rect_ gives the intersection rectangle
+  return inter.area();
+}
+
+static void qsort_descent_inplace(
+    std::vector<Object>& faceobjects,
+    int left,
+    int right)
+{
+  // Sorts faceobjects[left..right] (both bounds inclusive) by descending prob.
+  int i = left;
+  int j = right;
+  float p = faceobjects[(left + right) / 2].prob; // pivot value
+
+  while (i <= j)
+  {
+    while (faceobjects[i].prob > p)
+      i++;
+
+    while (faceobjects[j].prob < p)
+      j--;
+
+    if (i <= j)
+    {
+      // swap
+      std::swap(faceobjects[i], faceobjects[j]);
+      i++;
+      j--;
+    }
+  }
+  // Recurse into the two partitions, as parallel OpenMP sections when OpenMP is enabled.
+  #pragma omp parallel sections
+  {
+    #pragma omp section
+    {
+      if (left < j) qsort_descent_inplace(faceobjects, left, j);
+    }
+    #pragma omp section
+    {
+      if (i < right) qsort_descent_inplace(faceobjects, i, right);
+    }
+  }
+}
+
+static void qsort_descent_inplace(std::vector<Object>& objects) // sorts objects by descending prob
+{
+  if (objects.empty()) // size() - 1 below would wrap around for an empty vector
+    return;
+
+  qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+// Greedy NMS over boxes already sorted by descending prob: a box is kept unless its IoU with
+// an already kept box exceeds nms_threshold. The indices of the kept boxes are written to picked.
+static void nms_sorted_bboxes(
+    const std::vector<Object>& faceobjects,
+    std::vector<int>& picked,
+    float nms_threshold)
+{
+  picked.clear();
+  const int n = faceobjects.size();
+
+  std::vector<float> areas(n);
+  for (int i = 0; i < n; i++)
+    areas[i] = faceobjects[i].rect.area();
+
+  for (int i = 0; i < n; i++)
+  {
+    const Object& a = faceobjects[i];
+
+    bool keep = true;
+    for (int j = 0; j < (int)picked.size(); j++)
+    {
+      const Object& b = faceobjects[picked[j]];
+      // intersection over union
+      float inter_area = intersection_area(a, b);
+      float union_area = areas[i] + areas[picked[j]] - inter_area;
+      if (inter_area / union_area > nms_threshold)
+      {
+        keep = false;
+        break; // already suppressed, no need to test the remaining kept boxes
+      }
+    }
+
+    if (keep)
+      picked.push_back(i);
+  }
+}
+
+static inline float sigmoid(float x) // logistic function 1 / (1 + e^-x)
+{
+  return static_cast<float>(1.f / (1.f + exp(-x)));
+}
+
+static void generate_proposals(
+    const ncnn::Mat& anchors,
+    int stride,
+    const ncnn::Mat& in_pad,
+    const ncnn::Mat& feat_blob,
+    float prob_threshold,
+    std::vector<Object>& objects)
+{
+  // Decodes the head output feat_blob (one channel per anchor, one row per grid cell)
+  // and appends every box whose confidence reaches prob_threshold to objects.
+  const int num_grid = feat_blob.h; // total number of grid cells
+  int num_grid_x;
+  int num_grid_y;
+  if (in_pad.w > in_pad.h)
+  {
+    num_grid_x = in_pad.w / stride;
+    num_grid_y = num_grid / num_grid_x;
+  }
+  else
+  {
+    num_grid_y = in_pad.h / stride;
+    num_grid_x = num_grid / num_grid_y;
+  }
+  // Row layout: 4 box values, 1 objectness value, then one score per class.
+  const int num_class = feat_blob.w - 5;
+  const int num_anchors = anchors.w / 2; // anchors holds (width, height) pairs
+
+  for (int q = 0; q < num_anchors; q++)
+  {
+    const float anchor_w = anchors[q * 2];
+    const float anchor_h = anchors[q * 2 + 1];
+
+    const ncnn::Mat feat = feat_blob.channel(q);
+
+    for (int i = 0; i < num_grid_y; i++)
+    {
+      for (int j = 0; j < num_grid_x; j++)
+      {
+        const float* featptr = feat.row(i * num_grid_x + j);
+
+        // find class index with max class score
+        int class_index = 0;
+        float class_score = -FLT_MAX;
+        for (int k = 0; k < num_class; k++)
+        {
+          float score = featptr[5 + k];
+          if (score > class_score)
+          {
+            class_index = k;
+            class_score = score;
+          }
+        }
+
+        float box_score = featptr[4];
+
+        float confidence = sigmoid(box_score) * sigmoid(class_score); // objectness * best class score
+
+        if (confidence >= prob_threshold)
+        {
+          // yolov5/models/yolo.py Detect forward
+          // y = x[i].sigmoid()
+          // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
+          // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+
+          float dx = sigmoid(featptr[0]);
+          float dy = sigmoid(featptr[1]);
+          float dw = sigmoid(featptr[2]);
+          float dh = sigmoid(featptr[3]);
+
+          float pb_cx = (dx * 2.f - 0.5f + j) * stride; // box centre, scaled by stride
+          float pb_cy = (dy * 2.f - 0.5f + i) * stride;
+
+          float pb_w = pow(dw * 2.f, 2) * anchor_w; // box size, scaled by the anchor
+          float pb_h = pow(dh * 2.f, 2) * anchor_h;
+
+          float x0 = pb_cx - pb_w * 0.5f; // centre/size -> corner coordinates
+          float y0 = pb_cy - pb_h * 0.5f;
+          float x1 = pb_cx + pb_w * 0.5f;
+          float y1 = pb_cy + pb_h * 0.5f;
+
+          Object obj;
+          obj.rect.x = x0;
+          obj.rect.y = y0;
+          obj.rect.width = x1 - x0;
+          obj.rect.height = y1 - y0;
+          obj.label = class_index;
+          obj.prob = confidence;
+
+          objects.push_back(obj);
+        }
+      }
+    }
+  }
+}
+
+// Runs the yolort ncnn model on `bgr` and fills `objects` with the NMS-filtered detections
+// in original-image coordinates. Returns 0 on success, -1 if the model files cannot be loaded.
+static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+  ncnn::Net yolov5;
+
+  yolov5.opt.use_vulkan_compute = true;
+  // yolov5.opt.use_bf16_storage = true;
+
+  yolov5.register_custom_layer("YOLOv5Focus", YOLOv5Focus_layer_creator);
+
+  // original pretrained model from https://github.com/ultralytics/yolov5
+  // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+  if (yolov5.load_param("yolort-opt.param") != 0 || yolov5.load_model("yolort-opt.bin") != 0)
+  {
+    fprintf(stderr, "failed to load yolort-opt.param / yolort-opt.bin\n");
+    return -1;
+  }
+
+  const int target_size = 640;
+  const float prob_threshold = 0.25f;
+  const float nms_threshold = 0.45f;
+
+  int img_w = bgr.cols;
+  int img_h = bgr.rows;
+
+  // letterbox pad to multiple of 32
+  int w = img_w;
+  int h = img_h;
+  float scale = 1.f;
+  if (w > h)
+  {
+    scale = (float)target_size / w;
+    w = target_size;
+    h = h * scale;
+  }
+  else
+  {
+    scale = (float)target_size / h;
+    h = target_size;
+    w = w * scale;
+  }
+
+  ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+  // pad to target_size rectangle
+  // yolov5/utils/datasets.py letterbox
+  int wpad = (w + 31) / 32 * 32 - w;
+  int hpad = (h + 31) / 32 * 32 - h;
+  ncnn::Mat in_pad;
+  ncnn::copy_make_border(
+      in, in_pad,
+      hpad / 2, hpad - hpad / 2,
+      wpad / 2, wpad - wpad / 2,
+      ncnn::BORDER_CONSTANT, 114.f);
+
+  const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+  in_pad.substract_mean_normalize(0, norm_vals);
+
+  ncnn::Extractor ex = yolov5.create_extractor();
+
+  ex.input("images", in_pad);
+
+  std::vector<Object> proposals;
+
+  // anchor setting from yolov5/models/yolov5s.yaml
+
+  // stride 8
+  {
+    ncnn::Mat out;
+    ex.extract("h1", out);
+
+    ncnn::Mat anchors(6);
+    anchors[0] = 10.f;
+    anchors[1] = 13.f;
+    anchors[2] = 16.f;
+    anchors[3] = 30.f;
+    anchors[4] = 33.f;
+    anchors[5] = 23.f;
+
+    std::vector<Object> objects8;
+    generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);
+
+    proposals.insert(proposals.end(), objects8.begin(), objects8.end());
+  }
+
+  // stride 16
+  {
+    ncnn::Mat out;
+    ex.extract("h2", out);
+
+    ncnn::Mat anchors(6);
+    anchors[0] = 30.f;
+    anchors[1] = 61.f;
+    anchors[2] = 62.f;
+    anchors[3] = 45.f;
+    anchors[4] = 59.f;
+    anchors[5] = 119.f;
+
+    std::vector<Object> objects16;
+    generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);
+
+    proposals.insert(proposals.end(), objects16.begin(), objects16.end());
+  }
+
+  // stride 32
+  {
+    ncnn::Mat out;
+    ex.extract("h3", out);
+
+    ncnn::Mat anchors(6);
+    anchors[0] = 116.f;
+    anchors[1] = 90.f;
+    anchors[2] = 156.f;
+    anchors[3] = 198.f;
+    anchors[4] = 373.f;
+    anchors[5] = 326.f;
+
+    std::vector<Object> objects32;
+    generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);
+
+    proposals.insert(proposals.end(), objects32.begin(), objects32.end());
+  }
+
+  // sort all proposals by score from highest to lowest
+  qsort_descent_inplace(proposals);
+
+  // apply nms with nms_threshold
+  std::vector<int> picked;
+  nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+  int count = picked.size();
+
+  objects.resize(count);
+  for (int i = 0; i < count; i++)
+  {
+    objects[i] = proposals[picked[i]];
+
+    // adjust offset to original unpadded
+    float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+    float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+    float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+    float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+    // clip
+    x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+    y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+    x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+    y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+    objects[i].rect.x = x0;
+    objects[i].rect.y = y0;
+    objects[i].rect.width = x1 - x0;
+    objects[i].rect.height = y1 - y0;
+  }
+
+  return 0;
+}
+
+// Logs every detection to stderr and draws its box and label onto a copy of `bgr`.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+  static const char* class_names[] = {
+      "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
+      "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
+      "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
+      "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+      "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
+      "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
+      "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
+      "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
+      "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
+      "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book",
+      "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
+  };
+  const int num_names = sizeof(class_names) / sizeof(class_names[0]);
+  // NOTE(review): `image` is neither displayed nor saved here, so the drawing is discarded.
+  cv::Mat image = bgr.clone();
+  for (size_t i = 0; i < objects.size(); i++)
+  {
+    const Object& obj = objects[i];
+    fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+        obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+    cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+    char text[256];
+    if (obj.label >= 0 && obj.label < num_names) // the label may lie outside the built-in name table
+      snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+    else
+      snprintf(text, sizeof(text), "%d %.1f%%", obj.label, obj.prob * 100);
+
+    int baseLine = 0;
+    cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+    int x = obj.rect.x;
+    int y = obj.rect.y - label_size.height - baseLine;
+    if (y < 0)
+      y = 0;
+    if (x + label_size.width > image.cols)
+      x = image.cols - label_size.width;
+
+    cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+        cv::Scalar(255, 255, 255), -1);
+    cv::putText(image, text, cv::Point(x, y + label_size.height),
+        cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+  }
+}
+
+// Entry point: reads the image named on the command line, runs detection and draws the result.
+int main(int argc, char** argv)
+{
+  if (argc != 2)
+  {
+    fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+    return -1;
+  }
+
+  const char* imagepath = argv[1];
+  cv::Mat m = cv::imread(imagepath, 1); // flag 1 requests a 3-channel colour image
+  if (m.empty())
+  {
+    fprintf(stderr, "cv::imread %s failed\n", imagepath);
+    return -1;
+  }
+
+  std::vector<Object> objects;
+  if (detect_yolov5(m, objects) != 0)
+    return -1; // do not draw when detection reported an error
+
+  draw_objects(m, objects);
+  return 0;
+}
diff --git a/deployment/ncnn/tools/__init__.py b/deployment/ncnn/tools/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/deployment/ncnn/tools/yolort_deploy_friendly.py b/deployment/ncnn/tools/yolort_deploy_friendly.py
new file mode 100644
index 00000000..0d011799
--- /dev/null
+++ b/deployment/ncnn/tools/yolort_deploy_friendly.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021, Zhiqiang Wang. All Rights Reserved.
+from torch import nn, Tensor
+
+from torchvision.models.utils import load_state_dict_from_url
+
+from yolort.models.yolo import YOLO, model_urls
+from yolort.models.backbone_utils import darknet_pan_backbone
+
+from typing import Any, List, Optional
+
+
+class YOLODeployFriendly(YOLO):
+    """
+    Deployment Friendly Wrapper of YOLO.
+    """
+    def __init__(
+        self,
+        backbone: nn.Module,
+        num_classes: int,
+        # Anchor parameters
+        anchor_grids: Optional[List[List[float]]] = None,
+        anchor_generator: Optional[nn.Module] = None,
+        head: Optional[nn.Module] = None,
+    ):
+        super().__init__(backbone, num_classes, anchor_grids=anchor_grids,
+                         anchor_generator=anchor_generator, head=head)
+
+    def forward(self, samples: Tensor):
+        """
+        Arguments:
+            samples (Tensor): batched images, of shape [batch_size x 3 x H x W]
+        """
+        # get the features from the backbone
+        features = self.backbone(samples)
+
+        # compute the yolo heads outputs using the features
+        head_outputs = self.head(features)
+        return head_outputs
+
+
+def yolov5s_r40_deploy_ncnn(
+    pretrained: bool = False,
+    progress: bool = True,
+    num_classes: int = 80,
+    **kwargs: Any,
+) -> YOLODeployFriendly:
+    """
+    Deployment friendly Wrapper of yolov5s for ncnn.
+
+    Args:
+        pretrained (bool): If True, loads the 'yolov5_darknet_pan_s_r40_coco' checkpoint
+        progress (bool): If True, displays a progress bar of the download to stderr
+        num_classes (int): number of classes passed to YOLODeployFriendly
+    """
+    backbone_name = 'darknet_s_r4_0'
+    weights_name = 'yolov5_darknet_pan_s_r40_coco'
+    depth_multiple = 0.33
+    width_multiple = 0.5
+    version = 'r4.0'
+
+    backbone = darknet_pan_backbone(backbone_name, depth_multiple, width_multiple, version=version)
+    model = YOLODeployFriendly(backbone, num_classes, **kwargs)
+    if pretrained:
+        if model_urls.get(weights_name, None) is None:
+            raise ValueError(f"No checkpoint is available for model {weights_name}")
+        state_dict = load_state_dict_from_url(model_urls[weights_name], progress=progress)
+        model.load_state_dict(state_dict)
+
+    # forward() only uses the backbone and the head, so these attributes are dropped.
+    del model.anchor_generator
+    del model.post_process
+    return model
diff --git a/deployment/ncnn/yolort-opt.param b/deployment/ncnn/yolort-opt.param
new file mode 100644
index 00000000..5aaeb8a5
--- /dev/null
+++ b/deployment/ncnn/yolort-opt.param
@@ -0,0 +1,178 @@
+7767517
+176 200
+Input                    images                   0 1 images
+YOLOv5Focus              focus                    1 1 images 401
+Convolution              Conv_41                  1 1 401 731 0=32 1=3 4=1 5=1 6=3456
+Swish                    Mul_43                   1 1 731 405
+Convolution              Conv_44                  1 1 405 734 0=64 1=3 3=2 4=1 5=1 6=18432
+Swish                    Mul_46                   1 1 734 409
+Split                    splitncnn_0              1 2 409 409_splitncnn_0 409_splitncnn_1
+Convolution              Conv_47                  1 1 409_splitncnn_1 737 0=32 1=1 5=1 6=2048
+Swish                    Mul_49                   1 1 737 413
+Split                    splitncnn_1              1 2 413 413_splitncnn_0 413_splitncnn_1
+Convolution              Conv_50                  1 1 413_splitncnn_1 740 0=32 1=1 5=1 6=1024
+Swish                    Mul_52                   1 1 740 417
+Convolution              Conv_53                  1 1 417 743 0=32 1=3 4=1 5=1 6=9216
+Swish                    Mul_55                   1 1 743 421
+BinaryOp                 Add_56                   2 1 413_splitncnn_0 421 422
+Convolution              Conv_57                  1 1 409_splitncnn_0 746 0=32 1=1 5=1 6=2048
+Swish                    Mul_59                   1 1 746 426
+Concat                   Concat_60                2 1 422 426 427
+Convolution              Conv_61                  1 1 427 749 0=64 1=1 5=1 6=4096
+Swish                    Mul_63                   1 1 749 431
+Convolution              Conv_64                  1 1 431 752 0=128 1=3 3=2 4=1 5=1 6=73728
+Swish                    Mul_66                   1 1 752 435
+Split                    splitncnn_2              1 2 435 435_splitncnn_0 435_splitncnn_1
+Convolution              Conv_67                  1 1 435_splitncnn_1 755 0=64 1=1 5=1 6=8192
+Swish                    Mul_69                   1 1 755 439
+Split                    splitncnn_3              1 2 439 439_splitncnn_0 439_splitncnn_1
+Convolution              Conv_70                  1 1 439_splitncnn_1 758 0=64 1=1 5=1 6=4096
+Swish                    Mul_72                   1 1 758 443
+Convolution              Conv_73                  1 1 443 761 0=64 1=3 4=1 5=1 6=36864
+Swish                    Mul_75                   1 1 761 447
+BinaryOp                 Add_76                   2 1 439_splitncnn_0 447 448
+Split                    splitncnn_4              1 2 448 448_splitncnn_0 448_splitncnn_1
+Convolution              Conv_77                  1 1 448_splitncnn_1 764 0=64 1=1 5=1 6=4096
+Swish                    Mul_79                   1 1 764 452
+Convolution              Conv_80                  1 1 452 767 0=64 1=3 4=1 5=1 6=36864
+Swish                    Mul_82                   1 1 767 456
+BinaryOp                 Add_83                   2 1 448_splitncnn_0 456 457
+Split                    splitncnn_5              1 2 457 457_splitncnn_0 457_splitncnn_1
+Convolution              Conv_84                  1 1 457_splitncnn_1 770 0=64 1=1 5=1 6=4096
+Swish                    Mul_86                   1 1 770 461
+Convolution              Conv_87                  1 1 461 773 0=64 1=3 4=1 5=1 6=36864
+Swish                    Mul_89                   1 1 773 465
+BinaryOp                 Add_90                   2 1 457_splitncnn_0 465 466
+Convolution              Conv_91                  1 1 435_splitncnn_0 776 0=64 1=1 5=1 6=8192
+Swish                    Mul_93                   1 1 776 470
+Concat                   Concat_94                2 1 466 470 471
+Convolution              Conv_95                  1 1 471 779 0=128 1=1 5=1 6=16384
+Swish                    Mul_97                   1 1 779 475
+Split                    splitncnn_6              1 2 475 475_splitncnn_0 475_splitncnn_1
+Convolution              Conv_98                  1 1 475_splitncnn_1 782 0=256 1=3 3=2 4=1 5=1 6=294912
+Swish                    Mul_100                  1 1 782 479
+Split                    splitncnn_7              1 2 479 479_splitncnn_0 479_splitncnn_1
+Convolution              Conv_101                 1 1 479_splitncnn_1 785 0=128 1=1 5=1 6=32768
+Swish                    Mul_103                  1 1 785 483
+Split                    splitncnn_8              1 2 483 483_splitncnn_0 483_splitncnn_1
+Convolution              Conv_104                 1 1 483_splitncnn_1 788 0=128 1=1 5=1 6=16384
+Swish                    Mul_106                  1 1 788 487
+Convolution              Conv_107                 1 1 487 791 0=128 1=3 4=1 5=1 6=147456
+Swish                    Mul_109                  1 1 791 491
+BinaryOp                 Add_110                  2 1 483_splitncnn_0 491 492
+Split                    splitncnn_9              1 2 492 492_splitncnn_0 492_splitncnn_1
+Convolution              Conv_111                 1 1 492_splitncnn_1 794 0=128 1=1 5=1 6=16384
+Swish                    Mul_113                  1 1 794 496
+Convolution              Conv_114                 1 1 496 797 0=128 1=3 4=1 5=1 6=147456
+Swish                    Mul_116                  1 1 797 500
+BinaryOp                 Add_117                  2 1 492_splitncnn_0 500 501
+Split                    splitncnn_10             1 2 501 501_splitncnn_0 501_splitncnn_1
+Convolution              Conv_118                 1 1 501_splitncnn_1 800 0=128 1=1 5=1 6=16384
+Swish                    Mul_120                  1 1 800 505
+Convolution              Conv_121                 1 1 505 803 0=128 1=3 4=1 5=1 6=147456
+Swish                    Mul_123                  1 1 803 509
+BinaryOp                 Add_124                  2 1 501_splitncnn_0 509 510
+Convolution              Conv_125                 1 1 479_splitncnn_0 806 0=128 1=1 5=1 6=32768
+Swish                    Mul_127                  1 1 806 514
+Concat                   Concat_128               2 1 510 514 515
+Convolution              Conv_129                 1 1 515 809 0=256 1=1 5=1 6=65536
+Swish                    Mul_131                  1 1 809 519
+Split                    splitncnn_11             1 2 519 519_splitncnn_0 519_splitncnn_1
+Convolution              Conv_132                 1 1 519_splitncnn_1 812 0=512 1=3 3=2 4=1 5=1 6=1179648
+Swish                    Mul_134                  1 1 812 523
+Convolution              Conv_135                 1 1 523 815 0=256 1=1 5=1 6=131072
+Swish                    Mul_137                  1 1 815 527
+Split                    splitncnn_12             1 4 527 527_splitncnn_0 527_splitncnn_1 527_splitncnn_2 527_splitncnn_3
+Pooling                  MaxPool_138              1 1 527_splitncnn_3 528 1=5 3=2 5=1
+Pooling                  MaxPool_139              1 1 527_splitncnn_2 529 1=9 3=4 5=1
+Pooling                  MaxPool_140              1 1 527_splitncnn_1 530 1=13 3=6 5=1
+Concat                   Concat_141               4 1 527_splitncnn_0 528 529 530 531
+Convolution              Conv_142                 1 1 531 818 0=512 1=1 5=1 6=524288
+Swish                    Mul_144                  1 1 818 535
+Split                    splitncnn_13             1 2 535 535_splitncnn_0 535_splitncnn_1
+Convolution              Conv_145                 1 1 535_splitncnn_1 821 0=256 1=1 5=1 6=131072
+Swish                    Mul_147                  1 1 821 539
+Convolution              Conv_148                 1 1 539 824 0=256 1=1 5=1 6=65536
+Swish                    Mul_150                  1 1 824 543
+Convolution              Conv_151                 1 1 543 827 0=256 1=3 4=1 5=1 6=589824
+Swish                    Mul_153                  1 1 827 547
+Convolution              Conv_154                 1 1 535_splitncnn_0 830 0=256 1=1 5=1 6=131072
+Swish                    Mul_156                  1 1 830 551
+Concat                   Concat_157               2 1 547 551 552
+Convolution              Conv_158                 1 1 552 833 0=512 1=1 5=1 6=262144
+Swish                    Mul_160                  1 1 833 556
+Convolution              Conv_161                 1 1 556 836 0=256 1=1 5=1 6=131072
+Swish                    Mul_163                  1 1 836 560
+Split                    splitncnn_14             1 2 560 560_splitncnn_0 560_splitncnn_1
+Interp                   Resize_165               1 1 560_splitncnn_1 565 0=1 1=2.000000e+00 2=2.000000e+00
+Concat                   Concat_166               2 1 565 519_splitncnn_0 566
+Split                    splitncnn_15             1 2 566 566_splitncnn_0 566_splitncnn_1
+Convolution              Conv_167                 1 1 566_splitncnn_1 839 0=128 1=1 5=1 6=65536
+Swish                    Mul_169                  1 1 839 570
+Convolution              Conv_170                 1 1 570 842 0=128 1=1 5=1 6=16384
+Swish                    Mul_172                  1 1 842 574
+Convolution              Conv_173                 1 1 574 845 0=128 1=3 4=1 5=1 6=147456
+Swish                    Mul_175                  1 1 845 578
+Convolution              Conv_176                 1 1 566_splitncnn_0 848 0=128 1=1 5=1 6=65536
+Swish                    Mul_178                  1 1 848 582
+Concat                   Concat_179               2 1 578 582 583
+Convolution              Conv_180                 1 1 583 851 0=256 1=1 5=1 6=65536
+Swish                    Mul_182                  1 1 851 587
+Convolution              Conv_183                 1 1 587 854 0=128 1=1 5=1 6=32768
+Swish                    Mul_185                  1 1 854 591
+Split                    splitncnn_16             1 2 591 591_splitncnn_0 591_splitncnn_1
+Interp                   Resize_187               1 1 591_splitncnn_1 596 0=1 1=2.000000e+00 2=2.000000e+00
+Concat                   Concat_188               2 1 596 475_splitncnn_0 597
+Split                    splitncnn_17             1 2 597 597_splitncnn_0 597_splitncnn_1
+Convolution              Conv_189                 1 1 597_splitncnn_1 857 0=64 1=1 5=1 6=16384
+Swish                    Mul_191                  1 1 857 601
+Convolution              Conv_192                 1 1 601 860 0=64 1=1 5=1 6=4096
+Swish                    Mul_194                  1 1 860 605
+Convolution              Conv_195                 1 1 605 863 0=64 1=3 4=1 5=1 6=36864
+Swish                    Mul_197                  1 1 863 609
+Convolution              Conv_198                 1 1 597_splitncnn_0 866 0=64 1=1 5=1 6=16384
+Swish                    Mul_200                  1 1 866 613
+Concat                   Concat_201               2 1 609 613 614
+Convolution              Conv_202                 1 1 614 869 0=128 1=1 5=1 6=16384
+Swish                    Mul_204                  1 1 869 618
+Split                    splitncnn_18             1 2 618 618_splitncnn_0 618_splitncnn_1
+Convolution              Conv_205                 1 1 618_splitncnn_1 872 0=128 1=3 3=2 4=1 5=1 6=147456
+Swish                    Mul_207                  1 1 872 622
+Concat                   Concat_208               2 1 622 591_splitncnn_0 623
+Split                    splitncnn_19             1 2 623 623_splitncnn_0 623_splitncnn_1
+Convolution              Conv_209                 1 1 623_splitncnn_1 875 0=128 1=1 5=1 6=32768
+Swish                    Mul_211                  1 1 875 627
+Convolution              Conv_212                 1 1 627 878 0=128 1=1 5=1 6=16384
+Swish                    Mul_214                  1 1 878 631
+Convolution              Conv_215                 1 1 631 881 0=128 1=3 4=1 5=1 6=147456
+Swish                    Mul_217                  1 1 881 635
+Convolution              Conv_218                 1 1 623_splitncnn_0 884 0=128 1=1 5=1 6=32768
+Swish                    Mul_220                  1 1 884 639
+Concat                   Concat_221               2 1 635 639 640
+Convolution              Conv_222                 1 1 640 887 0=256 1=1 5=1 6=65536
+Swish                    Mul_224                  1 1 887 644
+Split                    splitncnn_20             1 2 644 644_splitncnn_0 644_splitncnn_1
+Convolution              Conv_225                 1 1 644_splitncnn_1 890 0=256 1=3 3=2 4=1 5=1 6=589824
+Swish                    Mul_227                  1 1 890 648
+Concat                   Concat_228               2 1 648 560_splitncnn_0 649
+Split                    splitncnn_21             1 2 649 649_splitncnn_0 649_splitncnn_1
+Convolution              Conv_229                 1 1 649_splitncnn_1 893 0=256 1=1 5=1 6=131072
+Swish                    Mul_231                  1 1 893 653
+Convolution              Conv_232                 1 1 653 896 0=256 1=1 5=1 6=65536
+Swish                    Mul_234                  1 1 896 657
+Convolution              Conv_235                 1 1 657 899 0=256 1=3 4=1 5=1 6=589824
+Swish                    Mul_237                  1 1 899 661
+Convolution              Conv_238                 1 1 649_splitncnn_0 902 0=256 1=1 5=1 6=131072
+Swish                    Mul_240                  1 1 902 665
+Concat                   Concat_241               2 1 661 665 666
+Convolution              Conv_242                 1 1 666 905 0=512 1=1 5=1 6=262144
+Swish                    Mul_244                  1 1 905 670
+Convolution              Conv_245                 1 1 618_splitncnn_0 671 0=255 1=1 5=1 6=32640
+Reshape                  Reshape_259              1 1 671 689 0=-1 1=85 2=3
+Permute                  Transpose_260            1 1 689 h1 0=1
+Convolution              Conv_261                 1 1 644_splitncnn_0 691 0=255 1=1 5=1 6=65280
+Reshape                  Reshape_275              1 1 691 709 0=-1 1=85 2=3
+Permute                  Transpose_276            1 1 709 h2 0=1
+Convolution              Conv_277                 1 1 670 711 0=255 1=1 5=1 6=130560
+Reshape                  Reshape_291              1 1 711 729 0=-1 1=85 2=3
+Permute                  Transpose_292            1 1 729 h3 0=1
diff --git a/test/test_models_common.py b/test/test_models_common.py
new file mode 100644
index 00000000..f263a7ee
--- /dev/null
+++ b/test/test_models_common.py
@@ -0,0 +1,11 @@
+import pytest
+import torch
+from yolort.models.common import focus_transform, space_to_depth
+
+
+@pytest.mark.parametrize('n, b, h, w', [(1, 3, 480, 640), (4, 3, 416, 320), (4, 3, 320, 416)])
+def test_space_to_depth(n, b, h, w):  # b is passed in the channel position of the input
+    tensor_input = torch.randn((n, b, h, w))
+    out1 = focus_transform(tensor_input)  # output of the pre-existing transform
+    out2 = space_to_depth(tensor_input)  # output of the transform under test
+    torch.testing.assert_allclose(out1, out2)  # both transforms must agree on the same input
diff --git a/yolort/models/common.py b/yolort/models/common.py
index c70f5383..4e37705a 100644
--- a/yolort/models/common.py
+++ b/yolort/models/common.py
@@ -170,6 +170,15 @@ def focus_transform(x: Tensor) -> Tensor:
     return y
 
 
+def space_to_depth(x: Tensor) -> Tensor:
+    '''x(b,c,h,w) -> y(b,4c,h/2,w/2); the reshape below needs even h and w'''
+    N, C, H, W = x.size()
+    x = x.reshape(N, C, H // 2, 2, W // 2, 2)  # split H and W into (half, parity) axes
+    x = x.permute(0, 5, 3, 1, 2, 4)  # -> (N, w-parity, h-parity, C, H/2, W/2)
+    y = x.reshape(N, C * 4, H // 2, W // 2)  # fold both parity axes into the channel axis
+    return y
+
+
 class Concat(nn.Module):
     # Concatenate a list of tensors along dimension
     def __init__(self, dimension: int = 1):