CoinCheung · CoinCheung · Dec 2, 2022 · Dec 2, 2022
diff --git a/ncnn/segment.cpp b/ncnn/segment.cpp
@@ -51,14 +51,17 @@ void inference() {
     mod.opt.use_vulkan_compute = 1;
     mod.set_vulkan_device(1);
 #endif 
-    mod.load_param(mod_param.c_str());
-    mod.load_model(mod_model.c_str());
     // ncnn enable fp16 by default, so we do not need these options
     // int8 depends on the model itself, so we do not set here
     // bool use_fp16 = false;
     // mod.opt.use_fp16_packed = use_fp16;
     // mod.opt.use_fp16_storage = use_fp16;
     // mod.opt.use_fp16_arithmetic = use_fp16;
+    mod.opt.use_winograd_convolution = true;
+
+    // we should set opt before load model
+    mod.load_param(mod_param.c_str());
+    mod.load_model(mod_model.c_str());
 
     // load image, and copy to ncnn mat
     cv::Mat im = cv::imread(impth);

diff --git a/tensorrt/README.md b/tensorrt/README.md
@@ -20,7 +20,7 @@ Then we can use either c++ or python to compile the model and run inference.
 * ubuntu 18.04
 * nvidia Tesla T4 gpu, driver newer than 450.80
 * cuda 11.3, cudnn 8
-* cmake 3.17.1
+* cmake 3.22.0
 * opencv built from source
 * tensorrt 8.2.5.1
 
@@ -49,7 +49,7 @@ $ ./segment compile /path/to/onnx.model /path/to/saved_model.trt --fp16
 ```
 Building an int8 engine is also supported. First, you should make sure your gpu supports int8 inference; otherwise your model will not be faster than fp16/fp32. Then you should prepare a certain number of images for int8 calibration. In this example, I use the train set of cityscapes for calibration. The command is like this:  
 ```
-$ calibrate_int8 # delete this if exists
+$ rm calibrate_int8 # delete this if exists
 $ ./segment compile /path/to/onnx.model /path/to/saved_model.trt --int8 /path/to/BiSeNet/datasets/cityscapes /path/to/BiSeNet/datasets/cityscapes/train.txt
 ```
 With the above commands, we will have a tensorrt engine named `saved_model.trt` generated.  

diff --git a/tools/train_amp.py b/tools/train_amp.py
@@ -8,6 +8,7 @@
 import random
 import logging
 import time
+import json
 import argparse
 import numpy as np
 from tabulate import tabulate
@@ -55,7 +56,10 @@ def set_model(lb_ignore=255):
     net = model_factory[cfg.model_type](cfg.n_cats)
     if not args.finetune_from is None:
         logger.info(f'load pretrained weights from {args.finetune_from}')
-        net.load_state_dict(torch.load(args.finetune_from, map_location='cpu'))
+        msg = net.load_state_dict(torch.load(args.finetune_from,
+            map_location='cpu'), strict=False)
+        logger.info('\tmissing keys: ' + json.dumps(msg.missing_keys))
+        logger.info('\tunexpected keys: ' + json.dumps(msg.unexpected_keys))
     if cfg.use_sync_bn: net = nn.SyncBatchNorm.convert_sync_batchnorm(net)
     net.cuda()
     net.train()