From 537c3b4e74eb1ed9b13657cac6b5dff6be92642d Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Mon, 1 Feb 2021 22:39:58 -0500
Subject: [PATCH 1/9] Supplement Licenses

---
 models/_utils.py                   | 3 ++-
 models/backbone_utils.py           | 1 +
 models/darknet.py                  | 3 +--
 models/path_aggregation_network.py | 2 +-
 models/pl_wrapper.py               | 3 +--
 utils/image_utils.py               | 1 +
 6 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/models/_utils.py b/models/_utils.py
index 82d4eec2..6b96bb7a 100644
--- a/models/_utils.py
+++ b/models/_utils.py
@@ -1,7 +1,8 @@
+# Modified from ultralytics/yolov5 by Zhiqiang Wang
 import math
 
 import torch
-from torch import nn, Tensor
+from torch import Tensor
 import torch.nn.functional as F
 from torchvision.ops import box_convert
 
diff --git a/models/backbone_utils.py b/models/backbone_utils.py
index d6637d0c..35a4ece8 100644
--- a/models/backbone_utils.py
+++ b/models/backbone_utils.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021, Zhiqiang Wang. All Rights Reserved.
 from typing import List, Optional
 from torch import nn
 from torchvision.models._utils import IntermediateLayerGetter
diff --git a/models/darknet.py b/models/darknet.py
index dcca1528..21b828ed 100644
--- a/models/darknet.py
+++ b/models/darknet.py
@@ -1,8 +1,7 @@
+# Copyright (c) 2021, Zhiqiang Wang. All Rights Reserved.
 import torch
 from torch import nn, Tensor
 from torch.hub import load_state_dict_from_url
-from torch.nn.modules import conv
-from torch.nn.modules.linear import Linear
 
 from .common import Conv, SPP, Focus, BottleneckCSP
 from .experimental import C3
diff --git a/models/path_aggregation_network.py b/models/path_aggregation_network.py
index 4f44c956..5b88e45b 100644
--- a/models/path_aggregation_network.py
+++ b/models/path_aggregation_network.py
@@ -1,5 +1,5 @@
+# Copyright (c) 2021, Zhiqiang Wang. All Rights Reserved.
 import torch
-
 from torch import nn, Tensor
 
 from .common import Conv, BottleneckCSP
diff --git a/models/pl_wrapper.py b/models/pl_wrapper.py
index d07b69f9..a416b097 100644
--- a/models/pl_wrapper.py
+++ b/models/pl_wrapper.py
@@ -2,8 +2,7 @@
 import argparse
 
 import torch
-from torch import nn, Tensor
-from torchvision.models.utils import load_state_dict_from_url
+from torch import Tensor
 
 import pytorch_lightning as pl
 
diff --git a/utils/image_utils.py b/utils/image_utils.py
index f9c31afd..215e520d 100644
--- a/utils/image_utils.py
+++ b/utils/image_utils.py
@@ -255,6 +255,7 @@ def restore_anchor(anchor, grid_x, grid_y, stride, feature_map_size, image_sizes
 
 
 def anchor_match_visualize(images, targets, indices, anchors, pred):
+    # Modified from <https://github.com/hhaAndroid/yolov5-comment/blob/e018889b/utils/general.py#L714>
     image_sizes = images.shape[-2:]
     images = parse_images(images)
 

From 99a013838468dfab6cf4356244efca07e1261922 Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Tue, 2 Feb 2021 05:29:30 -0500
Subject: [PATCH 2/9] Add essential scripts for coords rescaling

---
 models/pl_wrapper.py |  12 ++-
 models/transform.py  | 178 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 184 insertions(+), 6 deletions(-)

diff --git a/models/pl_wrapper.py b/models/pl_wrapper.py
index a416b097..ef4f3c47 100644
--- a/models/pl_wrapper.py
+++ b/models/pl_wrapper.py
@@ -7,7 +7,7 @@
 import pytorch_lightning as pl
 
 from . import yolo
-from .transform import nested_tensor_from_tensor_list
+from .transform import GeneralizedYOLOTransform, nested_tensor_from_tensor_list
 
 from typing import Any, List, Optional
 
@@ -23,6 +23,8 @@ def __init__(
         pretrained: bool = False,
         progress: bool = True,
         num_classes: int = 80,
+        min_size: int = 320,
+        max_size: int = 416,
         **kwargs: Any,
     ):
         """
@@ -40,9 +42,13 @@ def __init__(
         self.model = yolo.__dict__[arch](
             pretrained=pretrained, progress=progress, num_classes=num_classes, **kwargs)
 
+        self.transform = GeneralizedYOLOTransform(min_size, max_size)
+
     def forward(self, inputs: List[Tensor], targets: Optional[Tensor] = None):
-        sample = nested_tensor_from_tensor_list(inputs)
-        return self.model(sample.tensors, targets=targets)
+        samples, targets = self.transform(inputs, targets)
+        detections = self.model(samples.tensors, targets=targets)
+        detections = self.transform.postprocess(detections, samples.image_sizes, original_image_sizes)
+        return detections
 
     def training_step(self, batch, batch_idx):
 
diff --git a/models/transform.py b/models/transform.py
index 490e47fb..a952c7b2 100644
--- a/models/transform.py
+++ b/models/transform.py
@@ -3,9 +3,11 @@
 import math
 import torch
 from torch import nn, Tensor
+import torch.nn.functional as F
+
 import torchvision
 
-from typing import Optional, List
+from typing import Dict, Optional, List, Tuple
 
 
 class NestedTensor(object):
@@ -15,17 +17,115 @@ class NestedTensor(object):
     This works by padding the images to the same size,
     and storing in a field the original sizes of each image
     """
-    def __init__(self, tensors):
+    def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]):
+        """
+        Args:
+            tensors (Tensor)
+            image_sizes (list[tuple[int, int]])
+        """
         self.tensors = tensors
+        self.image_sizes = image_sizes
 
     def to(self, device) -> "NestedTensor":
         cast_tensor = self.tensors.to(device)
-        return NestedTensor(cast_tensor)
+        return NestedTensor(cast_tensor, self.image_sizes)
 
     def __repr__(self):
         return str(self.tensors)
 
 
+class GeneralizedYOLOTransform(nn.Module):
+    def __init__(self, min_size, max_size) -> None:
+        super().__init__()
+        if not isinstance(min_size, (list, tuple)):
+            min_size = (min_size,)
+        self.min_size = min_size
+        self.max_size = max_size
+
+    def forward(
+        self,
+        images: List[Tensor],
+        targets: Optional[List[Dict[str, Tensor]]],
+    ) -> Tuple[NestedTensor, Optional[List[Dict[str, Tensor]]]]:
+        images = [img for img in images]
+        if targets is not None:
+            # make a copy of targets to avoid modifying it in-place
+            # once torchscript supports dict comprehension
+            # this can be simplified as as follows
+            # targets = [{k: v for k,v in t.items()} for t in targets]
+            targets_copy: List[Dict[str, Tensor]] = []
+            for t in targets:
+                data: Dict[str, Tensor] = {}
+                for k, v in t.items():
+                    data[k] = v
+                targets_copy.append(data)
+            targets = targets_copy
+
+        for i in range(len(images)):
+            image = images[i]
+            target_index = targets[i] if targets is not None else None
+
+            if image.dim() != 3:
+                raise ValueError("images is expected to be a list of 3d tensors "
+                                 "of shape [C, H, W], got {}".format(image.shape))
+
+            image, target_index = self.resize(image, target_index)
+            images[i] = image
+            if targets is not None and target_index is not None:
+                targets[i] = target_index
+
+        image_sizes = [img.shape[-2:] for img in images]
+        images = self.nested_tensor_from_tensor_list(images)
+        image_sizes_list: List[Tuple[int, int]] = []
+        for image_size in image_sizes:
+            assert len(image_size) == 2
+            image_sizes_list.append((image_size[0], image_size[1]))
+
+        image_list = NestedTensor(images, image_sizes_list)
+        return image_list, targets
+
+    def resize(
+        self,
+        image: Tensor,
+        target: Optional[Dict[str, Tensor]],
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+
+        h, w = image.shape[-2:]
+        if self.training:
+            size = float(self.torch_choice(self.min_size))
+        else:
+            # FIXME assume for now that testing uses the largest scale
+            size = float(self.min_size[-1])
+        if torchvision._is_tracing():
+            image, target = _resize_image_and_masks_onnx(image, size, float(self.max_size), target)
+        else:
+            image, target = _resize_image_and_masks(image, size, float(self.max_size), target)
+
+        if target is None:
+            return image, target
+
+        bbox = target["boxes"]
+        bbox = resize_boxes(bbox, (h, w), image.shape[-2:])
+        target["boxes"] = bbox
+
+        return image, target
+
+    def postprocess(
+        self,
+        result: List[Dict[str, Tensor]],
+        image_shapes: List[Tuple[int, int]],
+        original_image_sizes: List[Tuple[int, int]],
+    ) -> List[Dict[str, Tensor]]:
+        if self.training:
+            return result
+        for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
+            boxes = pred["boxes"]
+            boxes = resize_boxes(boxes, im_s, o_im_s)
+            result[i]["boxes"] = boxes
+
+        return result
+
+
 def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisible: int = 32):
     # TODO make this more general
     if tensor_list[0].ndim == 3:
@@ -84,3 +184,75 @@ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisib
     tensor = torch.stack(padded_imgs)
 
     return NestedTensor(tensor)
+
+
+@torch.jit.unused
+def _resize_image_and_masks_onnx(
+    image: Tensor,
+    self_min_size: float,
+    self_max_size: float,
+    target: Optional[Dict[str, Tensor]],
+) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+
+    from torch.onnx import operators
+
+    im_shape = operators.shape_as_tensor(image)[-2:]
+    min_size = torch.min(im_shape).to(dtype=torch.float32)
+    max_size = torch.max(im_shape).to(dtype=torch.float32)
+    scale_factor = torch.min(self_min_size / min_size, self_max_size / max_size)
+
+    image = torch.nn.functional.interpolate(
+        image[None], scale_factor=scale_factor, mode='bilinear', recompute_scale_factor=True,
+        align_corners=False)[0]
+
+    if target is None:
+        return image, target
+
+    if "masks" in target:
+        mask = target["masks"]
+        mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor, recompute_scale_factor=True)[:, 0].byte()
+        target["masks"] = mask
+    return image, target
+
+
+def _resize_image_and_masks(
+    image: Tensor,
+    self_min_size: float,
+    self_max_size: float,
+    target: Optional[Dict[str, Tensor]],
+) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+
+    im_shape = torch.tensor(image.shape[-2:])
+    min_size = float(torch.min(im_shape))
+    max_size = float(torch.max(im_shape))
+    scale_factor = self_min_size / min_size
+    if max_size * scale_factor > self_max_size:
+        scale_factor = self_max_size / max_size
+    image = torch.nn.functional.interpolate(
+        image[None], scale_factor=scale_factor, mode='bilinear', recompute_scale_factor=True,
+        align_corners=False)[0]
+
+    if target is None:
+        return image, target
+
+    if "masks" in target:
+        mask = target["masks"]
+        mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor, recompute_scale_factor=True)[:, 0].byte()
+        target["masks"] = mask
+    return image, target
+
+
+def resize_boxes(boxes: Tensor, original_size: List[int], new_size: List[int]) -> Tensor:
+    ratios = [
+        torch.tensor(s, dtype=torch.float32, device=boxes.device) /
+        torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
+        for s, s_orig in zip(new_size, original_size)
+    ]
+    ratio_height, ratio_width = ratios
+    xmin, ymin, xmax, ymax = boxes.unbind(1)
+
+    xmin = xmin * ratio_width
+    xmax = xmax * ratio_width
+    ymin = ymin * ratio_height
+    ymax = ymax * ratio_height
+    return torch.stack((xmin, ymin, xmax, ymax), dim=1)

From 0f40d89b698193a62bdf46c28e5ede1472b79a5f Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Tue, 2 Feb 2021 07:57:32 -0500
Subject: [PATCH 3/9] Fixing bugs in GeneralizedYOLOTransform

---
 models/pl_wrapper.py | 30 +++++++++++++++++++++++++--
 models/transform.py  | 49 ++++++++++++++++++++++++++++++++++++--------
 models/yolo.py       |  1 +
 3 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/models/pl_wrapper.py b/models/pl_wrapper.py
index ef4f3c47..d860cfb1 100644
--- a/models/pl_wrapper.py
+++ b/models/pl_wrapper.py
@@ -9,7 +9,7 @@
 from . import yolo
 from .transform import GeneralizedYOLOTransform, nested_tensor_from_tensor_list
 
-from typing import Any, List, Optional
+from typing import Any, List, Dict, Tuple, Optional
 
 
 class YOLOLitWrapper(pl.LightningModule):
@@ -44,9 +44,35 @@ def __init__(
 
         self.transform = GeneralizedYOLOTransform(min_size, max_size)
 
-    def forward(self, inputs: List[Tensor], targets: Optional[Tensor] = None):
+    def forward(
+        self,
+        inputs: List[Tensor],
+        targets: Optional[List[Dict[str, Tensor]]] = None,
+    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
+        """
+        Args:
+            images (list[Tensor]): images to be processed
+            targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
+
+        Returns:
+            result (list[BoxList] or dict[Tensor]): the output from the model.
+                During training, it returns a dict[Tensor] which contains the losses.
+                During testing, it returns list[BoxList] contains additional fields
+                like `scores`, `labels` and `mask` (for Mask R-CNN models).
+
+        """
+        # get the original image sizes
+        original_image_sizes: List[Tuple[int, int]] = []
+        for img in inputs:
+            val = img.shape[-2:]
+            assert len(val) == 2
+            original_image_sizes.append((val[0], val[1]))
+
+        # Transform the input
         samples, targets = self.transform(inputs, targets)
+        # Compute the detections
         detections = self.model(samples.tensors, targets=targets)
+        # Rescale coordinate
         detections = self.transform.postprocess(detections, samples.image_sizes, original_image_sizes)
         return detections
 
diff --git a/models/transform.py b/models/transform.py
index a952c7b2..ee41c9dc 100644
--- a/models/transform.py
+++ b/models/transform.py
@@ -35,6 +35,16 @@ def __repr__(self):
 
 
 class GeneralizedYOLOTransform(nn.Module):
+    """
+    Performs input / target transformation before feeding the data to a GeneralizedRCNN
+    model.
+
+    The transformations it perform are:
+        - input normalization (mean subtraction and std division)
+        - input / target resizing to match min_size / max_size
+
+    It returns a ImageList for the inputs, and a List[Dict[Tensor]] for the targets
+    """
     def __init__(self, min_size, max_size) -> None:
         super().__init__()
         if not isinstance(min_size, (list, tuple)):
@@ -46,7 +56,8 @@ def forward(
         self,
         images: List[Tensor],
         targets: Optional[List[Dict[str, Tensor]]],
-    ) -> Tuple[NestedTensor, Optional[List[Dict[str, Tensor]]]]:
+    ) -> Tuple[NestedTensor, Optional[Tensor]]:
+
         images = [img for img in images]
         if targets is not None:
             # make a copy of targets to avoid modifying it in-place
@@ -75,14 +86,37 @@ def forward(
                 targets[i] = target_index
 
         image_sizes = [img.shape[-2:] for img in images]
-        images = self.nested_tensor_from_tensor_list(images)
+        images = nested_tensor_from_tensor_list(images)
         image_sizes_list: List[Tuple[int, int]] = []
         for image_size in image_sizes:
             assert len(image_size) == 2
             image_sizes_list.append((image_size[0], image_size[1]))
 
         image_list = NestedTensor(images, image_sizes_list)
-        return image_list, targets
+
+        if targets is not None:
+            targets_batched = []
+            for i, target in enumerate(targets):
+                num_objects = len(target['labels'])
+                if num_objects > 0:
+                    targets_merged = torch.full((num_objects, 6), i, dtype=torch.float32)
+                    targets_merged[:, 1] = target['labels']
+                    targets_merged[:, 2:] = target['boxes']
+                    targets_batched.append(targets_merged)
+            targets_batched = torch.cat(targets_batched, dim=0)
+        else:
+            targets_batched = None
+
+        return image_list, targets_batched
+
+    def torch_choice(self, k: List[int]) -> int:
+        """
+        Implements `random.choice` via torch ops so it can be compiled with
+        TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
+        is fixed.
+        """
+        index = int(torch.empty(1).uniform_(0., float(len(k))).item())
+        return k[index]
 
     def resize(
         self,
@@ -116,8 +150,7 @@ def postprocess(
         image_shapes: List[Tuple[int, int]],
         original_image_sizes: List[Tuple[int, int]],
     ) -> List[Dict[str, Tensor]]:
-        if self.training:
-            return result
+
         for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
             boxes = pred["boxes"]
             boxes = resize_boxes(boxes, im_s, o_im_s)
@@ -146,7 +179,7 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisible: in
             pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
     else:
         raise ValueError('not supported')
-    return NestedTensor(tensor_batched)
+    return tensor_batched
 
 
 def _max_by_axis(the_list: List[List[int]]) -> List[int]:
@@ -160,7 +193,7 @@ def _max_by_axis(the_list: List[List[int]]) -> List[int]:
 # _onnx_nested_tensor_from_tensor_list() is an implementation of
 # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
 @torch.jit.unused
-def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisible: int = 32) -> NestedTensor:
+def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisible: int = 32) -> Tensor:
     max_size = []
     for i in range(tensor_list[0].dim()):
         max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
@@ -183,7 +216,7 @@ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisib
 
     tensor = torch.stack(padded_imgs)
 
-    return NestedTensor(tensor)
+    return tensor
 
 
 @torch.jit.unused
diff --git a/models/yolo.py b/models/yolo.py
index 85039480..d54b62cc 100644
--- a/models/yolo.py
+++ b/models/yolo.py
@@ -88,6 +88,7 @@ def forward(
             samples (NestedTensor): Expects a NestedTensor, which consists of:
                - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
             targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
+
         Returns:
             result (list[BoxList] or dict[Tensor]): the output from the model.
                 During training, it returns a dict[Tensor] which contains the losses.

From e94abbde166b0dcaee398636cc636b9474b7ea79 Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Tue, 2 Feb 2021 11:54:25 -0500
Subject: [PATCH 4/9] Fix torch jit tracing

---
 models/pl_wrapper.py |  2 +-
 models/transform.py  | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/models/pl_wrapper.py b/models/pl_wrapper.py
index d860cfb1..b9fcf4d6 100644
--- a/models/pl_wrapper.py
+++ b/models/pl_wrapper.py
@@ -48,7 +48,7 @@ def forward(
         self,
         inputs: List[Tensor],
         targets: Optional[List[Dict[str, Tensor]]] = None,
-    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
+    ) -> List[Dict[str, Tensor]]:
         """
         Args:
             images (list[Tensor]): images to be processed
diff --git a/models/transform.py b/models/transform.py
index ee41c9dc..c9ad13f7 100644
--- a/models/transform.py
+++ b/models/transform.py
@@ -146,17 +146,22 @@ def resize(
 
     def postprocess(
         self,
-        result: List[Dict[str, Tensor]],
+        result: Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]],
         image_shapes: List[Tuple[int, int]],
         original_image_sizes: List[Tuple[int, int]],
     ) -> List[Dict[str, Tensor]]:
 
-        for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
+        if torch.jit.is_scripting():
+            predictions = result[1]
+        else:
+            predictions = result
+
+        for i, (pred, im_s, o_im_s) in enumerate(zip(predictions, image_shapes, original_image_sizes)):
             boxes = pred["boxes"]
             boxes = resize_boxes(boxes, im_s, o_im_s)
-            result[i]["boxes"] = boxes
+            predictions[i]["boxes"] = boxes
 
-        return result
+        return predictions
 
 
 def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisible: int = 32):

From 1b0145309093c9a94209e6f2683fe0c577fb4ceb Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Tue, 2 Feb 2021 12:04:24 -0500
Subject: [PATCH 5/9] Fix bugs in torchscript

---
 test/test_torchscript.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_torchscript.py b/test/test_torchscript.py
index 97d0f9eb..3d83897f 100644
--- a/test/test_torchscript.py
+++ b/test/test_torchscript.py
@@ -16,7 +16,7 @@ def test_yolov5s_script(self):
         x = [torch.rand(3, 416, 320), torch.rand(3, 480, 352)]
 
         out = model(x)
-        out_script = scripted_model(x)[1]
+        out_script = scripted_model(x)
         self.assertTrue(out[0]["scores"].equal(out_script[0]["scores"]))
         self.assertTrue(out[0]["labels"].equal(out_script[0]["labels"]))
         self.assertTrue(out[0]["boxes"].equal(out_script[0]["boxes"]))
@@ -31,7 +31,7 @@ def test_yolov5m_script(self):
         x = [torch.rand(3, 416, 320), torch.rand(3, 480, 352)]
 
         out = model(x)
-        out_script = scripted_model(x)[1]
+        out_script = scripted_model(x)
         self.assertTrue(out[0]["scores"].equal(out_script[0]["scores"]))
         self.assertTrue(out[0]["labels"].equal(out_script[0]["labels"]))
         self.assertTrue(out[0]["boxes"].equal(out_script[0]["boxes"]))
@@ -46,7 +46,7 @@ def test_yolov5l_script(self):
         x = [torch.rand(3, 416, 320), torch.rand(3, 480, 352)]
 
         out = model(x)
-        out_script = scripted_model(x)[1]
+        out_script = scripted_model(x)
         self.assertTrue(out[0]["scores"].equal(out_script[0]["scores"]))
         self.assertTrue(out[0]["labels"].equal(out_script[0]["labels"]))
         self.assertTrue(out[0]["boxes"].equal(out_script[0]["boxes"]))

From 2f64c397dde213db935b9b1d74c7aa77d9a37287 Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Tue, 2 Feb 2021 12:21:05 -0500
Subject: [PATCH 6/9] The training mechanism has been changed

---
 models/pl_wrapper.py |  1 +
 test/test_engine.py  | 12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/models/pl_wrapper.py b/models/pl_wrapper.py
index b9fcf4d6..bfd16a3b 100644
--- a/models/pl_wrapper.py
+++ b/models/pl_wrapper.py
@@ -74,6 +74,7 @@ def forward(
         detections = self.model(samples.tensors, targets=targets)
         # Rescale coordinate
         detections = self.transform.postprocess(detections, samples.image_sizes, original_image_sizes)
+
         return detections
 
     def training_step(self, batch, batch_idx):
diff --git a/test/test_engine.py b/test/test_engine.py
index 3f7d17e7..df8c613f 100644
--- a/test/test_engine.py
+++ b/test/test_engine.py
@@ -6,6 +6,8 @@
 from .dataset_utils import create_loaders, DummyDetectionDataset
 
 from models import YOLOLitWrapper
+from models.yolo import yolov5_darknet_pan_s_r31
+from models.transform import nested_tensor_from_tensor_list
 
 from typing import Dict
 
@@ -17,15 +19,16 @@ def test_train(self):
         img_name = "test/assets/zidane.jpg"
         img_tensor = image_preprocess(img_name)
         self.assertEqual(img_tensor.ndim, 3)
+        # Add a dummy image to train
         img_dummy = torch.rand((3, 416, 360), dtype=torch.float32)
 
-        images = [img_tensor, img_dummy]
+        images = nested_tensor_from_tensor_list([img_tensor, img_dummy])
         targets = torch.tensor([[0, 7, 0.3790, 0.5487, 0.3220, 0.2047],
                                 [0, 2, 0.2680, 0.5386, 0.2200, 0.1779],
                                 [0, 3, 0.1720, 0.5403, 0.1960, 0.1409],
                                 [0, 4, 0.2240, 0.4547, 0.1520, 0.0705]], dtype=torch.float)
 
-        model = YOLOLitWrapper(num_classes=12)
+        model = yolov5_darknet_pan_s_r31(num_classes=12)
         model.train()
         out = model(images, targets)
         self.assertIsInstance(out, Dict)
@@ -33,10 +36,15 @@ def test_train(self):
         self.assertIsInstance(out["bbox_regression"], torch.Tensor)
         self.assertIsInstance(out["objectness"], torch.Tensor)
 
+    @unittest.skip("Current it isn't well implemented")
     def test_train_one_step(self):
+        # Determine the device
+        gpu_available = torch.cuda.is_available()
+        device = torch.device("cuda") if gpu_available else torch.device("cpu")
         # Load model
         model = YOLOLitWrapper()
         model.train()
+        model = model.to(device)
 
         # Datasets
         datasets = DummyDetectionDataset(num_samples=200)

From 2e6bf5f1a0e6ab9ac941d84c7bf5a0b43066a87c Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Tue, 2 Feb 2021 12:30:14 -0500
Subject: [PATCH 7/9] Bug fix in libtorch unittest

---
 test/tracing/test_tracing.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/tracing/test_tracing.cpp b/test/tracing/test_tracing.cpp
index ace361bc..29622936 100644
--- a/test/tracing/test_tracing.cpp
+++ b/test/tracing/test_tracing.cpp
@@ -34,7 +34,7 @@ int main() {
   inputs.push_back(images);
   auto output = module.forward(inputs);
 
-  auto detections = output.toTuple()->elements()[1];
+  auto detections = output.toTuple()->elements();
 
   std::cout << ">> OKey, detections: " << detections << std::endl;
 
@@ -53,7 +53,7 @@ int main() {
     inputs.push_back(images);
     auto output = module.forward(inputs);
 
-    auto detections = output.toTuple()->elements()[1];
+    auto detections = output.toTuple()->elements();
 
     std::cout << ">> OKey, detections: " << detections << std::endl;
   }

From 7647104e1ef9ba42af372140bf960c051d37d3b1 Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Tue, 2 Feb 2021 12:44:20 -0500
Subject: [PATCH 8/9] Currently ignore the PyTorchLightning training

---
 test/test_engine.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/test_engine.py b/test/test_engine.py
index df8c613f..0d3e21bb 100644
--- a/test/test_engine.py
+++ b/test/test_engine.py
@@ -38,13 +38,9 @@ def test_train(self):
 
     @unittest.skip("Current it isn't well implemented")
     def test_train_one_step(self):
-        # Determine the device
-        gpu_available = torch.cuda.is_available()
-        device = torch.device("cuda") if gpu_available else torch.device("cpu")
         # Load model
         model = YOLOLitWrapper()
         model.train()
-        model = model.to(device)
 
         # Datasets
         datasets = DummyDetectionDataset(num_samples=200)

From 1ac7a600f493607adecac820c69a9756cca8c649 Mon Sep 17 00:00:00 2001
From: zhiqwang <zhiqwang@outlook.com>
Date: Tue, 2 Feb 2021 12:47:45 -0500
Subject: [PATCH 9/9] Bug fix in libtorch unittest

---
 test/tracing/test_tracing.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/test/tracing/test_tracing.cpp b/test/tracing/test_tracing.cpp
index 29622936..1aa069a4 100644
--- a/test/tracing/test_tracing.cpp
+++ b/test/tracing/test_tracing.cpp
@@ -32,9 +32,7 @@ int main() {
   images.push_back(torch::rand({3, 480, 384}));
 
   inputs.push_back(images);
-  auto output = module.forward(inputs);
-
-  auto detections = output.toTuple()->elements();
+  auto detections = module.forward(inputs);
 
   std::cout << ">> OKey, detections: " << detections << std::endl;
 
@@ -51,9 +49,7 @@ int main() {
     images.push_back(torch::rand({3, 480, 384}, options));
 
     inputs.push_back(images);
-    auto output = module.forward(inputs);
-
-    auto detections = output.toTuple()->elements();
+    auto detections = module.forward(inputs);
 
     std::cout << ">> OKey, detections: " << detections << std::endl;
   }