Fixed issue when converting but not saving bbox formats

- allowed changing of State.bbox under some circumstances
- added some better explanations to the pose_similarity.py functions
bmmtstb · Jul 17, 2024 · 0450085 · 0450085
1 parent d04e2e6
commit 0450085
Show file tree

Hide file tree

Showing 10 changed files with 61 additions and 37 deletions.
diff --git a/dgs/default_values.yaml b/dgs/default_values.yaml
@@ -58,7 +58,7 @@ submission:
     file: "./submission.json"
 
     MOT:
-        bbox_decimals: 0
+        bbox_decimals: 3
         seqinfo_key: "Sequence"
 
 # ############## #

diff --git a/dgs/models/dataset/keypoint_rcnn.py b/dgs/models/dataset/keypoint_rcnn.py
@@ -137,7 +137,7 @@ def images_to_states(self, images: Images) -> list[State]:
         # make sure all images are float
         images = [tvte.Image(to_dtype(img, dtype=t.float32, scale=True)) for img in images]
 
-        # predicts a list of {boxes: XYWH[N], labels: Int64[N], scores: [N], keypoints: Float[N,J,(x|y|vis)]}
+        # predicts a list of {boxes: XYXY[N], labels: Int64[N], scores: [N], keypoints: Float[N,J,(x|y|vis)]}
         # every image in images can have multiple predictions
         outputs: list[dict[str, t.Tensor]] = self.model(images)
 

diff --git a/dgs/models/dataset/posetrack21.py b/dgs/models/dataset/posetrack21.py
@@ -723,7 +723,7 @@ def _get_anno_data(
             return (
                 t.empty((0, 17, 2)),
                 t.empty((0, 17, 1)),
-                tvte.BoundingBoxes(t.empty((0, 4)), canvas_size=(0, 0), format="XYXY"),
+                tvte.BoundingBoxes(t.empty((0, 4)), canvas_size=(0, 0), format="XYWH"),
                 (),
             )
         return collate_tensors(keypoints), collate_tensors(visibilities), collate_bboxes(bboxes), tuple(crop_paths)

diff --git a/dgs/models/similarity/pose_similarity.py b/dgs/models/similarity/pose_similarity.py
@@ -27,7 +27,7 @@
 
 
 class ObjectKeypointSimilarity(SimilarityModule):
-    """Compute the object key-point similarity (OKS) between two batches of poses.
+    """Compute the object key-point similarity (OKS) between two batches of poses / States.
 
     Params
     ------
@@ -57,7 +57,7 @@ def __init__(self, config: Config, path: NodePath):
             "eps", torch.tensor(torch.finfo(self.precision).eps, device=self.device, dtype=self.precision)
         )
         # Set up a transform function to convert the bounding boxes if they have the wrong format
-        self.transform = ConvertBoundingBoxFormat("XYXY")
+        self.transf_bbox_to_xyxy = ConvertBoundingBoxFormat("XYXY")
 
         # Set up softmax function if requested
         self.softmax = nn.Sequential()
@@ -81,7 +81,7 @@ def get_data(self, ds: State) -> tuple[torch.Tensor, torch.Tensor]:
         elif bboxes.format == BoundingBoxFormat.XYWH:
             area = bboxes[:, -2] * bboxes[:, -1]  # w * h
         else:
-            bboxes = self.transform(bboxes)
+            bboxes = self.transf_bbox_to_xyxy(bboxes)
             area = box_area(bboxes).float()
 
         return kps, area
@@ -121,36 +121,39 @@ def forward(self, data: State, target: State) -> torch.Tensor:
                 * 2 = labeled but visible
             * :math:`s` the scale of the ground truth object, with :math:`s^2` becoming the object's segmented area
 
-        Fixme: exclude ignore regions from image_shape ?
-
         Args:
-            data: A :class:`State` object containing at least the key points and the bounding box.
-            target: A :class:`State` containing at least the target key points.
+            data: A :class:`State` object containing at least the key points and the bounding box. Shape ``N``.
+            target: A :class:`State` containing at least the target key points. Shape ``T``.
+
+        Returns:
+            A (Float)Tensor of shape ``[N x T]`` with values in ``[0..1]``.
+            If requested, the softmax is computed along the -1 dimension,
+            resulting in probability distributions for each value of the input data.
         """
-        # get predicted key-points as [B1 x J x 2] and bbox area as [B1]
+        # get predicted key-points as [N x J x 2] and bbox area as [N]
         pred_kps, bbox_area = self.get_data(ds=data)
-        # get ground-truth key-points as [B2 x J x 2] and visibility as [B2 x J]
+        # get ground-truth key-points as [T x J x 2] and visibility as [T x J]
         gt_kps, gt_vis = self.get_target(ds=target)
         assert pred_kps.size(-1) == gt_kps.size(-1), "Key-points should have the same number of dimensions"
         # Compute d = Euclidean dist, but don't compute the sqrt, because only d^2 is required.
-        # A little tensor magic, because if B1 != B2 and B1 != 1 and B2 != 1, regular subtraction will fail!
-        # Therefore, modify the tensors to have shape [B1 x J x 2 x 1], [(1 x) J x 2 x B2].
-        # The output has shape [B1 x J x 2 x B2], then square and sum over the number of dimensions (-2).
+        # A little tensor magic, because if N != T and N != 1 and T != 1, regular subtraction will fail!
+        # Therefore, modify the tensors to have shape [N x J x 2 x 1], [(1 x) J x 2 x T].
+        # The output has shape [N x J x 2 x T], then square and sum over the number of dimensions (-2).
         d2 = torch.sum(
             torch.sub(pred_kps.unsqueeze(-1), gt_kps.permute(1, 2, 0)).square(),
             dim=-2,
-        )  # -> [B1 x J x B2]
+        )  # -> [N x J x T]
         # Ground truth scale as bounding box area in relation to the image area it lies within.
         # Keep area s^2, because s is never used.
-        s2 = bbox_area.flatten()  # [B1]
+        s2 = bbox_area.flatten()  # [N]
         # Keypoint similarity for every key-point pair of ground truth and detected.
-        # Use outer product to combine s^2 [B1] with k^2 [J] and add epsilon to make sure to have non-zero values.
+        # Use outer product to combine s^2 [N] with k^2 [J] and add epsilon to make sure to have non-zero values.
         # Again, modify the tensor shapes to match for division.
-        # Shapes: d2 [B1 x J x B2], new_outer [B1 x J x 1]
-        ks = torch.exp(-torch.div(d2, (2 * torch.outer(s2, self.k2) + self.eps).unsqueeze(-1)))  # -> [B1 x J x B2]
+        # Shapes: d2 [N x J x T], new_outer [N x J x 1]
+        ks = torch.exp(-torch.div(d2, (2 * torch.outer(s2, self.k2) + self.eps).unsqueeze(-1)))  # -> [N x J x T]
         # The count of non-zero visibilities in the ground-truth
-        count = torch.count_nonzero(gt_vis, dim=-1)  # [B2]
-        # for every pair in B, sum over all J
+        count = torch.count_nonzero(gt_vis, dim=-1)  # [T]
+        # with ks [N x J x T], sum over all J and divide by the number of non-zero visibilities
         return self.softmax(torch.div(torch.where(gt_vis.T, ks, 0).sum(dim=-2), count).nan_to_num_(nan=0.0, posinf=0.0))
 
 
@@ -178,7 +181,7 @@ def __init__(self, config: Config, path: NodePath):
 
     def get_data(self, ds: State) -> BoundingBoxes:
         """Given a :class:`State` obtain the ground-truth bounding-boxes as
-        :class:`torchvision.tv_tensors.BoundingBoxes` object of size ``[B1 x 4]``.
+        :class:`torchvision.tv_tensors.BoundingBoxes` object of size ``[N x 4]``.
 
         Notes:
             The box_iou function expects that the bounding boxes are in the 'XYXY' format.
@@ -190,7 +193,7 @@ def get_data(self, ds: State) -> BoundingBoxes:
 
     def get_target(self, ds: State) -> BoundingBoxes:
         """Given a :class:`State` obtain the ground-truth bounding-boxes as
-        :class:`torchvision.tv_etnsors.BoundingBoxes` object of size ``[B2 x 4]``.
+        :class:`torchvision.tv_tensors.BoundingBoxes` object of size ``[T x 4]``.
 
         Notes:
             The function :func:`box_iou` expects that the bounding boxes are in the 'XYXY' format.
@@ -201,4 +204,15 @@ def get_target(self, ds: State) -> BoundingBoxes:
         return bboxes
 
     def forward(self, data: State, target: State) -> torch.Tensor:
+        """Given two states containing bounding-boxes, compute the intersection over union between each pair.
+
+        Args:
+            data: A :class:`State` object containing the detected bounding-boxes. Size ``N``
+            target: A :class:`State` object containing the target bounding-boxes. Size ``T``
+
+        Returns:
+            A (Float)Tensor of shape ``[N x T]`` with values in ``[0..1]``.
+            If requested, the softmax is computed along the -1 dimension,
+            resulting in probability distributions for each value of the input data.
+        """
         return self.softmax(box_iou(self.get_data(ds=data), self.get_target(ds=target)))
diff --git a/dgs/models/similarity/torchreid.py b/dgs/models/similarity/torchreid.py
@@ -129,11 +129,13 @@ def forward(self, data: State, target: State) -> torch.Tensor:
 
         Returns:
             A similarity matrix containing values describing the similarity between every current- and target-embedding.
-            The similarity should be (Float)Tensor of shape ``[a x b]`` with values in ``[0..1]``.
+            The similarity is a (Float)Tensor of shape ``[a x b]`` with values in ``[0..1]``.
             If the provided metric does not return a probability distribution,
             you might want to change the metric or set the 'softmax' parameter of this module,
             or within the :class:`DGSModule` if this is a submodule.
             Computing the softmax ensures better / correct behavior when combining this similarity with others.
+            If requested, the softmax is computed along the -1 dimension,
+            resulting in probability distributions for each value of the input data.
         """
         pred_embeds = self.get_data(ds=data)
         targ_embeds = self.get_target(ds=target)

diff --git a/dgs/models/submission/MOT.py b/dgs/models/submission/MOT.py
@@ -69,7 +69,8 @@ def _get_bbox_value(_s: State, idx: int) -> str:
 
         # convert bbox format to receive the height and width more easily later on
         if s.bbox.format != tvte.BoundingBoxFormat.XYWH:
-            convert_bounding_box_format(s.bbox, new_format=tvte.BoundingBoxFormat.XYWH)
+            s.bbox = convert_bounding_box_format(s.bbox, new_format=tvte.BoundingBoxFormat.XYWH)
+        assert s.bbox.format == tvte.BoundingBoxFormat.XYWH, f"got format: {s.bbox.format}"
         detections = s.split()
         for det in detections:
             tid = det["pred_tid"].item() + 1  # MOT is 1-indexed, but State is 0-indexed

diff --git a/dgs/models/submission/posetrack21.py b/dgs/models/submission/posetrack21.py
@@ -120,7 +120,10 @@ def get_anno_data(s: State) -> list[dict[str, any]]:
 
         # get the annotation data
         anno_data = []
-        bboxes = convert_bounding_box_format(s.bbox, new_format=tvte.BoundingBoxFormat.XYWH)
+        if s.bbox.format != tvte.BoundingBoxFormat.XYWH:
+            s.bbox = convert_bounding_box_format(s.bbox, new_format=tvte.BoundingBoxFormat.XYWH)
+        assert s.bbox.format == tvte.BoundingBoxFormat.XYWH, f"got format: {s.bbox.format}"
+
         for i in range(s.B):
             kps = t.cat([s.keypoints[i], s.joint_weight[i]], dim=-1)
             scores: list[float]
@@ -134,7 +137,7 @@ def get_anno_data(s: State) -> list[dict[str, any]]:
 
             anno_data.append(
                 {
-                    "bboxes": bboxes[i].flatten().tolist(),
+                    "bboxes": s.bbox[i].flatten().tolist(),
                     "keypoints": kps.flatten().tolist(),
                     "scores": scores,
                     "score": (

diff --git a/dgs/utils/state.py b/dgs/utils/state.py
@@ -201,10 +201,11 @@ def bbox(self) -> tv_tensors.BoundingBoxes:
 
     @bbox.setter
     def bbox(self, bbox: tv_tensors) -> None:
-        raise NotImplementedError(
-            "It is not allowed to change the bounding box of an already existing State object. "
-            "Create a new object instead!"
-        )
+        if not isinstance(bbox, tv_tensors.BoundingBoxes):
+            raise TypeError(f"Expected bounding box, got {type(bbox)}")
+        if bbox.shape != self.bbox.shape:
+            raise ValueError(f"Can't switch bbox shape. Expected {self.bbox.shape} but got {bbox.shape}")
+        self.data["bbox"] = bbox.to(device=self.bbox.device)
 
     @property
     def device(self):

diff --git a/scripts/helpers/extract_bboxes_MOT.py b/scripts/helpers/extract_bboxes_MOT.py
@@ -18,7 +18,7 @@
 
 CONFIG_FILE: str = "./configs/helpers/predict_rcnn.yaml"
 
-SCORE_THRESHS: list[float] = [0.85, 0.90, 0.95]
+SCORE_THRESHS: list[float] = [0.85, 0.90, 0.95, 0.99]
 IOU_THRESHS: list[float] = [1.0]  # basically deactivate IoU thresh
 
 RCNN_DL_KEYS: list[str] = [

diff --git a/tests/utils/state/test__state.py b/tests/utils/state/test__state.py
@@ -136,11 +136,14 @@ def test_keypoints(self):
                     ds = State(**{"bbox": DUMMY_BBOX, scope: DUMMY_KP, "validate": validate})
                     setattr(ds, scopes[(i + 1) % 2], DUMMY_KP)
 
-    def test_setting_bbox_fails(self):
+    def test_setting_bbox_exceptions(self):
         ds = State(**DUMMY_DATA)
-        with self.assertRaises(NotImplementedError) as e:
-            ds.bbox = DUMMY_BBOX
-        self.assertTrue("not allowed to change the bounding box of an already" in str(e.exception), msg=e.exception)
+        with self.assertRaises(TypeError) as e:
+            ds.bbox = DUMMY_BBOX_TENSOR
+        self.assertTrue("Expected bounding box, got" in str(e.exception), msg=e.exception)
+        with self.assertRaises(ValueError) as e:
+            ds.bbox = DUMMY_BBOX_BATCH
+        self.assertTrue("Can't switch bbox shape. Expected" in str(e.exception), msg=e.exception)
 
     def test_filepath(self):
         for validate in [True, False]: