Predict using images generated from Video

- many bug fixes - separated images and videos in test_data folder - updated lucid-chart link - torchreid similarity now uses torchreid embedding generator Signed-off-by: Martin <[email protected]>
bmmtstb · Apr 11, 2024 · 63559d2 · 63559d2
1 parent 8e56365
commit 63559d2
Show file tree

Hide file tree

Showing 21 changed files with 375 additions and 264 deletions.
diff --git a/README.md b/README.md
@@ -13,5 +13,5 @@ You can find the extended Documentation [here](https://bmmtstb.github.io/dynamic
 ## Pipeline
 
 You can find a visual Pipeline on
-[LucidChart](https://lucid.app/documents/view/848ef9df-ac3d-464d-912f-f5760b6cfbe9)
-or directly download it as PDF [here](https://lucid.app/publicSegments/view/ddbebe1b-4bd3-46b8-9dfd-709b281c4b01).
+[LucidChart](https://lucid.app/lucidchart/848ef9df-ac3d-464d-912f-f5760b6cfbe9/edit?viewport_loc=19%2C-867%2C1761%2C3019%2CnKP9V3Rhwz2T&invitationId=inv_e5a52469-f95f-414f-a78b-3416435fcb2d)
+or directly download it as PDF [here](./docs/algorithm_structure/Pipeline%20-%20Main.pdf).
diff --git a/configs/predict_images.yaml b/configs/predict_images.yaml
@@ -0,0 +1,58 @@
+name: "Predict-Images"
+description: "Use the DGS tracker to track and predict given a directory of images as input."
+
+device: "cuda"
+print_prio: "DEBUG"
+is_training: off
+log_dir: "./results/own/dgs_predict/rcnn_images/"
+
+test:
+    inactivity_threshold: 15
+    max_track_length: 1
+    save_images: on
+    show_keypoints: on
+    show_skeleton: on
+    draw_kwargs:
+        bbox_font_size: 90
+        bbox_width: 8
+
+# #### #
+# DATA #
+# #### #
+
+dataloader_test:
+    module_name: "ImageRCNN"
+    dataset_path: "./data/"
+    path: "./test/"
+    batch_size: 1  # fixme
+    threshold: 0.75
+
+# ####### #
+# MODULES #
+# ####### #
+
+dgs:
+    module_name: "DGS"
+    combine: "combine_similarities"
+    names: ["visual_similarity", "pose_similarity", "box_similarity"]
+
+combine_similarities:
+    module_name: "constant_alpha"
+    alpha: [0.5, 0.25, 0.25]
+
+visual_similarity:
+    module_name: "torchreid"
+    similarity: "NegSoftmaxEuclideanDist"
+    embedding_generator_path: ["visual_similarity", "vis_emb_gen"]
+    vis_emb_gen:
+        module_name: "torchreid"
+        nof_classes: 1000
+        weights: "pretrained"
+        model_name: "osnet_x1_0"
+
+pose_similarity:
+    module_name: "oks"
+    format: "coco"
+
+box_similarity:
+    module_name: "iou"
diff --git a/configs/predict_video.yaml b/configs/predict_video.yaml
@@ -4,14 +4,14 @@ description: "Use the DGS tracker to track and predict a video input."
 device: "cuda"
 print_prio: "DEBUG"
 is_training: off
-log_dir: "./results/own/dgs_predict/rcnn/"
+log_dir: "./results/own/dgs_predict/rcnn_video/"
 
 test:
     inactivity_threshold: 15
     max_track_length: 1
     save_images: on
-    show_keypoints: off
-    show_skeleton: off
+    show_keypoints: on
+    show_skeleton: on
     draw_kwargs:
         bbox_font_size: 60
         bbox_width: 5
@@ -42,9 +42,13 @@ combine_similarities:
 
 visual_similarity:
     module_name: "torchreid"
-    model_name: "osnet_x1_0"
     similarity: "NegSoftmaxEuclideanDist"
-    weights: "pretrained"
+    embedding_generator_path: ["visual_similarity", "vis_emb_gen"]
+    vis_emb_gen:
+        module_name: "torchreid"
+        nof_classes: 1000
+        weights: "pretrained"
+        model_name: "osnet_x1_0"
 
 pose_similarity:
     module_name: "oks"

diff --git a/dgs/default_values.yaml b/dgs/default_values.yaml
@@ -48,4 +48,8 @@ dgs_engine:
 
 backbone:
     kprcnn:
-        threshold: 0.3
+        threshold: 0.3
+
+similarity:
+    torchreid:
+        compute_softmax: true
diff --git a/dgs/models/dataset/dataset.py b/dgs/models/dataset/dataset.py
@@ -346,8 +346,8 @@ class VideoDataset(BaseDataset, ABC):
     Params
     ------
 
-    path (:obj:`.FilePath`)
-
+    path (:obj:`.FilePath`):
+        A single path to a video file.
 
     Optional Params
     ---------------

diff --git a/dgs/models/dataset/keypoint_rcnn.py b/dgs/models/dataset/keypoint_rcnn.py
@@ -1,15 +1,15 @@
 """
-Use 'keypointrcnn_resnet50_fpn' from PyTorch.
+Use :func:`.keypointrcnn_resnet50_fpn` to predict the key points and bounding boxes of each image.
 
 References:
-    https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html#torchvision.models.detection.keypointrcnn_resnet50_fpn
+    https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html
 """
 
 import os
 from abc import ABC
 
 import torch
-from torch.nn import Module as TorchModule
+from torch import nn
 from torchvision import tv_tensors as tvte
 from torchvision.io import VideoReader
 from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights
@@ -34,11 +34,23 @@
 }
 
 
-class KeypointRCNNBackbone(BaseDataset, TorchModule, ABC):
+class KeypointRCNNBackbone(BaseDataset, nn.Module, ABC):
+    """Abstract base class for the torchvision Key Point RCNN backbone model.
+
+    This class sets up the RCNN model and validates and sets the module's basic parameters.
+
+    Params
+    ------
+
+    threshold (float):
+        Detections with a score lower than the threshold will be ignored.
+        Default `DEF_CONF.backbone.kprcnn.threshold`.
+
+    """
 
     def __init__(self, config: Config, path: NodePath) -> None:
         BaseDataset.__init__(self, config, path)
-        TorchModule.__init__(self)
+        nn.Module.__init__(self)
 
         self.validate_params(rcnn_validations)
 
@@ -49,8 +61,16 @@ def __init__(self, config: Config, path: NodePath) -> None:
         self.model.eval()
         self.model.to(self.device)
 
-    def outputs_to_states(self, outputs: list[dict], images: Images) -> State:
-        """"""
+    def images_to_states(self, images: Images) -> State:
+        """Given a list of images, use the key-point-RCNN model to predict key points and bounding boxes,
+        then create a :class:`State` containing the available information.
+
+        Notes:
+            Does not add the original image to the new State, to reduce memory / GPU usage.
+        """
+
+        outputs = self.model(images)
+
         states = []
         canvas_size = (max(i.shape[-2] for i in images), max(i.shape[-1] for i in images))
 
@@ -67,10 +87,9 @@ def outputs_to_states(self, outputs: list[dict], images: Images) -> State:
                 .reshape((-1, 17, 3))
                 .split([2, 1], dim=-1)
             )
-            new_images = [tvte.Image(image.unsqueeze(0)) for _ in range(len(bbox))]
 
             crops, loc_kps = extract_crops_from_images(
-                imgs=new_images,
+                imgs=[tvte.Image(image.unsqueeze(0)) for _ in range(len(bbox))],
                 bboxes=bbox,
                 kps=kps,
                 crop_size=self.params.get("crop_size", DEF_CONF.images.crop_size),
@@ -81,14 +100,15 @@ def outputs_to_states(self, outputs: list[dict], images: Images) -> State:
                 crops = tvte.wrap(crops.unsqueeze(0), like=crops)
 
             data = {
-                "validate": True,  # fixme remove
-                # "validate": False,
+                # "validate": True,  # fixme remove
+                "validate": False,
                 "bbox": bbox,
                 "image_crop": crops,
                 "keypoints": kps,
                 "keypoints_local": loc_kps,
                 "joint_weight": vis,
                 "scores": output["scores"],
+                "skeleton_name": "coco",
             }
             states.append(State(**data))
 
@@ -101,7 +121,7 @@ class KeypointRCNNImageBackbone(KeypointRCNNBackbone):
     Predicts 17 key-points (like COCO).
 
     References:
-        https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html#torchvision.models.detection.keypointrcnn_resnet50_fpn
+        https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html
 
     Params
     ------
@@ -132,7 +152,9 @@ def __init__(self, config: Config, path: NodePath) -> None:
         self.data = []
         path = self.params["path"]
         if isinstance(path, list):
-            self.data = path
+            assert all(isinstance(p, str) for p in path), "Path is a list but not all values are string"
+            assert all(any(p.lower().endswith(end) for end in IMAGE_FORMATS) for p in path), "Not all values are images"
+            self.data = sorted(path)
         elif isinstance(path, str):
             path = self.get_path_in_dataset(path)
             if is_file(path):
@@ -148,40 +170,39 @@ def __init__(self, config: Config, path: NodePath) -> None:
                 # directory of images
                 self.data = [
                     os.path.normpath(os.path.join(path, child_path))
-                    for child_path in tqdm(os.listdir(path), desc="Loading images", total=len(os.listdir(path)))
+                    for child_path in tqdm(sorted(os.listdir(path)), desc="Loading images", total=len(os.listdir(path)))
                     if any(child_path.lower().endswith(ending) for ending in IMAGE_FORMATS)
                 ]
             else:
-                raise ValueError(f"string is neither file nor dir. Got '{path}'.")  # pragma: no cover
+                raise NotImplementedError(f"string is neither file nor dir. Got '{path}'.")
         else:
-            raise TypeError(
+            raise NotImplementedError(
                 f"Unknown path object, expected filepath, dirpath, or list of filepaths. Got {type(path)}"
-            )  # pragma: no cover
+            )
 
     def arbitrary_to_ds(self, a: FilePath, idx: int) -> State:
         """Given a filepath, predict the bounding boxes and key-points of the respective image.
         Return a State containing all the available information.
         Because the filepath is known, the image is not saved in the State, to reduce the space-overhead on the GPU.
         """
-        # the torch model expects a 3D image
-        images = [convert_image_dtype(tvte.Image(load_image(a), device=self.device), dtype=torch.float32)]
-
-        outputs = self.model(images)
+        # the torch model expects a list of 3D images
+        images = [convert_image_dtype(tvte.Image(load_image(a).squeeze(0), device=self.device), dtype=torch.float32)]
 
-        s = self.outputs_to_states(outputs=outputs, images=images)
+        s = self.images_to_states(images=images)
 
         s.filepath = tuple(a for _ in range(len(s)))
 
         return s
 
 
+# pylint: disable=too-many-ancestors
 class KeypointRCNNVideoBackbone(KeypointRCNNBackbone, VideoDataset):
     """A Dataset that gets the path to a single Video file and predicts the bounding boxes and key points of the Video.
 
     Predicts 17 key-points (like COCO).
 
     References:
-        https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html#torchvision.models.detection.keypointrcnn_resnet50_fpn
+        https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html
 
     Params
     ------
@@ -215,9 +236,7 @@ def arbitrary_to_ds(self, a: Image, idx: int) -> State:
         # the torch RCNN model expects a list of 3D images
         images = [convert_image_dtype(a, torch.float32)]
 
-        outputs = self.model(images)
-
-        s = self.outputs_to_states(outputs=outputs, images=images)
+        s = self.images_to_states(images=images)
 
         s.image = [a.unsqueeze(0) for _ in range(len(s))]
 

diff --git a/dgs/models/embedding_generator/torchreid.py b/dgs/models/embedding_generator/torchreid.py
@@ -9,6 +9,7 @@
 
 from dgs.models.embedding_generator.embedding_generator import EmbeddingGeneratorModule
 from dgs.utils.config import get_sub_config, insert_into_config
+from dgs.utils.exceptions import InvalidPathException
 from dgs.utils.files import to_abspath
 from dgs.utils.state import State
 from dgs.utils.torchtools import configure_torch_module, load_pretrained_weights
@@ -77,6 +78,8 @@ class TorchreidEmbeddingGenerator(EmbeddingGeneratorModule):
     model: nn.Module
 
     def __init__(self, config, path):
+        if path is None:
+            raise InvalidPathException("path is required but got None")
         sub_cfg = get_sub_config(config, path)
         if "embedding_size" in sub_cfg and sub_cfg["embedding_size"] != 512:
             warnings.warn(
@@ -105,6 +108,56 @@ def _init_model(self, pretrained: bool) -> nn.Module:
         # send model to the device
         return self.configure_torch_module(m, train=False)
 
+    def predict_embeddings(self, data: torch.Tensor) -> torch.Tensor:
+        """Predict embeddings given some input.
+
+        Args:
+            data: The input for the model, most likely a cropped image.
+
+        Returns:
+            Tensor containing a batch B of embeddings.
+            Shape: ``[B x E]``
+        """
+
+        def _get_torchreid_embeds(r) -> torch.Tensor:
+            """Torchreid returns embeddings during eval and ids during training."""
+            if isinstance(r, torch.Tensor):
+                # During model building, triplet loss was forced for torchreid models.
+                # Therefore, only one return value means that only the embeddings are returned
+                return r
+            if len(r) == 2:
+                _, embeddings = r
+                return embeddings
+            raise NotImplementedError("Unknown torchreid model output.")
+
+        results = self.model(data)
+        return _get_torchreid_embeds(results)
+
+    def predict_ids(self, data: torch.Tensor) -> torch.Tensor:
+        """Predict class IDs given some input.
+
+        Args:
+            data: The input for the model, most likely a cropped image.
+
+        Returns:
+            Tensor containing class predictions, which are not necessarily a probability distribution.
+            Shape: ``[B x num_classes]``
+        """
+
+        def _get_torchreid_ids(r) -> torch.Tensor:
+            """Torchreid returns embeddings during eval and ids during training."""
+            if isinstance(r, torch.Tensor):
+                # During model building, triplet loss was forced for torchreid models.
+                # Therefore, only one return value means that only the embeddings are returned
+                return self.model.classifier(r)
+            if len(r) == 2:
+                ids, _ = r
+                return ids
+            raise NotImplementedError("Unknown torchreid model output.")
+
+        results = self.model(data)
+        return _get_torchreid_ids(results)
+
     def forward(self, ds: State) -> torch.Tensor:
         """Predict embeddings given some input.