Skip to content

Commit

Permalink
Predict using images generated from Video
Browse files Browse the repository at this point in the history
- many bug fixes
- separated images and videos in test_data folder
- updated lucid-chart link
- torchreid similarity now uses torchreid embedding generator

Signed-off-by: Martin <[email protected]>
  • Loading branch information
bmmtstb committed Apr 11, 2024
1 parent 8e56365 commit 63559d2
Show file tree
Hide file tree
Showing 21 changed files with 375 additions and 264 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ You can find the extended Documentation [here](https://bmmtstb.github.io/dynamic
## Pipeline

You can find a visual Pipeline on
[LucidChart](https://lucid.app/documents/view/848ef9df-ac3d-464d-912f-f5760b6cfbe9)
or directly download it as PDF [here](https://lucid.app/publicSegments/view/ddbebe1b-4bd3-46b8-9dfd-709b281c4b01).
[LucidChart](https://lucid.app/lucidchart/848ef9df-ac3d-464d-912f-f5760b6cfbe9/edit?viewport_loc=19%2C-867%2C1761%2C3019%2CnKP9V3Rhwz2T&invitationId=inv_e5a52469-f95f-414f-a78b-3416435fcb2d)
or directly download it as PDF [here](./docs/algorithm_structure/Pipeline%20-%20Main.pdf).
58 changes: 58 additions & 0 deletions configs/predict_images.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
name: "Predict-Images"
description: "Use the DGS tracker to track and predict given a directory of images as input."

device: "cuda"
print_prio: "DEBUG"
is_training: off
log_dir: "./results/own/dgs_predict/rcnn_images/"

test:
inactivity_threshold: 15
max_track_length: 1
save_images: on
show_keypoints: on
show_skeleton: on
draw_kwargs:
bbox_font_size: 90
bbox_width: 8

# #### #
# DATA #
# #### #

dataloader_test:
module_name: "ImageRCNN"
dataset_path: "./data/"
path: "./test/"
batch_size: 1 # fixme
threshold: 0.75

# ####### #
# MODULES #
# ####### #

dgs:
module_name: "DGS"
combine: "combine_similarities"
names: ["visual_similarity", "pose_similarity", "box_similarity"]

combine_similarities:
module_name: "constant_alpha"
alpha: [0.5, 0.25, 0.25]

visual_similarity:
module_name: "torchreid"
similarity: "NegSoftmaxEuclideanDist"
embedding_generator_path: ["visual_similarity", "vis_emb_gen"]
vis_emb_gen:
module_name: "torchreid"
nof_classes: 1000
weights: "pretrained"
model_name: "osnet_x1_0"

pose_similarity:
module_name: "oks"
format: "coco"

box_similarity:
module_name: "iou"
14 changes: 9 additions & 5 deletions configs/predict_video.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ description: "Use the DGS tracker to track and predict a video input."
device: "cuda"
print_prio: "DEBUG"
is_training: off
log_dir: "./results/own/dgs_predict/rcnn/"
log_dir: "./results/own/dgs_predict/rcnn_video/"

test:
inactivity_threshold: 15
max_track_length: 1
save_images: on
show_keypoints: off
show_skeleton: off
show_keypoints: on
show_skeleton: on
draw_kwargs:
bbox_font_size: 60
bbox_width: 5
Expand Down Expand Up @@ -42,9 +42,13 @@ combine_similarities:

visual_similarity:
module_name: "torchreid"
model_name: "osnet_x1_0"
similarity: "NegSoftmaxEuclideanDist"
weights: "pretrained"
embedding_generator_path: ["visual_similarity", "vis_emb_gen"]
vis_emb_gen:
module_name: "torchreid"
nof_classes: 1000
weights: "pretrained"
model_name: "osnet_x1_0"

pose_similarity:
module_name: "oks"
Expand Down
6 changes: 5 additions & 1 deletion dgs/default_values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,8 @@ dgs_engine:

backbone:
kprcnn:
threshold: 0.3
threshold: 0.3

similarity:
torchreid:
compute_softmax: true
4 changes: 2 additions & 2 deletions dgs/models/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,8 @@ class VideoDataset(BaseDataset, ABC):
Params
------
path (:obj:`.FilePath`)
path (:obj:`.FilePath`):
A single path to a video file.
Optional Params
---------------
Expand Down
71 changes: 45 additions & 26 deletions dgs/models/dataset/keypoint_rcnn.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
"""
Use 'keypointrcnn_resnet50_fpn' from PyTorch.
Use :func:`.keypointrcnn_resnet50_fpn` to predict the key points and bounding boxes of each image.
References:
https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html#torchvision.models.detection.keypointrcnn_resnet50_fpn
https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html
"""

import os
from abc import ABC

import torch
from torch.nn import Module as TorchModule
from torch import nn
from torchvision import tv_tensors as tvte
from torchvision.io import VideoReader
from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights
Expand All @@ -34,11 +34,23 @@
}


class KeypointRCNNBackbone(BaseDataset, TorchModule, ABC):
class KeypointRCNNBackbone(BaseDataset, nn.Module, ABC):
"""Abstract base class for the torchvision Key Point RCNN backbone model.
This class sets up the RCNN model and validates and sets the basic modules parameters.
Params
------
threshold (float):
Detections with a score lower than the threshold will be ignored.
Default `DEF_CONF.backbone.kprcnn.threshold`.
"""

def __init__(self, config: Config, path: NodePath) -> None:
BaseDataset.__init__(self, config, path)
TorchModule.__init__(self)
nn.Module.__init__(self)

self.validate_params(rcnn_validations)

Expand All @@ -49,8 +61,16 @@ def __init__(self, config: Config, path: NodePath) -> None:
self.model.eval()
self.model.to(self.device)

def outputs_to_states(self, outputs: list[dict], images: Images) -> State:
""""""
def images_to_states(self, images: Images) -> State:
"""Given a list of images, use the key-point-RCNN model to predict key points and bounding boxes,
then create a :class:`State` containing the available information.
Notes:
Does not add the original image to the new State, to reduce memory / GPU usage.
"""

outputs = self.model(images)

states = []
canvas_size = (max(i.shape[-2] for i in images), max(i.shape[-1] for i in images))

Expand All @@ -67,10 +87,9 @@ def outputs_to_states(self, outputs: list[dict], images: Images) -> State:
.reshape((-1, 17, 3))
.split([2, 1], dim=-1)
)
new_images = [tvte.Image(image.unsqueeze(0)) for _ in range(len(bbox))]

crops, loc_kps = extract_crops_from_images(
imgs=new_images,
imgs=[tvte.Image(image.unsqueeze(0)) for _ in range(len(bbox))],
bboxes=bbox,
kps=kps,
crop_size=self.params.get("crop_size", DEF_CONF.images.crop_size),
Expand All @@ -81,14 +100,15 @@ def outputs_to_states(self, outputs: list[dict], images: Images) -> State:
crops = tvte.wrap(crops.unsqueeze(0), like=crops)

data = {
"validate": True, # fixme remove
# "validate": False,
# "validate": True, # fixme remove
"validate": False,
"bbox": bbox,
"image_crop": crops,
"keypoints": kps,
"keypoints_local": loc_kps,
"joint_weight": vis,
"scores": output["scores"],
"skeleton_name": "coco",
}
states.append(State(**data))

Expand All @@ -101,7 +121,7 @@ class KeypointRCNNImageBackbone(KeypointRCNNBackbone):
Predicts 17 key-points (like COCO).
References:
https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html#torchvision.models.detection.keypointrcnn_resnet50_fpn
https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html
Params
------
Expand Down Expand Up @@ -132,7 +152,9 @@ def __init__(self, config: Config, path: NodePath) -> None:
self.data = []
path = self.params["path"]
if isinstance(path, list):
self.data = path
assert all(isinstance(p, str) for p in path), "Path is a list but not all values are string"
assert all(any(p.lower().endswith(end) for end in IMAGE_FORMATS) for p in path), "Not all values are images"
self.data = sorted(path)
elif isinstance(path, str):
path = self.get_path_in_dataset(path)
if is_file(path):
Expand All @@ -148,40 +170,39 @@ def __init__(self, config: Config, path: NodePath) -> None:
# directory of images
self.data = [
os.path.normpath(os.path.join(path, child_path))
for child_path in tqdm(os.listdir(path), desc="Loading images", total=len(os.listdir(path)))
for child_path in tqdm(sorted(os.listdir(path)), desc="Loading images", total=len(os.listdir(path)))
if any(child_path.lower().endswith(ending) for ending in IMAGE_FORMATS)
]
else:
raise ValueError(f"string is neither file nor dir. Got '{path}'.") # pragma: no cover
raise NotImplementedError(f"string is neither file nor dir. Got '{path}'.")
else:
raise TypeError(
raise NotImplementedError(
f"Unknown path object, expected filepath, dirpath, or list of filepaths. Got {type(path)}"
) # pragma: no cover
)

def arbitrary_to_ds(self, a: FilePath, idx: int) -> State:
"""Given a filepath, predict the bounding boxes and key-points of the respective image.
Return a State containing all the available information.
Because the filepath is known and stored in the State, the image itself is not saved in the State, to reduce the memory overhead on the GPU.
"""
# the torch model expects a 3D image
images = [convert_image_dtype(tvte.Image(load_image(a), device=self.device), dtype=torch.float32)]

outputs = self.model(images)
# the torch model expects a list of 3D images
images = [convert_image_dtype(tvte.Image(load_image(a).squeeze(0), device=self.device), dtype=torch.float32)]

s = self.outputs_to_states(outputs=outputs, images=images)
s = self.images_to_states(images=images)

s.filepath = tuple(a for _ in range(len(s)))

return s


# pylint: disable=too-many-ancestors
class KeypointRCNNVideoBackbone(KeypointRCNNBackbone, VideoDataset):
"""A Dataset that gets the path to a single Video file and predicts the bounding boxes and key points of the Video.
Predicts 17 key-points (like COCO).
References:
https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html#torchvision.models.detection.keypointrcnn_resnet50_fpn
https://pytorch.org/vision/0.17/models/generated/torchvision.models.detection.keypointrcnn_resnet50_fpn.html
Params
------
Expand Down Expand Up @@ -215,9 +236,7 @@ def arbitrary_to_ds(self, a: Image, idx: int) -> State:
# the torch RCNN model expects a list of 3D images
images = [convert_image_dtype(a, torch.float32)]

outputs = self.model(images)

s = self.outputs_to_states(outputs=outputs, images=images)
s = self.images_to_states(images=images)

s.image = [a.unsqueeze(0) for _ in range(len(s))]

Expand Down
53 changes: 53 additions & 0 deletions dgs/models/embedding_generator/torchreid.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from dgs.models.embedding_generator.embedding_generator import EmbeddingGeneratorModule
from dgs.utils.config import get_sub_config, insert_into_config
from dgs.utils.exceptions import InvalidPathException
from dgs.utils.files import to_abspath
from dgs.utils.state import State
from dgs.utils.torchtools import configure_torch_module, load_pretrained_weights
Expand Down Expand Up @@ -77,6 +78,8 @@ class TorchreidEmbeddingGenerator(EmbeddingGeneratorModule):
model: nn.Module

def __init__(self, config, path):
if path is None:
raise InvalidPathException("path is required but got None")
sub_cfg = get_sub_config(config, path)
if "embedding_size" in sub_cfg and sub_cfg["embedding_size"] != 512:
warnings.warn(
Expand Down Expand Up @@ -105,6 +108,56 @@ def _init_model(self, pretrained: bool) -> nn.Module:
# send model to the device
return self.configure_torch_module(m, train=False)

def predict_embeddings(self, data: torch.Tensor) -> torch.Tensor:
    """Predict embeddings given some input.

    Torchreid returns embeddings during eval and ids during training.
    During model building, triplet loss was forced for torchreid models,
    therefore the model either returns a single tensor (only the embeddings)
    or a pair of ``(ids, embeddings)``.

    Args:
        data: The input for the model, most likely a cropped image.

    Returns:
        Tensor containing a batch B of embeddings.
        Shape: ``[B x E]``

    Raises:
        NotImplementedError: If the model output is neither a single tensor
            nor a pair of two values.
    """
    results = self.model(data)

    if isinstance(results, torch.Tensor):
        # Only one return value means that only the embeddings are returned.
        return results

    try:
        _, embeddings = results
    except (TypeError, ValueError) as err:
        # TypeError: the output cannot be unpacked at all (e.g. ``None`` or a scalar).
        # ValueError: the output does not contain exactly two values.
        raise NotImplementedError("Unknown torchreid model output.") from err
    return embeddings

def predict_ids(self, data: torch.Tensor) -> torch.Tensor:
    """Predict class IDs given some input.

    Torchreid returns embeddings during eval and ids during training.
    If the model returns a single tensor, it contains only the embeddings,
    which are then passed through the classifier of the model to obtain the ids.
    If the model returns a pair, it is unpacked as ``(ids, embeddings)``.

    Args:
        data: The input for the model, most likely a cropped image.

    Returns:
        Tensor containing class predictions, which are not necessarily a probability distribution.
        Shape: ``[B x num_classes]``

    Raises:
        NotImplementedError: If the model output is neither a single tensor
            nor a pair of two values.
    """
    results = self.model(data)

    if isinstance(results, torch.Tensor):
        # Only the embeddings were returned, compute the ids using the model's classifier.
        return self.model.classifier(results)

    try:
        ids, _ = results
    except (TypeError, ValueError) as err:
        # TypeError: the output cannot be unpacked at all (e.g. ``None`` or a scalar).
        # ValueError: the output does not contain exactly two values.
        raise NotImplementedError("Unknown torchreid model output.") from err
    return ids

def forward(self, ds: State) -> torch.Tensor:
"""Predict embeddings given some input.
Expand Down
Loading

0 comments on commit 63559d2

Please sign in to comment.