From 9742491d926ca173fa2c3ffce06b0cc671e9d1ab Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Wed, 13 Sep 2023 19:06:07 +0800 Subject: [PATCH 1/4] [Fix] fix vitpose pretrained ckpts (#2687) --- .../coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py index 9732371787..5a55780505 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_base.pth'), + 'v1/pretrained_models/mae_pretrain_vit_base_20230913.pth'), ), neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), head=dict( diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py index fc08c61dff..06522b7b91 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_base.pth'), + 'v1/pretrained_models/mae_pretrain_vit_base_20230913.pth'), ), head=dict( type='HeatmapHead', diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py index 7d94f97c1b..03ae669807 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_huge.pth'), + 'v1/pretrained_models/mae_pretrain_vit_huge_20230913.pth'), ), neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), head=dict( diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py index 4aa2c21c1f..6b8afcf0f4 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 
'v1/pretrained_models/mae_pretrain_vit_huge.pth'), + 'v1/pretrained_models/mae_pretrain_vit_huge_20230913.pth'), ), head=dict( type='HeatmapHead', diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py index cf875d5167..2035e786df 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_large.pth'), + 'v1/pretrained_models/mae_pretrain_vit_large_20230913.pth'), ), neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), head=dict( diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py index 5ba6eafb4b..f1d0e90578 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_large.pth'), + 'v1/pretrained_models/mae_pretrain_vit_large_20230913.pth'), ), head=dict( type='HeatmapHead', diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py index 88bd3e43e3..d8216089b7 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py @@ -76,7 +76,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_small.pth'), + 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'), ), neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), head=dict( diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py index 791f9b5945..5b77da96eb 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py @@ -76,7 +76,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_small.pth'), + 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'), ), head=dict( type='HeatmapHead', From 7bea17cdf362cdd731fc5e79b982e4b79be5cca9 Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Thu, 14 Sep 2023 14:05:03 +0800 Subject: [PATCH 2/4] [Refactor] Refactor YOLOX-Pose into mmpose core package (#2620) --- configs/body_2d_keypoint/yoloxpose/README.md | 22 + .../yoloxpose/coco/yoloxpose_coco.md | 59 ++ .../yoloxpose/coco/yoloxpose_coco.yml | 72 ++ .../coco/yoloxpose_l_8xb32-300e_coco-640.py | 17 + .../coco/yoloxpose_m_8xb32-300e_coco-640.py | 16 + 
.../coco/yoloxpose_s_8xb32-300e_coco-640.py | 266 +++++++ .../yoloxpose_tiny_4xb64-300e_coco-416.py | 77 ++ docs/src/papers/algorithms/yolopose.md | 30 + mmpose/codecs/__init__.py | 3 +- mmpose/codecs/annotation_processors.py | 92 +++ mmpose/datasets/dataset_wrappers.py | 8 + .../datasets/base/base_coco_style_dataset.py | 27 +- .../datasets/datasets/body/jhmdb_dataset.py | 4 + mmpose/datasets/datasets/body/mpii_dataset.py | 8 + mmpose/datasets/transforms/__init__.py | 12 +- .../transforms/bottomup_transforms.py | 184 +++-- .../datasets/transforms/common_transforms.py | 177 +++++ .../datasets/transforms/mix_img_transforms.py | 501 ++++++++++++ mmpose/engine/__init__.py | 1 + mmpose/engine/hooks/__init__.py | 7 +- mmpose/engine/hooks/mode_switch_hooks.py | 65 ++ mmpose/engine/hooks/sync_norm_hook.py | 41 + mmpose/engine/schedulers/__init__.py | 8 + mmpose/engine/schedulers/quadratic_warmup.py | 131 +++ mmpose/evaluation/functional/__init__.py | 5 +- mmpose/evaluation/functional/nms.py | 41 + mmpose/evaluation/metrics/coco_metric.py | 24 +- mmpose/models/backbones/__init__.py | 5 +- mmpose/models/backbones/csp_darknet.py | 286 +++++++ mmpose/models/backbones/cspnext.py | 195 +++++ mmpose/models/data_preprocessors/__init__.py | 3 +- .../data_preprocessors/batch_augmentation.py | 115 +++ .../data_preprocessors/data_preprocessor.py | 90 +++ mmpose/models/heads/hybrid_heads/__init__.py | 3 +- .../heads/hybrid_heads/yoloxpose_head.py | 752 ++++++++++++++++++ mmpose/models/losses/__init__.py | 7 +- mmpose/models/losses/bbox_loss.py | 72 ++ mmpose/models/losses/classification_loss.py | 22 +- mmpose/models/losses/regression_loss.py | 123 ++- mmpose/models/necks/__init__.py | 5 +- mmpose/models/necks/channel_mapper.py | 106 +++ mmpose/models/necks/yolox_pafpn.py | 156 ++++ mmpose/models/pose_estimators/base.py | 15 +- mmpose/models/pose_estimators/bottomup.py | 13 +- mmpose/models/task_modules/__init__.py | 3 + .../models/task_modules/assigners/__init__.py | 5 + .../assigners/metric_calculators.py | 108 +++ .../assigners/sim_ota_assigner.py | 284 +++++++ .../task_modules/prior_generators/__init__.py | 2 + .../prior_generators/mlvl_point_generator.py | 245 ++++++ mmpose/models/utils/__init__.py | 4 +- mmpose/models/utils/csp_layer.py | 273 +++++++ mmpose/models/utils/misc.py | 76 ++ mmpose/registry.py | 6 +- mmpose/structures/__init__.py | 13 +- mmpose/structures/bbox/__init__.py | 12 +- mmpose/structures/bbox/bbox_overlaps.py | 117 +++ mmpose/structures/bbox/transforms.py | 157 +++- mmpose/structures/keypoint/__init__.py | 7 +- mmpose/structures/keypoint/transforms.py | 30 + mmpose/utils/__init__.py | 3 +- mmpose/utils/dist_utils.py | 11 + mmpose/utils/tensor_utils.py | 3 + model-index.yml | 1 + projects/yolox_pose/README.md | 10 +- projects/yolox_pose/datasets/__init__.py | 10 + projects/yolox_pose/models/__init__.py | 10 + .../test_codecs/test_annotation_processors.py | 35 + .../test_body_datasets/test_aic_dataset.py | 1 + .../test_body_datasets/test_coco_dataset.py | 1 + .../test_crowdpose_dataset.py | 1 + .../test_body_datasets/test_jhmdb_dataset.py | 1 + .../test_body_datasets/test_mhp_dataset.py | 1 + .../test_body_datasets/test_mpii_dataset.py | 1 + .../test_posetrack18_dataset.py | 1 + .../test_transforms/test_common_transforms.py | 139 +++- .../test_transforms/test_mix_img_transform.py | 115 +++ .../test_hooks/test_mode_switch_hooks.py | 67 ++ .../test_hooks/test_sync_norm_hook.py | 44 + .../test_schedulers/test_quadratic_warmup.py | 108 +++ .../test_functional/test_nms.py | 21 +- 
.../test_backbones/test_csp_darknet.py | 125 +++ .../test_data_preprocessor.py | 135 ++++ .../test_necks/test_yolox_pafpn.py | 30 + .../test_bbox/test_bbox_overlaps.py | 75 ++ .../test_bbox/test_bbox_transforms.py | 126 +++ .../test_keypoint/test_keypoint_transforms.py | 57 ++ 87 files changed, 6236 insertions(+), 104 deletions(-) create mode 100644 configs/body_2d_keypoint/yoloxpose/README.md create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py create mode 100644 docs/src/papers/algorithms/yolopose.md create mode 100644 mmpose/codecs/annotation_processors.py create mode 100644 mmpose/datasets/transforms/mix_img_transforms.py create mode 100644 mmpose/engine/hooks/mode_switch_hooks.py create mode 100644 mmpose/engine/hooks/sync_norm_hook.py create mode 100644 mmpose/engine/schedulers/__init__.py create mode 100644 mmpose/engine/schedulers/quadratic_warmup.py create mode 100644 mmpose/models/backbones/csp_darknet.py create mode 100644 mmpose/models/backbones/cspnext.py create mode 100644 mmpose/models/data_preprocessors/batch_augmentation.py create mode 100644 mmpose/models/heads/hybrid_heads/yoloxpose_head.py create mode 100644 mmpose/models/losses/bbox_loss.py create mode 100644 mmpose/models/necks/channel_mapper.py create mode 100644 mmpose/models/necks/yolox_pafpn.py create mode 100644 mmpose/models/task_modules/__init__.py create mode 100644 mmpose/models/task_modules/assigners/__init__.py create mode 100644 mmpose/models/task_modules/assigners/metric_calculators.py create mode 100644 mmpose/models/task_modules/assigners/sim_ota_assigner.py create mode 100644 mmpose/models/task_modules/prior_generators/__init__.py create mode 100644 mmpose/models/task_modules/prior_generators/mlvl_point_generator.py create mode 100644 mmpose/models/utils/csp_layer.py create mode 100644 mmpose/models/utils/misc.py create mode 100644 mmpose/structures/bbox/bbox_overlaps.py create mode 100644 mmpose/utils/dist_utils.py create mode 100644 tests/test_codecs/test_annotation_processors.py create mode 100644 tests/test_datasets/test_transforms/test_mix_img_transform.py create mode 100644 tests/test_engine/test_hooks/test_mode_switch_hooks.py create mode 100644 tests/test_engine/test_hooks/test_sync_norm_hook.py create mode 100644 tests/test_engine/test_schedulers/test_quadratic_warmup.py create mode 100644 tests/test_models/test_backbones/test_csp_darknet.py create mode 100644 tests/test_models/test_data_preprocessors/test_data_preprocessor.py create mode 100644 tests/test_models/test_necks/test_yolox_pafpn.py create mode 100644 tests/test_structures/test_bbox/test_bbox_overlaps.py create mode 100644 tests/test_structures/test_bbox/test_bbox_transforms.py create mode 100644 tests/test_structures/test_keypoint/test_keypoint_transforms.py diff --git a/configs/body_2d_keypoint/yoloxpose/README.md b/configs/body_2d_keypoint/yoloxpose/README.md new file mode 100644 index 0000000000..8195b1e236 --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/README.md @@ -0,0 +1,22 @@ +# YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using 
Object Keypoint Similarity Loss + + + +
+YOLO-Pose (CVPRW'2022) + +```bibtex +@inproceedings{maji2022yolo, + title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss}, + author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2637--2646}, + year={2022} +} +``` + +
+ +YOLO-Pose is a bottom-up pose estimation approach that simultaneously detects all person instances and regresses keypoint locations in a single pass. + +We implement **YOLOX-Pose** based on the **YOLOX** object detection framework, inheriting the benefits of unified pose estimation and object detection from YOLO-Pose. To predict keypoint locations more accurately, separate branches with adaptive convolutions are used to regress the offsets for different joints. This allows the feature extraction to be optimized for each keypoint. diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md new file mode 100644 index 0000000000..264673d53d --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md @@ -0,0 +1,59 @@ + +
+YOLO-Pose (CVPRW'2022) + +```bibtex +@inproceedings{maji2022yolo, + title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss}, + author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2637--2646}, + year={2022} +} +``` + +
+ + + +
+YOLOX + +```bibtex +@article{ge2021yolox, + title={Yolox: Exceeding yolo series in 2021}, + author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2107.08430}, + year={2021} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [yoloxpose_tiny](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py) | 416x416 | 0.527 | 0.794 | 0.557 | 0.577 | 0.843 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-76eb44ca_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-20230829.json) | +| [yoloxpose_s](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py) | 640x640 | 0.642 | 0.873 | 0.702 | 0.688 | 0.912 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-20230829.json) | +| [yoloxpose_m](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py) | 640x640 | 0.697 | 0.903 | 0.766 | 0.739 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-84e9a538_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-20230829.json) | +| [yoloxpose_l](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py) | 640x640 | 0.714 | 0.906 | 0.785 | 0.756 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-de0f8dee_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-20230829.json) | diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml new file mode 100644 index 0000000000..cd745f39a2 --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml @@ -0,0 +1,72 @@ +Collections: +- Name: YOLOXPose + Paper: + Title: 'YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss' + URL: https://arxiv.org/abs/2204.06806 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/yolopose.md +Models: +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py + In Collection: YOLOXPose + Metadata: + Architecture: &id001 + - YOLOXPose + Training Data: COCO + Name: yoloxpose_tiny_4xb64-300e_coco-416 + Results: + - Dataset: COCO + Metrics: + AP: 0.527 + AP@0.5: 0.794 + AP@0.75: 0.557 + AR: 0.577 + AR@0.5: 0.843 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-76eb44ca_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_s_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.642 + AP@0.5: 0.873 + AP@0.75: 0.702 + AR: 0.688 + AR@0.5: 0.912 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_m_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.697 + AP@0.5: 0.903 + AP@0.75: 0.766 + AR: 0.739 + AR@0.5: 0.933 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-84e9a538_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_l_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.714 + AP@0.5: 0.906 + AP@0.75: 0.785 + AR: 0.756 + AR@0.5: 0.934 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-de0f8dee_20230829.pth diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py new file mode 100644 index 0000000000..95a012bd6b --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py @@ -0,0 +1,17 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +widen_factor = 1 +deepen_factor = 1 +checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_' \ 'l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth' + +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict( + in_channels=[256, 512, 1024], out_channels=256, num_csp_blocks=3), + head=dict(head_module_cfg=dict(widen_factor=widen_factor))) diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py new file mode 100644 index 0000000000..06eb0322e4 --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py @@ -0,0 +1,16 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +widen_factor = 0.75 +deepen_factor = 0.67 +checkpoint = 'https://download.openmmlab.com/mmpose/v1/pretrained_models/' \ 'yolox_m_8x8_300e_coco_20230829.pth' + +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + head=dict(head_module_cfg=dict(widen_factor=widen_factor))) diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py new file mode 100644 index 0000000000..635d243397 --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py @@ -0,0 +1,266 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict( + _delete_=True, + type='EpochBasedTrainLoop', + max_epochs=300, + val_interval=10, + dynamic_intervals=[(280, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.004, 
weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + ), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=280, end=300), +] + +# model +widen_factor = 0.5 +deepen_factor = 0.33 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_' + '20211121_095711-4592a793.pth', + prefix='backbone.', + )), + neck=dict( + type='YOLOXPAFPN', + in_channels=[128, 256, 512], + out_channels=128, + num_csp_blocks=1, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + head=dict( + type='YOLOXPoseHead', + num_keypoints=17, + featmap_strides=(8, 16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + feat_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + prior_generator=dict( + type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]), + assigner=dict(type='SimOTAAssigner', dynamic_k_indicator='oks'), + overlaps_power=0.5, + loss_cls=dict(type='BCELoss', reduction='sum', loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj=dict( + type='BCELoss', + use_target_weight=True, + reduction='sum', + loss_weight=1.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo='configs/_base_/datasets/coco.py', + norm_target_weight=True, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + score_thr=0.001, + nms_thr=0.65, + )) + +# data +input_size = (640, 640) +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', 
backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_coco) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py new file mode 100644 index 0000000000..f918e8b16f --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py @@ -0,0 +1,77 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +# model settings +widen_factor = 0.375 +deepen_factor = 0.33 +checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_' \ 'tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth' + +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=1), + ]), + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict( + in_channels=[96, 192, 384], + out_channels=96, + ), + head=dict(head_module_cfg=dict(widen_factor=widen_factor), )) + +# 
dataset settings +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=_base_.input_size, + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=_base_.input_size, + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=_base_.codec), + dict( + type='PackPoseInputs', + extra_mapping_labels={ + 'bbox': 'bboxes', + 'bbox_labels': 'labels', + 'keypoints': 'keypoints', + 'keypoints_visible': 'keypoints_visible', + 'area': 'areas' + }), +] +train_dataloader = dict( + batch_size=64, dataset=dict(pipeline=train_pipeline_stage1)) + +input_size = (416, 416) +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict(dataset=dict(pipeline=val_pipeline, )) +test_dataloader = val_dataloader diff --git a/docs/src/papers/algorithms/yolopose.md b/docs/src/papers/algorithms/yolopose.md new file mode 100644 index 0000000000..fe1f41a804 --- /dev/null +++ b/docs/src/papers/algorithms/yolopose.md @@ -0,0 +1,30 @@ +# YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss + + + +
+YOLO-Pose (CVPRW'2022) + +```bibtex +@inproceedings{maji2022yolo, + title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss}, + author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2637--2646}, + year={2022} +} +``` + +
+ +## Abstract + + + +We introduce YOLO-pose, a novel heatmap-free approach for joint detection, and 2D multi-person pose estimation in an image based on the popular YOLO object detection framework. Existing heatmap based two-stage approaches are sub-optimal as they are not end-to-end trainable and training relies on a surrogate L1 loss that is not equivalent to maximizing the evaluation metric, i.e. Object Keypoint Similarity (OKS). Our framework allows us to train the model end-to-end and optimize the OKS metric itself. The proposed model learns to jointly detect bounding boxes for multiple persons and their corresponding 2D poses in a single forward pass and thus bringing in the best of both top-down and bottom-up approaches. Proposed approach doesn't require the postprocessing of bottom-up approaches to group detected keypoints into a skeleton as each bounding box has an associated pose, resulting in an inherent grouping of the keypoints. Unlike top-down approaches, multiple forward passes are done away with since all persons are localized along with their pose in a single inference. YOLO-pose achieves new state-of-the-art results on COCO validation (90.2% AP50) and test-dev set (90.3% AP50), surpassing all existing bottom-up approaches in a single forward pass without flip test, multi-scale testing, or any other test time augmentation. All experiments and results reported in this paper are without any test time augmentation, unlike traditional approaches that use flip-test and multi-scale testing to boost performance. + + + +
+ +
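As background for the OKS-based components this patch adds (`OKSLoss` and the `oks` dynamic-k indicator in `SimOTAAssigner`), the Object Keypoint Similarity that the paper optimizes directly can be written in a few lines. The following is an illustrative NumPy sketch of the COCO definition, not code from this patch; `object_keypoint_similarity` and its argument layout are assumptions, and `sigmas` stands in for the per-keypoint constants from the COCO keypoint metainfo:

```python
import numpy as np

def object_keypoint_similarity(pred, gt, visible, area, sigmas):
    """COCO-style OKS between one predicted and one ground-truth pose.

    pred, gt: (K, 2) keypoint coordinates; visible: (K,) visibility flags;
    area: ground-truth object area; sigmas: (K,) per-keypoint constants.
    """
    d2 = np.sum((pred - gt) ** 2, axis=-1)   # squared distance per keypoint
    vars_ = (2 * sigmas) ** 2                # per-keypoint tolerance
    sim = np.exp(-d2 / (2 * area * vars_ + np.spacing(1)))
    mask = visible > 0                       # average over labeled keypoints
    return float(sim[mask].mean()) if mask.any() else 0.0
```

Because this expression is differentiable in `pred`, a loss of the form `1 - OKS` can be minimized end-to-end, which is the core idea behind the `OKSLoss` used in the configs below.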
diff --git a/mmpose/codecs/__init__.py b/mmpose/codecs/__init__.py index 1a48b7f851..102a202e7d 100644 --- a/mmpose/codecs/__init__.py +++ b/mmpose/codecs/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .annotation_processors import YOLOXPoseAnnotationProcessor from .associative_embedding import AssociativeEmbedding from .decoupled_heatmap import DecoupledHeatmap from .image_pose_lifting import ImagePoseLifting @@ -16,5 +17,5 @@ 'MSRAHeatmap', 'MegviiHeatmap', 'UDPHeatmap', 'RegressionLabel', 'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR', 'DecoupledHeatmap', 'VideoPoseLifting', 'ImagePoseLifting', - 'MotionBERTLabel' + 'MotionBERTLabel', 'YOLOXPoseAnnotationProcessor' ] diff --git a/mmpose/codecs/annotation_processors.py b/mmpose/codecs/annotation_processors.py new file mode 100644 index 0000000000..7add52c420 --- /dev/null +++ b/mmpose/codecs/annotation_processors.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple + +import numpy as np + +from mmpose.registry import KEYPOINT_CODECS +from .base import BaseKeypointCodec + +INF = 1e6 +NEG_INF = -1e6 + + +class BaseAnnotationProcessor(BaseKeypointCodec): + """Base class for annotation processors.""" + + def decode(self, *args, **kwargs): + pass + + +@KEYPOINT_CODECS.register_module() +class YOLOXPoseAnnotationProcessor(BaseAnnotationProcessor): + """Convert dataset annotations to the input format of YOLOX-Pose. + + This processor expands bounding boxes and converts category IDs to labels. + + Args: + expand_bbox (bool, optional): Whether to expand the bounding box + to include all keypoints. Defaults to False. + input_size (tuple, optional): The size of the input image for the + model, formatted as (h, w). This argument is required by the + codec in deployment but is not actually used. + """ + + auxiliary_encode_keys = {'category_id', 'bbox'} + instance_mapping_table = dict( + bbox='bboxes', + bbox_labels='labels', + keypoints='keypoints', + keypoints_visible='keypoints_visible', + area='areas', + ) + + def __init__(self, + expand_bbox: bool = False, + input_size: Optional[Tuple] = None): + super().__init__() + self.expand_bbox = expand_bbox + + def encode(self, + keypoints: Optional[np.ndarray] = None, + keypoints_visible: Optional[np.ndarray] = None, + bbox: Optional[np.ndarray] = None, + category_id: Optional[List[int]] = None + ) -> Dict[str, np.ndarray]: + """Encode keypoints, bounding boxes, and category IDs. + + Args: + keypoints (np.ndarray, optional): Keypoints array. Defaults + to None. + keypoints_visible (np.ndarray, optional): Visibility array for + keypoints. Defaults to None. + bbox (np.ndarray, optional): Bounding box array. Defaults to None. + category_id (List[int], optional): List of category IDs. Defaults + to None. + + Returns: + Dict[str, np.ndarray]: Encoded annotations. 
+ """ + results = {} + + if self.expand_bbox and bbox is not None: + # Handle keypoints visibility + if keypoints_visible.ndim == 3: + keypoints_visible = keypoints_visible[..., 0] + + # Expand bounding box to include keypoints + kpts_min = keypoints.copy() + kpts_min[keypoints_visible == 0] = INF + bbox[..., :2] = np.minimum(bbox[..., :2], kpts_min.min(axis=1)) + + kpts_max = keypoints.copy() + kpts_max[keypoints_visible == 0] = NEG_INF + bbox[..., 2:] = np.maximum(bbox[..., 2:], kpts_max.max(axis=1)) + + results['bbox'] = bbox + + if category_id is not None: + # Convert category IDs to labels + bbox_labels = np.array(category_id).astype(np.int8) - 1 + results['bbox_labels'] = bbox_labels + + return results diff --git a/mmpose/datasets/dataset_wrappers.py b/mmpose/datasets/dataset_wrappers.py index 553191fd43..48bb3fc2a4 100644 --- a/mmpose/datasets/dataset_wrappers.py +++ b/mmpose/datasets/dataset_wrappers.py @@ -109,6 +109,11 @@ def prepare_data(self, idx: int) -> Any: data_info = self.get_data_info(idx) + # the assignment of 'dataset' should not be performed within the + # `get_data_info` function. Otherwise, it can lead to the mixed + # data augmentation process getting stuck. + data_info['dataset'] = self + return self.pipeline(data_info) def get_data_info(self, idx: int) -> dict: @@ -123,6 +128,9 @@ def get_data_info(self, idx: int) -> dict: # Get data sample processed by ``subset.pipeline`` data_info = self.datasets[subset_idx][sample_idx] + if 'dataset' in data_info: + data_info.pop('dataset') + # Add metainfo items that are required in the pipeline and the model metainfo_keys = [ 'upper_body_ids', 'lower_body_ids', 'flip_pairs', diff --git a/mmpose/datasets/datasets/base/base_coco_style_dataset.py b/mmpose/datasets/datasets/base/base_coco_style_dataset.py index f0032aef9e..e0f00de5dc 100644 --- a/mmpose/datasets/datasets/base/base_coco_style_dataset.py +++ b/mmpose/datasets/datasets/base/base_coco_style_dataset.py @@ -2,7 +2,7 @@ import copy import os.path as osp from copy import deepcopy -from itertools import filterfalse, groupby +from itertools import chain, filterfalse, groupby from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import numpy as np @@ -159,6 +159,14 @@ def prepare_data(self, idx) -> Any: """ data_info = self.get_data_info(idx) + # Mixed image transformations require multiple source images for + # effective blending. Therefore, we assign the 'dataset' field in + # `data_info` to provide these auxiliary images. + # Note: The 'dataset' assignment should not occur within the + # `get_data_info` function, as doing so may cause the mixed image + # transformations to stall or hang. 
+ data_info['dataset'] = self + return self.pipeline(data_info) def get_data_info(self, idx: int) -> dict: @@ -288,6 +296,12 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: else: num_keypoints = np.count_nonzero(keypoints.max(axis=2)) + if 'area' in ann: + area = np.array(ann['area'], dtype=np.float32) + else: + area = np.clip((x2 - x1) * (y2 - y1) * 0.53, a_min=1.0, a_max=None) + area = np.array(area, dtype=np.float32) + data_info = { 'img_id': ann['image_id'], 'img_path': img['img_path'], @@ -296,10 +310,11 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: 'num_keypoints': num_keypoints, 'keypoints': keypoints, 'keypoints_visible': keypoints_visible, + 'area': area, 'iscrowd': ann.get('iscrowd', 0), 'segmentation': ann.get('segmentation', None), 'id': ann['id'], - 'category_id': ann['category_id'], + 'category_id': np.array(ann['category_id']), # store the raw annotation of the instance # it is useful for evaluation without providing ann_file 'raw_ann_info': copy.deepcopy(ann), @@ -365,7 +380,13 @@ def _get_bottomup_data_infos(self, instance_list: List[Dict], if key not in data_info_bu: seq = [d[key] for d in data_infos] if isinstance(seq[0], np.ndarray): - seq = np.concatenate(seq, axis=0) + if seq[0].ndim > 0: + seq = np.concatenate(seq, axis=0) + else: + seq = np.stack(seq, axis=0) + elif isinstance(seq[0], (tuple, list)): + seq = list(chain.from_iterable(seq)) + data_info_bu[key] = seq # The segmentation annotation of invalid objects will be used diff --git a/mmpose/datasets/datasets/body/jhmdb_dataset.py b/mmpose/datasets/datasets/body/jhmdb_dataset.py index 7d72a7ddc5..940a4cd4dc 100644 --- a/mmpose/datasets/datasets/body/jhmdb_dataset.py +++ b/mmpose/datasets/datasets/body/jhmdb_dataset.py @@ -118,6 +118,8 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: keypoints_visible = np.minimum(1, _keypoints[..., 2]) num_keypoints = np.count_nonzero(keypoints.max(axis=2)) + area = np.clip((x2 - x1) * (y2 - y1) * 0.53, a_min=1.0, a_max=None) + category_id = ann.get('category_id', [1] * len(keypoints)) data_info = { 'img_id': ann['image_id'], @@ -127,9 +129,11 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: 'num_keypoints': num_keypoints, 'keypoints': keypoints, 'keypoints_visible': keypoints_visible, + 'area': np.array(area, dtype=np.float32), 'iscrowd': ann.get('iscrowd', 0), 'segmentation': ann.get('segmentation', None), 'id': ann['id'], + 'category_id': category_id, } return data_info diff --git a/mmpose/datasets/datasets/body/mpii_dataset.py b/mmpose/datasets/datasets/body/mpii_dataset.py index bdb3797a54..5490f6f0dd 100644 --- a/mmpose/datasets/datasets/body/mpii_dataset.py +++ b/mmpose/datasets/datasets/body/mpii_dataset.py @@ -184,6 +184,12 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: keypoints = np.array(ann['joints']).reshape(1, -1, 2) keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) + x1, y1, x2, y2 = np.split(bbox, axis=1, indices_or_sections=4) + area = np.clip((x2 - x1) * (y2 - y1) * 0.53, a_min=1.0, a_max=None) + area = area[..., 0].astype(np.float32) + + category_id = ann.get('category_id', [1] * len(bbox)) + instance_info = { 'id': ann_id, 'img_id': int(ann['image'].split('.')[0]), @@ -194,6 +200,8 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 'bbox_score': np.ones(1, dtype=np.float32), 'keypoints': keypoints, 'keypoints_visible': keypoints_visible, + 'area': area, + 'category_id': category_id, } if self.headbox_file: diff --git 
a/mmpose/datasets/transforms/__init__.py b/mmpose/datasets/transforms/__init__.py index 7ccbf7dac2..46ca6c749e 100644 --- a/mmpose/datasets/transforms/__init__.py +++ b/mmpose/datasets/transforms/__init__.py @@ -1,13 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. from .bottomup_transforms import (BottomupGetHeatmapMask, BottomupRandomAffine, BottomupResize) -from .common_transforms import (Albumentation, GenerateTarget, - GetBBoxCenterScale, PhotometricDistortion, - RandomBBoxTransform, RandomFlip, - RandomHalfBody) +from .common_transforms import (Albumentation, FilterAnnotations, + GenerateTarget, GetBBoxCenterScale, + PhotometricDistortion, RandomBBoxTransform, + RandomFlip, RandomHalfBody, YOLOXHSVRandomAug) from .converting import KeypointConverter from .formatting import PackPoseInputs from .loading import LoadImage +from .mix_img_transforms import Mosaic, YOLOXMixUp from .pose3d_transforms import RandomFlipAroundRoot from .topdown_transforms import TopdownAffine @@ -16,5 +17,6 @@ 'RandomHalfBody', 'TopdownAffine', 'Albumentation', 'PhotometricDistortion', 'PackPoseInputs', 'LoadImage', 'BottomupGetHeatmapMask', 'BottomupRandomAffine', 'BottomupResize', - 'GenerateTarget', 'KeypointConverter', 'RandomFlipAroundRoot' + 'GenerateTarget', 'KeypointConverter', 'RandomFlipAroundRoot', + 'FilterAnnotations', 'YOLOXHSVRandomAug', 'YOLOXMixUp', 'Mosaic' ] diff --git a/mmpose/datasets/transforms/bottomup_transforms.py b/mmpose/datasets/transforms/bottomup_transforms.py index c31e0ae17d..5ef2fa5838 100644 --- a/mmpose/datasets/transforms/bottomup_transforms.py +++ b/mmpose/datasets/transforms/bottomup_transforms.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional, Tuple +from functools import partial +from typing import Dict, List, Optional, Tuple, Union import cv2 import numpy as np @@ -10,7 +11,10 @@ from scipy.stats import truncnorm from mmpose.registry import TRANSFORMS -from mmpose.structures.bbox import get_udp_warp_matrix, get_warp_matrix +from mmpose.structures.bbox import (bbox_clip_border, bbox_corner2xyxy, + bbox_xyxy2corner, get_pers_warp_matrix, + get_udp_warp_matrix, get_warp_matrix) +from mmpose.structures.keypoint import keypoint_clip_border @TRANSFORMS.register_module() @@ -31,6 +35,10 @@ class BottomupGetHeatmapMask(BaseTransform): - heatmap_mask """ + def __init__(self, get_invalid: bool = False): + super().__init__() + self.get_invalid = get_invalid + def _segs_to_mask(self, segs: list, img_shape: Tuple[int, int]) -> np.ndarray: """Calculate mask from object segmentations. 
@@ -83,10 +91,12 @@ def transform(self, results: Dict) -> Optional[dict]: invalid_segs = results.get('invalid_segs', []) img_shape = results['img_shape'] # (img_h, img_w) input_size = results['input_size'] + mask = self._segs_to_mask(invalid_segs, img_shape) - # Calculate the mask of the valid region by negating the segmentation - # mask of invalid objects - mask = 1 - self._segs_to_mask(invalid_segs, img_shape) + if not self.get_invalid: + # Calculate the mask of the valid region by negating the + # segmentation mask of invalid objects + mask = np.logical_not(mask) # Apply an affine transform to the mask if the image has been # transformed @@ -176,7 +186,7 @@ class BottomupRandomAffine(BaseTransform): """ def __init__(self, - input_size: Tuple[int, int], + input_size: Optional[Tuple[int, int]] = None, shift_factor: float = 0.2, shift_prob: float = 1., scale_factor: Tuple[float, float] = (0.75, 1.5), @@ -184,9 +194,21 @@ def __init__(self, scale_type: str = 'short', rotate_factor: float = 30., rotate_prob: float = 1, - use_udp: bool = False) -> None: + shear_factor: float = 2.0, + shear_prob: float = 1.0, + use_udp: bool = False, + pad_val: Union[float, Tuple[float]] = 0, + border: Tuple[int, int] = (0, 0), + distribution='trunc_norm', + transform_mode='affine', + bbox_keep_corner: bool = True, + clip_border: bool = False) -> None: super().__init__() + assert transform_mode in ('affine', 'affine_udp', 'perspective'), \ f'the argument transform_mode should be either \'affine\', ' \ f'\'affine_udp\' or \'perspective\', but got \'{transform_mode}\'' + self.input_size = input_size self.shift_factor = shift_factor self.shift_prob = shift_prob @@ -195,14 +217,39 @@ def __init__(self, self.scale_type = scale_type self.rotate_factor = rotate_factor self.rotate_prob = rotate_prob + self.shear_factor = shear_factor + self.shear_prob = shear_prob + self.use_udp = use_udp + self.distribution = distribution + self.clip_border = clip_border + self.bbox_keep_corner = bbox_keep_corner - @staticmethod - def _truncnorm(low: float = -1., - high: float = 1., - size: tuple = ()) -> np.ndarray: - """Sample from a truncated normal distribution.""" - return truncnorm.rvs(low, high, size=size).astype(np.float32) + self.transform_mode = transform_mode + + if isinstance(pad_val, (int, float)): + pad_val = (pad_val, pad_val, pad_val) + + if 'affine' in transform_mode: + self._transform = partial( cv2.warpAffine, flags=cv2.INTER_LINEAR, borderValue=pad_val) + else: + self._transform = partial(cv2.warpPerspective, borderValue=pad_val) + + def _random(self, + low: float = -1., + high: float = 1., + size: tuple = ()) -> np.ndarray: + if self.distribution == 'trunc_norm': + # sample from a truncated normal distribution + return truncnorm.rvs(low, high, size=size).astype(np.float32) + elif self.distribution == 'uniform': + x = np.random.rand(*size) + return x * (high - low) + low + else: + raise ValueError(f'the argument `distribution` should be either ' f'\'trunc_norm\' or \'uniform\', but got ' f'{self.distribution}.') def _fix_aspect_ratio(self, scale: np.ndarray, aspect_ratio: float): """Extend the scale to match the given aspect ratio. 
@@ -243,7 +290,7 @@ def _get_transform_params(self) -> Tuple: """ # get offset if np.random.rand() < self.shift_prob: - offset = self._truncnorm(size=(2, )) * self.shift_factor + offset = self._random(size=(2, )) * self.shift_factor else: offset = np.zeros((2, ), dtype=np.float32) @@ -251,17 +298,24 @@ def _get_transform_params(self) -> Tuple: if np.random.rand() < self.scale_prob: scale_min, scale_max = self.scale_factor scale = scale_min + (scale_max - scale_min) * ( - self._truncnorm(size=(1, )) + 1) / 2 + self._random(size=(1, )) + 1) / 2 else: scale = np.ones(1, dtype=np.float32) # get rotation if np.random.rand() < self.rotate_prob: - rotate = self._truncnorm() * self.rotate_factor + rotate = self._random() * self.rotate_factor else: rotate = 0 - return offset, scale, rotate + # get shear + if 'perspective' in self.transform_mode and np.random.rand( + ) < self.shear_prob: + shear = self._random(size=(2, )) * self.shear_factor + else: + shear = np.zeros((2, ), dtype=np.float32) + + return offset, scale, rotate, shear def transform(self, results: Dict) -> Optional[dict]: """The transform function of :class:`BottomupRandomAffine` to perform @@ -277,45 +331,77 @@ def transform(self, results: Dict) -> Optional[dict]: dict: Result dict with images distorted. """ - img_h, img_w = results['img_shape'] + img_h, img_w = results['img_shape'][:2] w, h = self.input_size - offset_rate, scale_rate, rotate = self._get_transform_params() - offset = offset_rate * [img_w, img_h] - scale = scale_rate * [img_w, img_h] - # adjust the scale to match the target aspect ratio - scale = self._fix_aspect_ratio(scale, aspect_ratio=w / h) - - if self.use_udp: - center = np.array([(img_w - 1.0) / 2, (img_h - 1.0) / 2], - dtype=np.float32) - warp_mat = get_udp_warp_matrix( - center=center + offset, - scale=scale, - rot=rotate, - output_size=(w, h)) + offset_rate, scale_rate, rotate, shear = self._get_transform_params() + + if 'affine' in self.transform_mode: + offset = offset_rate * [img_w, img_h] + scale = scale_rate * [img_w, img_h] + # adjust the scale to match the target aspect ratio + scale = self._fix_aspect_ratio(scale, aspect_ratio=w / h) + + if self.transform_mode == 'affine_udp': + center = np.array([(img_w - 1.0) / 2, (img_h - 1.0) / 2], + dtype=np.float32) + warp_mat = get_udp_warp_matrix( + center=center + offset, + scale=scale, + rot=rotate, + output_size=(w, h)) + else: + center = np.array([img_w / 2, img_h / 2], dtype=np.float32) + warp_mat = get_warp_matrix( + center=center + offset, + scale=scale, + rot=rotate, + output_size=(w, h)) + else: - center = np.array([img_w / 2, img_h / 2], dtype=np.float32) - warp_mat = get_warp_matrix( - center=center + offset, - scale=scale, + offset = offset_rate * [w, h] + center = np.array([w / 2, h / 2], dtype=np.float32) + warp_mat = get_pers_warp_matrix( + center=center, + translate=offset, + scale=scale_rate[0], rot=rotate, - output_size=(w, h)) + shear=shear) # warp image and keypoints - results['img'] = cv2.warpAffine( - results['img'], warp_mat, (int(w), int(h)), flags=cv2.INTER_LINEAR) + results['img'] = self._transform(results['img'], warp_mat, + (int(w), int(h))) if 'keypoints' in results: # Only transform (x, y) coordinates - results['keypoints'][..., :2] = cv2.transform( - results['keypoints'][..., :2], warp_mat) + kpts = cv2.transform(results['keypoints'], warp_mat) + if kpts.shape[-1] == 3: + kpts = kpts[..., :2] / kpts[..., 2:3] + results['keypoints'] = kpts + + if self.clip_border: + results['keypoints'], results[ + 'keypoints_visible'] = 
keypoint_clip_border( + results['keypoints'], results['keypoints_visible'], + (w, h)) if 'bbox' in results: - bbox = np.tile(results['bbox'], 2).reshape(-1, 4, 2) - # corner order: left_top, left_bottom, right_top, right_bottom - bbox[:, 1:3, 0] = bbox[:, 0:2, 0] - results['bbox'] = cv2.transform(bbox, warp_mat).reshape(-1, 8) + bbox = bbox_xyxy2corner(results['bbox']) + bbox = cv2.transform(bbox, warp_mat) + if bbox.shape[-1] == 3: + bbox = bbox[..., :2] / bbox[..., 2:3] + if not self.bbox_keep_corner: + bbox = bbox_corner2xyxy(bbox) + if self.clip_border: + bbox = bbox_clip_border(bbox, (w, h)) + results['bbox'] = bbox + + if 'area' in results: + warp_mat_for_area = warp_mat + if warp_mat.shape[0] == 2: + aux_row = np.array([[0.0, 0.0, 1.0]], dtype=warp_mat.dtype) + warp_mat_for_area = np.concatenate((warp_mat, aux_row)) + results['area'] *= np.linalg.det(warp_mat_for_area) results['input_size'] = self.input_size results['warp_mat'] = warp_mat @@ -380,6 +466,7 @@ def __init__(self, aug_scales: Optional[List[float]] = None, size_factor: int = 32, resize_mode: str = 'fit', + pad_val: tuple = (0, 0, 0), use_udp: bool = False): super().__init__() @@ -388,6 +475,7 @@ def __init__(self, self.resize_mode = resize_mode self.size_factor = size_factor self.use_udp = use_udp + self.pad_val = pad_val @staticmethod def _ceil_to_multiple(size: Tuple[int, int], base: int): @@ -496,7 +584,11 @@ def transform(self, results: Dict) -> Optional[dict]: output_size=padded_input_size) _img = cv2.warpAffine( - img, warp_mat, padded_input_size, flags=cv2.INTER_LINEAR) + img, + warp_mat, + padded_input_size, + flags=cv2.INTER_LINEAR, + borderValue=self.pad_val) imgs.append(_img) diff --git a/mmpose/datasets/transforms/common_transforms.py b/mmpose/datasets/transforms/common_transforms.py index d8591ab094..98aed11683 100644 --- a/mmpose/datasets/transforms/common_transforms.py +++ b/mmpose/datasets/transforms/common_transforms.py @@ -3,6 +3,7 @@ from copy import deepcopy from typing import Dict, List, Optional, Sequence, Tuple, Union +import cv2 import mmcv import mmengine import numpy as np @@ -957,6 +958,7 @@ def transform(self, results: Dict) -> Optional[dict]: if keypoints_visible.ndim == 3 and keypoints_visible.shape[2] == 2: keypoints_visible, keypoints_visible_weights = \ keypoints_visible[..., 0], keypoints_visible[..., 1] + results['keypoints_visible'] = keypoints_visible results['keypoints_visible_weights'] = keypoints_visible_weights # Encoded items from the encoder(s) will be updated into the results. @@ -1074,3 +1076,178 @@ def __repr__(self) -> str: repr_str += ('use_dataset_keypoint_weights=' f'{self.use_dataset_keypoint_weights})') return repr_str + + +@TRANSFORMS.register_module() +class YOLOXHSVRandomAug(BaseTransform): + """Apply HSV augmentation to image sequentially. It is referenced from + https://github.com/Megvii- + BaseDetection/YOLOX/blob/main/yolox/data/data_augment.py#L21. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + hue_delta (int): delta of hue. Defaults to 5. + saturation_delta (int): delta of saturation. Defaults to 30. + value_delta (int): delta of value. Defaults to 30. 
+ """ + + def __init__(self, + hue_delta: int = 5, + saturation_delta: int = 30, + value_delta: int = 30) -> None: + self.hue_delta = hue_delta + self.saturation_delta = saturation_delta + self.value_delta = value_delta + + @cache_randomness + def _get_hsv_gains(self): + hsv_gains = np.random.uniform(-1, 1, 3) * [ + self.hue_delta, self.saturation_delta, self.value_delta + ] + # random selection of h, s, v + hsv_gains *= np.random.randint(0, 2, 3) + # prevent overflow + hsv_gains = hsv_gains.astype(np.int16) + return hsv_gains + + def transform(self, results: dict) -> dict: + img = results['img'] + hsv_gains = self._get_hsv_gains() + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16) + + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255) + cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img) + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class FilterAnnotations(BaseTransform): + """Eliminate undesirable annotations based on specific conditions. + + This class is designed to sift through annotations by examining multiple + factors such as the size of the bounding box, the visibility of keypoints, + and the overall area. Users can fine-tune the criteria to filter out + instances that have excessively small bounding boxes, insufficient area, + or an inadequate number of visible keypoints. + + Required Keys: + + - bbox (np.ndarray) (optional) + - area (np.int64) (optional) + - keypoints_visible (np.ndarray) (optional) + + Modified Keys: + + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Args: + min_gt_bbox_wh (tuple[float]): Minimum width and height of ground + truth boxes. Default: (1., 1.) + min_gt_area (int): Minimum foreground area of instances. + Default: 1 + min_kpt_vis (int): Minimum number of visible keypoints. Default: 1 + by_box (bool): Filter instances with bounding boxes not meeting the + min_gt_bbox_wh threshold. Default: False + by_area (bool): Filter instances with area less than min_gt_area + threshold. Default: False + by_kpt (bool): Filter instances with keypoints_visible not meeting the + min_kpt_vis threshold. Default: True + keep_empty (bool): Whether to return None when it + becomes an empty bbox after filtering. Defaults to True. + """ + + def __init__(self, + min_gt_bbox_wh: Tuple[int, int] = (1, 1), + min_gt_area: int = 1, + min_kpt_vis: int = 1, + by_box: bool = False, + by_area: bool = False, + by_kpt: bool = True, + keep_empty: bool = True) -> None: + + assert by_box or by_kpt or by_area + self.min_gt_bbox_wh = min_gt_bbox_wh + self.min_gt_area = min_gt_area + self.min_kpt_vis = min_kpt_vis + self.by_box = by_box + self.by_area = by_area + self.by_kpt = by_kpt + self.keep_empty = keep_empty + + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to filter annotations. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. 
+ """ + assert 'keypoints' in results + kpts = results['keypoints'] + if kpts.shape[0] == 0: + return results + + tests = [] + if self.by_box and 'bbox' in results: + bbox = results['bbox'] + tests.append( + ((bbox[..., 2] - bbox[..., 0] > self.min_gt_bbox_wh[0]) & + (bbox[..., 3] - bbox[..., 1] > self.min_gt_bbox_wh[1]))) + if self.by_area and 'area' in results: + area = results['area'] + tests.append(area >= self.min_gt_area) + if self.by_kpt: + kpts_vis = results['keypoints_visible'] + if kpts_vis.ndim == 3: + kpts_vis = kpts_vis[..., 0] + tests.append(kpts_vis.sum(axis=1) >= self.min_kpt_vis) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + if not keep.any(): + if self.keep_empty: + return None + + keys = ('bbox', 'bbox_score', 'category_id', 'keypoints', + 'keypoints_visible', 'area') + for key in keys: + if key in results: + results[key] = results[key][keep] + + return results + + def __repr__(self): + return (f'{self.__class__.__name__}(' + f'min_gt_bbox_wh={self.min_gt_bbox_wh}, ' + f'min_gt_area={self.min_gt_area}, ' + f'min_kpt_vis={self.min_kpt_vis}, ' + f'by_box={self.by_box}, ' + f'by_area={self.by_area}, ' + f'by_kpt={self.by_kpt}, ' + f'keep_empty={self.keep_empty})') diff --git a/mmpose/datasets/transforms/mix_img_transforms.py b/mmpose/datasets/transforms/mix_img_transforms.py new file mode 100644 index 0000000000..84d03ea5a2 --- /dev/null +++ b/mmpose/datasets/transforms/mix_img_transforms.py @@ -0,0 +1,501 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta +from collections import defaultdict +from typing import Optional, Sequence, Tuple + +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmengine.dataset.base_dataset import Compose +from numpy import random + +from mmpose.registry import TRANSFORMS +from mmpose.structures import (bbox_clip_border, flip_bbox, flip_keypoints, + keypoint_clip_border) + + +class MixImageTransform(BaseTransform, metaclass=ABCMeta): + """Abstract base class for mixup-style image data augmentation. + + Args: + pre_transform (Optional[Sequence[str]]): A sequence of transform + to be applied before mixup. Defaults to None. + prob (float): Probability of applying the mixup transformation. + Defaults to 1.0. + """ + + def __init__(self, + pre_transform: Optional[Sequence[str]] = None, + prob: float = 1.0): + + self.prob = prob + + if pre_transform is None: + self.pre_transform = None + else: + self.pre_transform = Compose(pre_transform) + + def transform(self, results: dict) -> dict: + """Transform the input data dictionary using mixup-style augmentation. + + Args: + results (dict): A dictionary containing input data. + """ + + if random.uniform(0, 1) < self.prob: + + dataset = results.pop('dataset', None) + + results['mixed_data_list'] = self._get_mixed_data_list(dataset) + results = self.apply_mix(results) + + if 'mixed_data_list' in results: + results.pop('mixed_data_list') + + results['dataset'] = dataset + + return results + + def _get_mixed_data_list(self, dataset): + """Get a list of mixed data samples from the dataset. + + Args: + dataset: The dataset from which to sample the mixed data. + + Returns: + List[dict]: A list of dictionaries containing mixed data samples. 
+ """ + indexes = [ + random.randint(0, len(dataset)) for _ in range(self.num_aux_image) + ] + + mixed_data_list = [ + copy.deepcopy(dataset.get_data_info(index)) for index in indexes + ] + + if self.pre_transform is not None: + for i, data in enumerate(mixed_data_list): + data.update({'dataset': dataset}) + _results = self.pre_transform(data) + _results.pop('dataset') + mixed_data_list[i] = _results + + return mixed_data_list + + +@TRANSFORMS.register_module() +class Mosaic(MixImageTransform): + """Mosaic augmentation. This transformation takes four input images and + combines them into a single output image using the mosaic technique. The + resulting image is composed of parts from each of the four sub-images. The + mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersection of the four images. + 2. Select the top-left image according to the index and randomly sample + three more images from the custom dataset. + 3. If an image is larger than the mosaic patch, it will be cropped. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | | + | +-----------+ pad | + | | | | + | | image1 +-----------+ + | | | | + | | | image2 | + center_y |----+-+-----------+-----------+ + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + Required Keys: + + - img + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Modified Keys: + + - img + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + pad_val (int): Pad value. Defaults to 114. + pre_transform (Optional[Sequence[str]]): A sequence of transform + to be applied before mixup. Defaults to None. + prob (float): Probability of applying the mixup transformation. + Defaults to 1.0. 
+ """ + + num_aux_image = 3 + + def __init__( + self, + img_scale: Tuple[int, int] = (640, 640), + center_range: Tuple[float, float] = (0.5, 1.5), + pad_val: float = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + ): + + super().__init__(pre_transform=pre_transform, prob=prob) + + self.img_scale = img_scale + self.center_range = center_range + self.pad_val = pad_val + + def apply_mix(self, results: dict) -> dict: + """Apply mosaic augmentation to the input data.""" + + assert 'mixed_data_list' in results + mixed_data_list = results.pop('mixed_data_list') + assert len(mixed_data_list) == self.num_aux_image + + img, annos = self._create_mosaic_image(results, mixed_data_list) + bboxes = annos['bboxes'] + kpts = annos['keypoints'] + kpts_vis = annos['keypoints_visible'] + + bboxes = bbox_clip_border(bboxes, (2 * self.img_scale[0], + 2 * self.img_scale[1])) + kpts, kpts_vis = keypoint_clip_border(kpts, kpts_vis, + (2 * self.img_scale[0], + 2 * self.img_scale[1])) + + results['img'] = img + results['img_shape'] = img.shape + results['bbox'] = bboxes + results['category_id'] = annos['category_id'] + results['bbox_score'] = annos['bbox_scores'] + results['keypoints'] = kpts + results['keypoints_visible'] = kpts_vis + results['area'] = annos['area'] + + return results + + def _create_mosaic_image(self, results, mixed_data_list): + """Create the mosaic image and corresponding annotations by combining + four input images.""" + + # init mosaic image + img_scale_w, img_scale_h = self.img_scale + mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + + # calculate mosaic center + center = (int(random.uniform(*self.center_range) * img_scale_w), + int(random.uniform(*self.center_range) * img_scale_h)) + + annos = defaultdict(list) + locs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for loc, data in zip(locs, (results, *mixed_data_list)): + + # process image + img = data['img'] + h, w = img.shape[:2] + scale_ratio = min(img_scale_h / h, img_scale_w / w) + img = mmcv.imresize(img, + (int(w * scale_ratio), int(h * scale_ratio))) + + # paste + paste_coord, crop_coord = self._mosaic_combine( + loc, center, img.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img[y1_c:y2_c, x1_c:x2_c] + padw = x1_p - x1_c + padh = y1_p - y1_c + + # merge annotations + if 'bbox' in data: + bboxes = data['bbox'] + + # rescale & translate + bboxes *= scale_ratio + bboxes[..., ::2] += padw + bboxes[..., 1::2] += padh + + annos['bboxes'].append(bboxes) + annos['bbox_scores'].append(data['bbox_score']) + annos['category_id'].append(data['category_id']) + + if 'keypoints' in data: + kpts = data['keypoints'] + + # rescale & translate + kpts *= scale_ratio + kpts[..., 0] += padw + kpts[..., 1] += padh + + annos['keypoints'].append(kpts) + annos['keypoints_visible'].append(data['keypoints_visible']) + + if 'area' in data: + annos['area'].append(data['area'] * scale_ratio**2) + + for key in annos: + annos[key] = np.concatenate(annos[key]) + return mosaic_img, annos + + def _mosaic_combine( + self, loc: str, center: Tuple[float, float], img_shape: Tuple[int, int] + ) -> Tuple[Tuple[int, int, int, int], Tuple[int, int, int, int]]: + """Determine the overall coordinates of the mosaic image and the + specific coordinates of the cropped sub-image.""" + + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + + x1, y1, 
x2, y2 = 0, 0, 0, 0 + cx, cy = center + w, h = img_shape + + if loc == 'top_left': + x1, y1, x2, y2 = max(cx - w, 0), max(cy - h, 0), cx, cy + crop_coord = w - (x2 - x1), h - (y2 - y1), w, h + elif loc == 'top_right': + x1, y1, x2, y2 = cx, max(cy - h, 0), min(cx + w, + self.img_scale[0] * 2), cy + crop_coord = 0, h - (y2 - y1), min(w, x2 - x1), h + elif loc == 'bottom_left': + x1, y1, x2, y2 = max(cx - w, + 0), cy, cx, min(self.img_scale[1] * 2, cy + h) + crop_coord = w - (x2 - x1), 0, w, min(y2 - y1, h) + else: + x1, y1, x2, y2 = cx, cy, min(cx + w, self.img_scale[0] * + 2), min(self.img_scale[1] * 2, cy + h) + crop_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h) + + return (x1, y1, x2, y2), crop_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_range={self.center_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOXMixUp(MixImageTransform): + """MixUp data augmentation for YOLOX. This transform combines two images + through mixup to enhance the dataset's diversity. + + Mixup Transform Steps: + + 1. A random image is chosen from the dataset and placed in the + top-left corner of the target image (after padding and resizing). + 2. The target of the mixup transform is obtained by taking the + weighted average of the mixup image and the original image. + + .. code:: text + + mixup transform + +---------------+--------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + +---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | +-----------------+ | + | pad | + +------------------------------+ + + Required Keys: + + - img + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Modified Keys: + + - img + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + pre_transform (Optional[Sequence[str]]): A sequence of transform + to be applied before mixup. Defaults to None. + prob (float): Probability of applying the mixup transformation. + Defaults to 1.0. 
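+
+    Example:
+        A pipeline-config sketch; the values are illustrative and
+        ``LoadImage`` is assumed to be available as a pre-transform:
+
+        >>> train_pipeline_stage1 = [
+        ...     dict(
+        ...         type='YOLOXMixUp',
+        ...         img_scale=(640, 640),
+        ...         ratio_range=(0.8, 1.6),
+        ...         pad_val=114.0,
+        ...         pre_transform=[dict(type='LoadImage')]),
+        ... ]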
+ """ + num_aux_image = 1 + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + bbox_clip_border: bool = True, + pre_transform: Sequence[dict] = None, + prob: float = 1.0): + assert isinstance(img_scale, tuple) + super().__init__(pre_transform=pre_transform, prob=prob) + self.img_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.bbox_clip_border = bbox_clip_border + + def apply_mix(self, results: dict) -> dict: + """YOLOX MixUp transform function.""" + + assert 'mixed_data_list' in results + mixed_data_list = results.pop('mixed_data_list') + assert len(mixed_data_list) == self.num_aux_image + + if mixed_data_list[0]['keypoints'].shape[0] == 0: + return results + + img, annos = self._create_mixup_image(results, mixed_data_list) + bboxes = annos['bboxes'] + kpts = annos['keypoints'] + kpts_vis = annos['keypoints_visible'] + + h, w = img.shape[:2] + bboxes = bbox_clip_border(bboxes, (w, h)) + kpts, kpts_vis = keypoint_clip_border(kpts, kpts_vis, (w, h)) + + results['img'] = img.astype(np.uint8) + results['img_shape'] = img.shape + results['bbox'] = bboxes + results['category_id'] = annos['category_id'] + results['bbox_score'] = annos['bbox_scores'] + results['keypoints'] = kpts + results['keypoints_visible'] = kpts_vis + results['area'] = annos['area'] + + return results + + def _create_mixup_image(self, results, mixed_data_list): + """Create the mixup image and corresponding annotations by combining + two input images.""" + + aux_results = mixed_data_list[0] + aux_img = aux_results['img'] + + # init mixup image + out_img = np.ones((self.img_scale[1], self.img_scale[0], 3), + dtype=aux_img.dtype) * self.pad_val + annos = defaultdict(list) + + # Calculate scale ratio and resize aux_img + scale_ratio = min(self.img_scale[1] / aux_img.shape[0], + self.img_scale[0] / aux_img.shape[1]) + aux_img = mmcv.imresize(aux_img, (int(aux_img.shape[1] * scale_ratio), + int(aux_img.shape[0] * scale_ratio))) + + # Set the resized aux_img in the top-left of out_img + out_img[:aux_img.shape[0], :aux_img.shape[1]] = aux_img + + # random rescale + jit_factor = random.uniform(*self.ratio_range) + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # random flip + is_filp = random.uniform(0, 1) > self.flip_ratio + if is_filp: + out_img = out_img[:, ::-1, :] + + # random crop + ori_img = results['img'] + aux_h, aux_w = out_img.shape[:2] + h, w = ori_img.shape[:2] + padded_img = np.ones((max(aux_h, h), max(aux_w, w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:aux_h, :aux_w] = out_img + + dy = random.randint(0, max(0, padded_img.shape[0] - h) + 1) + dx = random.randint(0, max(0, padded_img.shape[1] - w) + 1) + padded_cropped_img = padded_img[dy:dy + h, dx:dx + w] + + # mix up + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img + + # merge annotations + # bboxes + bboxes = aux_results['bbox'].copy() + bboxes *= scale_ratio + bboxes = bbox_clip_border(bboxes, (aux_w, aux_h)) + if is_filp: + bboxes = flip_bbox(bboxes, [aux_w, aux_h], 'xyxy') + bboxes[..., ::2] -= dx + bboxes[..., 1::2] -= dy + annos['bboxes'] = [results['bbox'], bboxes] + annos['bbox_scores'] = [ + results['bbox_score'], aux_results['bbox_score'] + ] + annos['category_id'] = [ + results['category_id'], aux_results['category_id'] + ] + + # keypoints + kpts 
= aux_results['keypoints'] * scale_ratio + kpts, kpts_vis = keypoint_clip_border(kpts, + aux_results['keypoints_visible'], + (aux_w, aux_h)) + if is_filp: + kpts, kpts_vis = flip_keypoints(kpts, kpts_vis, (aux_w, aux_h), + aux_results['flip_indices']) + kpts[..., 0] -= dx + kpts[..., 1] -= dy + annos['keypoints'] = [results['keypoints'], kpts] + annos['keypoints_visible'] = [results['keypoints_visible'], kpts_vis] + annos['area'] = [results['area'], aux_results['area'] * scale_ratio**2] + + for key in annos: + annos[key] = np.concatenate(annos[key]) + + return mixup_img, annos + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str diff --git a/mmpose/engine/__init__.py b/mmpose/engine/__init__.py index ac85928986..44f7fa17bc 100644 --- a/mmpose/engine/__init__.py +++ b/mmpose/engine/__init__.py @@ -1,3 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. from .hooks import * # noqa: F401, F403 from .optim_wrappers import * # noqa: F401, F403 +from .schedulers import * # noqa: F401, F403 diff --git a/mmpose/engine/hooks/__init__.py b/mmpose/engine/hooks/__init__.py index abfe762881..2c31ca081c 100644 --- a/mmpose/engine/hooks/__init__.py +++ b/mmpose/engine/hooks/__init__.py @@ -1,6 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. from .badcase_hook import BadCaseAnalysisHook from .ema_hook import ExpMomentumEMA +from .mode_switch_hooks import YOLOXPoseModeSwitchHook +from .sync_norm_hook import SyncNormHook from .visualization_hook import PoseVisualizationHook -__all__ = ['PoseVisualizationHook', 'ExpMomentumEMA', 'BadCaseAnalysisHook'] +__all__ = [ + 'PoseVisualizationHook', 'ExpMomentumEMA', 'BadCaseAnalysisHook', + 'YOLOXPoseModeSwitchHook', 'SyncNormHook' +] diff --git a/mmpose/engine/hooks/mode_switch_hooks.py b/mmpose/engine/hooks/mode_switch_hooks.py new file mode 100644 index 0000000000..862e36dc0b --- /dev/null +++ b/mmpose/engine/hooks/mode_switch_hooks.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper +from mmengine.runner import Runner + +from mmpose.registry import HOOKS + + +@HOOKS.register_module() +class YOLOXPoseModeSwitchHook(Hook): + """Switch the mode of YOLOX-Pose during training. + + This hook: + 1) Turns off mosaic and mixup data augmentation. + 2) Uses instance mask to assist positive anchor selection. + 3) Uses auxiliary L1 loss in the head. + + Args: + num_last_epochs (int): The number of last epochs at the end of + training to close the data augmentation and switch to L1 loss. + Defaults to 20. + new_train_dataset (dict): New training dataset configuration that + will be used in place of the original training dataset. Defaults + to None. + new_train_pipeline (Sequence[dict]): New data augmentation pipeline + configuration that will be used in place of the original pipeline + during training. Defaults to None. 
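+
+    Example:
+        A config sketch registering the hook; ``train_pipeline_stage2``
+        is assumed to be a pipeline without Mosaic/MixUp defined
+        elsewhere in the config:
+
+        >>> custom_hooks = [
+        ...     dict(
+        ...         type='YOLOXPoseModeSwitchHook',
+        ...         num_last_epochs=20,
+        ...         new_train_pipeline=train_pipeline_stage2,
+        ...         priority=48),
+        ... ]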
+ """ + + def __init__(self, + num_last_epochs: int = 20, + new_train_dataset: dict = None, + new_train_pipeline: Sequence[dict] = None): + self.num_last_epochs = num_last_epochs + self.new_train_dataset = new_train_dataset + self.new_train_pipeline = new_train_pipeline + + def _modify_dataloader(self, runner: Runner): + """Modify dataloader with new dataset and pipeline configurations.""" + runner.logger.info(f'New Pipeline: {self.new_train_pipeline}') + + train_dataloader_cfg = copy.deepcopy(runner.cfg.train_dataloader) + if self.new_train_dataset: + train_dataloader_cfg.dataset = self.new_train_dataset + if self.new_train_pipeline: + train_dataloader_cfg.dataset.pipeline = self.new_train_pipeline + + new_train_dataloader = Runner.build_dataloader(train_dataloader_cfg) + runner.train_loop.dataloader = new_train_dataloader + runner.logger.info('Recreated the dataloader!') + + def before_train_epoch(self, runner: Runner): + """Close mosaic and mixup augmentation, switch to use L1 loss.""" + epoch = runner.epoch + model = runner.model + if is_model_wrapper(model): + model = model.module + + if epoch + 1 == runner.max_epochs - self.num_last_epochs: + self._modify_dataloader(runner) + runner.logger.info('Added additional reg loss now!') + model.head.use_aux_loss = True diff --git a/mmpose/engine/hooks/sync_norm_hook.py b/mmpose/engine/hooks/sync_norm_hook.py new file mode 100644 index 0000000000..053e4f92af --- /dev/null +++ b/mmpose/engine/hooks/sync_norm_hook.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict + +from mmengine.dist import all_reduce_dict, get_dist_info +from mmengine.hooks import Hook +from torch import nn + +from mmpose.registry import HOOKS + + +def get_norm_states(module: nn.Module) -> OrderedDict: + """Get the state_dict of batch norms in the module.""" + async_norm_states = OrderedDict() + for name, child in module.named_modules(): + if isinstance(child, nn.modules.batchnorm._NormBase): + for k, v in child.state_dict().items(): + async_norm_states['.'.join([name, k])] = v + return async_norm_states + + +@HOOKS.register_module() +class SyncNormHook(Hook): + """Synchronize Norm states before validation.""" + + def before_val_epoch(self, runner): + """Synchronize normalization statistics.""" + module = runner.model + rank, world_size = get_dist_info() + + if world_size == 1: + return + + norm_states = get_norm_states(module) + if len(norm_states) == 0: + return + + try: + norm_states = all_reduce_dict(norm_states, op='mean') + module.load_state_dict(norm_states, strict=True) + except Exception as e: + runner.logger.warn(f'SyncNormHook failed: {str(e)}') diff --git a/mmpose/engine/schedulers/__init__.py b/mmpose/engine/schedulers/__init__.py new file mode 100644 index 0000000000..01261646fa --- /dev/null +++ b/mmpose/engine/schedulers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .quadratic_warmup import (QuadraticWarmupLR, QuadraticWarmupMomentum, + QuadraticWarmupParamScheduler) + +__all__ = [ + 'QuadraticWarmupParamScheduler', 'QuadraticWarmupMomentum', + 'QuadraticWarmupLR' +] diff --git a/mmpose/engine/schedulers/quadratic_warmup.py b/mmpose/engine/schedulers/quadratic_warmup.py new file mode 100644 index 0000000000..1021797217 --- /dev/null +++ b/mmpose/engine/schedulers/quadratic_warmup.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.optim.scheduler.lr_scheduler import LRSchedulerMixin +from mmengine.optim.scheduler.momentum_scheduler import MomentumSchedulerMixin +from mmengine.optim.scheduler.param_scheduler import INF, _ParamScheduler +from torch.optim import Optimizer + +from mmpose.registry import PARAM_SCHEDULERS + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupParamScheduler(_ParamScheduler): + r"""Warm up the parameter value of each parameter group by quadratic + formula: + + .. math:: + + X_{t} = X_{t-1} + \frac{2t+1}{{(end-begin)}^{2}} \times X_{base} + + Args: + optimizer (Optimizer): Wrapped optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + def __init__(self, + optimizer: Optimizer, + param_name: str, + begin: int = 0, + end: int = INF, + last_step: int = -1, + by_epoch: bool = True, + verbose: bool = False): + if end >= INF: + raise ValueError('``end`` must be less than infinity,' + 'Please set ``end`` parameter of ' + '``QuadraticWarmupScheduler`` as the ' + 'number of warmup end.') + self.total_iters = end - begin + super().__init__( + optimizer=optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + @classmethod + def build_iter_from_epoch(cls, + *args, + begin=0, + end=INF, + by_epoch=True, + epoch_length=None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config.""" + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ + 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ + f'`epoch_length` must be a positive integer, ' \ + f'but got {epoch_length}.' + by_epoch = False + begin = begin * epoch_length + if end != INF: + end = end * epoch_length + return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + if self.last_step == 0: + return [ + base_value * (2 * self.last_step + 1) / self.total_iters**2 + for base_value in self.base_values + ] + + return [ + group[self.param_name] + base_value * + (2 * self.last_step + 1) / self.total_iters**2 + for base_value, group in zip(self.base_values, + self.optimizer.param_groups) + ] + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupLR(LRSchedulerMixin, QuadraticWarmupParamScheduler): + """Warm up the learning rate of each parameter group by quadratic formula. + + Args: + optimizer (Optimizer): Wrapped optimizer. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. 
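+
+    Example:
+        A config sketch warming up the learning rate over the first
+        epoch; ``convert_to_iter_based`` lets the epoch-based range be
+        converted via ``build_iter_from_epoch``:
+
+        >>> param_scheduler = [
+        ...     dict(
+        ...         type='QuadraticWarmupLR',
+        ...         by_epoch=True,
+        ...         begin=0,
+        ...         end=1,
+        ...         convert_to_iter_based=True),
+        ... ]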
+ """ + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupMomentum(MomentumSchedulerMixin, + QuadraticWarmupParamScheduler): + """Warm up the momentum value of each parameter group by quadratic formula. + + Args: + optimizer (Optimizer): Wrapped optimizer. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ diff --git a/mmpose/evaluation/functional/__init__.py b/mmpose/evaluation/functional/__init__.py index 49f243163c..f59f6f5a40 100644 --- a/mmpose/evaluation/functional/__init__.py +++ b/mmpose/evaluation/functional/__init__.py @@ -3,10 +3,11 @@ keypoint_nme, keypoint_pck_accuracy, multilabel_classification_accuracy, pose_pck_accuracy, simcc_pck_accuracy) -from .nms import nms, oks_nms, soft_oks_nms +from .nms import nms, nms_torch, oks_nms, soft_oks_nms __all__ = [ 'keypoint_pck_accuracy', 'keypoint_auc', 'keypoint_nme', 'keypoint_epe', 'pose_pck_accuracy', 'multilabel_classification_accuracy', - 'simcc_pck_accuracy', 'nms', 'oks_nms', 'soft_oks_nms', 'keypoint_mpjpe' + 'simcc_pck_accuracy', 'nms', 'oks_nms', 'soft_oks_nms', 'keypoint_mpjpe', + 'nms_torch' ] diff --git a/mmpose/evaluation/functional/nms.py b/mmpose/evaluation/functional/nms.py index eed4e5cf73..801fee7764 100644 --- a/mmpose/evaluation/functional/nms.py +++ b/mmpose/evaluation/functional/nms.py @@ -7,6 +7,10 @@ from typing import List, Optional import numpy as np +import torch +from torch import Tensor + +from mmpose.structures.bbox import bbox_overlaps def nms(dets: np.ndarray, thr: float) -> List[int]: @@ -325,3 +329,40 @@ def nearby_joints_nms( keep_pose_inds = [keep_pose_inds[i] for i in sub_inds] return keep_pose_inds + + +def nms_torch(bboxes: Tensor, + scores: Tensor, + threshold: float = 0.65, + iou_calculator=bbox_overlaps, + return_group: bool = False): + """Perform Non-Maximum Suppression (NMS) on a set of bounding boxes using + their corresponding scores. + + Args: + + bboxes (Tensor): list of bounding boxes (each containing 4 elements + for x1, y1, x2, y2). + scores (Tensor): scores associated with each bounding box. + threshold (float): IoU threshold to determine overlap. + iou_calculator (function): method to calculate IoU. + return_group (bool): if True, returns groups of overlapping bounding + boxes, otherwise returns the main bounding boxes. 
+ """ + + _, indices = scores.sort(descending=True) + groups = [] + while len(indices): + idx, indices = indices[0], indices[1:] + bbox = bboxes[idx] + ious = iou_calculator(bbox, bboxes[indices]) + close_indices = torch.where(ious > threshold)[0] + keep_indices = torch.ones_like(indices, dtype=torch.bool) + keep_indices[close_indices] = 0 + groups.append(torch.cat((idx[None], indices[close_indices]))) + indices = indices[keep_indices] + + if return_group: + return groups + else: + return torch.cat([g[:1] for g in groups]) diff --git a/mmpose/evaluation/metrics/coco_metric.py b/mmpose/evaluation/metrics/coco_metric.py index 8b5e80d954..d1c7191338 100644 --- a/mmpose/evaluation/metrics/coco_metric.py +++ b/mmpose/evaluation/metrics/coco_metric.py @@ -13,6 +13,7 @@ from xtcocotools.cocoeval import COCOeval from mmpose.registry import METRICS, TRANSFORMS +from mmpose.structures.bbox import bbox_xyxy2xywh from ..functional import oks_nms, soft_oks_nms @@ -213,9 +214,13 @@ def process(self, data_batch: Sequence[dict], pred = dict() pred['id'] = data_sample['id'] pred['img_id'] = data_sample['img_id'] + pred['keypoints'] = keypoints pred['keypoint_scores'] = keypoint_scores pred['category_id'] = data_sample.get('category_id', 1) + if 'bboxes' in data_sample['pred_instances']: + pred['bbox'] = bbox_xyxy2xywh( + data_sample['pred_instances']['bboxes']) if 'bbox_scores' in data_sample['pred_instances']: # some one-stage models will predict bboxes and scores @@ -405,6 +410,8 @@ def compute_metrics(self, results: list) -> Dict[str, float]: 'keypoint_scores': pred['keypoint_scores'][idx], 'bbox_score': pred['bbox_scores'][idx], } + if 'bbox' in pred: + instance['bbox'] = pred['bbox'][idx] if 'areas' in pred: instance['area'] = pred['areas'][idx] @@ -510,12 +517,17 @@ def results2json(self, keypoints: Dict[int, list], # collect all the person keypoints in current image _keypoints = _keypoints.reshape(-1, num_keypoints * 3) - result = [{ - 'image_id': img_kpt['img_id'], - 'category_id': img_kpt['category_id'], - 'keypoints': keypoint.tolist(), - 'score': float(img_kpt['score']), - } for img_kpt, keypoint in zip(img_kpts, _keypoints)] + result = [] + for img_kpt, keypoint in zip(img_kpts, _keypoints): + res = { + 'image_id': img_kpt['img_id'], + 'category_id': img_kpt['category_id'], + 'keypoints': keypoint.tolist(), + 'score': float(img_kpt['score']), + } + if 'bbox' in img_kpt: + res['bbox'] = img_kpt['bbox'].tolist(), + result.append(res) cat_results.extend(result) diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index 563264eecf..1559b6288b 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .alexnet import AlexNet from .cpm import CPM +from .csp_darknet import CSPDarknet +from .cspnext import CSPNeXt from .dstformer import DSTFormer from .hourglass import HourglassNet from .hourglass_ae import HourglassAENet @@ -34,5 +36,6 @@ 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', - 'PyramidVisionTransformerV2', 'SwinTransformer', 'DSTFormer' + 'PyramidVisionTransformerV2', 'SwinTransformer', 'DSTFormer', 'CSPDarknet', + 'CSPNeXt' ] diff --git a/mmpose/models/backbones/csp_darknet.py b/mmpose/models/backbones/csp_darknet.py new file mode 100644 index 0000000000..dbaba0cfd9 --- /dev/null +++ b/mmpose/models/backbones/csp_darknet.py @@ -0,0 +1,286 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.registry import MODELS +from ..utils import CSPLayer + + +class Focus(nn.Module): + """Focus width and height information into channel space. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_size (int): The kernel size of the convolution. Default: 1 + stride (int): The stride of the convolution. Default: 1 + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish'). + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')): + super().__init__() + self.conv = ConvModule( + in_channels * 4, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + +class SPPBottleneck(BaseModule): + """Spatial pyramid pooling layer used in YOLOv3-SPP. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling + layers. Default: (5, 9, 13). + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
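+
+    Example:
+        A shape-check sketch with randomly initialized weights:
+
+        >>> import torch
+        >>> self = SPPBottleneck(64, 128)
+        >>> x = torch.rand(1, 64, 32, 32)
+        >>> self(x).shape
+        torch.Size([1, 128, 32, 32])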
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=None): + super().__init__(init_cfg) + mid_channels = in_channels // 2 + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = mid_channels * (len(kernel_sizes) + 1) + self.conv2 = ConvModule( + conv2_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + x = self.conv1(x) + with torch.cuda.amp.autocast(enabled=False): + x = torch.cat( + [x] + [pooling(x) for pooling in self.poolings], dim=1) + x = self.conv2(x) + return x + + +@MODELS.register_module() +class CSPDarknet(BaseModule): + """CSP-Darknet backbone used in YOLOv5 and YOLOX. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Default: P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Default: 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Default: -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Default: False. + arch_ovewrite(list): Overwrite default arch settings. Default: None. + spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Default: (5, 9, 13). + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + Example: + >>> from mmpose.models import CSPDarknet + >>> import torch + >>> self = CSPDarknet(depth=53) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__(self, + arch='P5', + deepen_factor=1.0, + widen_factor=1.0, + out_indices=(2, 3, 4), + frozen_stages=-1, + use_depthwise=False, + arch_ovewrite=None, + spp_kernal_sizes=(5, 9, 13), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + norm_eval=False, + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu')): + super().__init__(init_cfg) + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('frozen_stages must be in range(-1, ' + 'len(arch_setting) + 1). But received ' + f'{frozen_stages}') + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.use_depthwise = use_depthwise + self.norm_eval = norm_eval + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + self.stem = Focus( + 3, + int(arch_setting[0][0] * widen_factor), + kernel_size=3, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.layers = ['stem'] + + for i, (in_channels, out_channels, num_blocks, add_identity, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * widen_factor) + out_channels = int(out_channels * widen_factor) + num_blocks = max(round(num_blocks * deepen_factor), 1) + stage = [] + conv_layer = conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPBottleneck( + out_channels, + out_channels, + kernel_sizes=spp_kernal_sizes, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(csp_layer) + self.add_module(f'stage{i + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{i + 1}') + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(CSPDarknet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmpose/models/backbones/cspnext.py b/mmpose/models/backbones/cspnext.py new file mode 100644 index 0000000000..5275bb255a --- /dev/null +++ b/mmpose/models/backbones/cspnext.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import math +from typing import Optional, Sequence, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch import Tensor +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.registry import MODELS +from mmpose.utils.typing import ConfigType +from ..utils import CSPLayer +from .csp_darknet import SPPBottleneck + + +@MODELS.register_module() +class CSPNeXt(BaseModule): + """CSPNeXt backbone used in RTMDet. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + spp_kernel_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Defaults to (5, 9, 13). + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
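+
+    Example:
+        A shape-check sketch mirroring the CSPDarknet example above;
+        weights are randomly initialized:
+
+        >>> from mmpose.models import CSPNeXt
+        >>> import torch
+        >>> self = CSPNeXt()
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)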
+ """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__( + self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + arch_ovewrite: dict = None, + spp_kernel_sizes: Sequence[int] = (5, 9, 13), + channel_attention: bool = True, + conv_cfg: Optional[ConfigType] = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + norm_eval: bool = False, + init_cfg: Optional[ConfigType] = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + super().__init__(init_cfg=init_cfg) + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('frozen_stages must be in range(-1, ' + 'len(arch_setting) + 1). But received ' + f'{frozen_stages}') + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.use_depthwise = use_depthwise + self.norm_eval = norm_eval + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.stem = nn.Sequential( + ConvModule( + 3, + int(arch_setting[0][0] * widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + int(arch_setting[0][0] * widen_factor // 2), + int(arch_setting[0][0] * widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + int(arch_setting[0][0] * widen_factor // 2), + int(arch_setting[0][0] * widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.layers = ['stem'] + + for i, (in_channels, out_channels, num_blocks, add_identity, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * widen_factor) + out_channels = int(out_channels * widen_factor) + num_blocks = max(round(num_blocks * deepen_factor), 1) + stage = [] + conv_layer = conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPBottleneck( + out_channels, + out_channels, + kernel_sizes=spp_kernel_sizes, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=use_depthwise, + use_cspnext_block=True, + expand_ratio=expand_ratio, + channel_attention=channel_attention, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(csp_layer) + self.add_module(f'stage{i + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{i + 1}') + + def _freeze_stages(self) -> None: + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True) -> None: + 
super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]: + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmpose/models/data_preprocessors/__init__.py b/mmpose/models/data_preprocessors/__init__.py index 7c9bd22e2b..7abf9a6af0 100644 --- a/mmpose/models/data_preprocessors/__init__.py +++ b/mmpose/models/data_preprocessors/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .batch_augmentation import BatchSyncRandomResize from .data_preprocessor import PoseDataPreprocessor -__all__ = ['PoseDataPreprocessor'] +__all__ = ['PoseDataPreprocessor', 'BatchSyncRandomResize'] diff --git a/mmpose/models/data_preprocessors/batch_augmentation.py b/mmpose/models/data_preprocessors/batch_augmentation.py new file mode 100644 index 0000000000..e4dcd568e5 --- /dev/null +++ b/mmpose/models/data_preprocessors/batch_augmentation.py @@ -0,0 +1,115 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine import MessageHub +from mmengine.dist import barrier, broadcast, get_dist_info +from mmengine.structures import PixelData +from torch import Tensor + +from mmpose.registry import MODELS +from mmpose.structures import PoseDataSample + + +@MODELS.register_module() +class BatchSyncRandomResize(nn.Module): + """Batch random resize which synchronizes the random size across ranks. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. 
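+
+    Example:
+        A config sketch wiring the batch augmentation into the data
+        preprocessor; the size range and interval are illustrative:
+
+        >>> data_preprocessor = dict(
+        ...     type='PoseDataPreprocessor',
+        ...     pad_size_divisor=32,
+        ...     batch_augments=[
+        ...         dict(
+        ...             type='BatchSyncRandomResize',
+        ...             random_size_range=(480, 800),
+        ...             size_divisor=32,
+        ...             interval=1)
+        ...     ])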
+ """ + + def __init__(self, + random_size_range: Tuple[int, int], + interval: int = 10, + size_divisor: int = 32) -> None: + super().__init__() + self.rank, self.world_size = get_dist_info() + self._input_size = None + self._random_size_range = (round(random_size_range[0] / size_divisor), + round(random_size_range[1] / size_divisor)) + self._interval = interval + self._size_divisor = size_divisor + + def forward(self, inputs: Tensor, data_samples: List[PoseDataSample] + ) -> Tuple[Tensor, List[PoseDataSample]]: + """resize a batch of images and bboxes to shape ``self._input_size``""" + h, w = inputs.shape[-2:] + if self._input_size is None: + self._input_size = (h, w) + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1 or scale_y != 1: + inputs = F.interpolate( + inputs, + size=self._input_size, + mode='bilinear', + align_corners=False) + for data_sample in data_samples: + img_shape = (int(data_sample.img_shape[0] * scale_y), + int(data_sample.img_shape[1] * scale_x)) + pad_shape = (int(data_sample.pad_shape[0] * scale_y), + int(data_sample.pad_shape[1] * scale_x)) + data_sample.set_metainfo({ + 'img_shape': img_shape, + 'pad_shape': pad_shape, + 'batch_input_shape': self._input_size + }) + + if 'gt_instance_labels' not in data_sample: + continue + + if 'bboxes' in data_sample.gt_instance_labels: + data_sample.gt_instance_labels.bboxes[..., 0::2] *= scale_x + data_sample.gt_instance_labels.bboxes[..., 1::2] *= scale_y + + if 'keypoints' in data_sample.gt_instance_labels: + data_sample.gt_instance_labels.keypoints[..., 0] *= scale_x + data_sample.gt_instance_labels.keypoints[..., 1] *= scale_y + + if 'areas' in data_sample.gt_instance_labels: + data_sample.gt_instance_labels.areas *= scale_x * scale_y + + if 'gt_fields' in data_sample \ + and 'heatmap_mask' in data_sample.gt_fields: + + mask = data_sample.gt_fields.heatmap_mask.unsqueeze(0) + gt_fields = PixelData() + gt_fields.set_field( + F.interpolate( + mask.float(), + size=self._input_size, + mode='bilinear', + align_corners=False).squeeze(0), 'heatmap_mask') + + data_sample.gt_fields = gt_fields + + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + self._input_size = self._get_random_size( + aspect_ratio=float(w / h), device=inputs.device) + return inputs, data_samples + + def _get_random_size(self, aspect_ratio: float, + device: torch.device) -> Tuple[int, int]: + """Randomly generate a shape in ``_random_size_range`` and broadcast to + all ranks.""" + tensor = torch.LongTensor(2).to(device) + if self.rank == 0: + size = random.randint(*self._random_size_range) + size = (self._size_divisor * size, + self._size_divisor * int(aspect_ratio * size)) + tensor[0] = size[0] + tensor[1] = size[1] + barrier() + broadcast(tensor, 0) + input_size = (tensor[0].item(), tensor[1].item()) + return input_size diff --git a/mmpose/models/data_preprocessors/data_preprocessor.py b/mmpose/models/data_preprocessors/data_preprocessor.py index bcfe54ab59..b5ce1e7fdd 100644 --- a/mmpose/models/data_preprocessors/data_preprocessor.py +++ b/mmpose/models/data_preprocessors/data_preprocessor.py @@ -1,5 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Optional, Sequence, Union + +import numpy as np +import torch +import torch.nn as nn from mmengine.model import ImgDataPreprocessor +from mmengine.utils import is_seq_of from mmpose.registry import MODELS @@ -7,3 +13,87 @@ @MODELS.register_module() class PoseDataPreprocessor(ImgDataPreprocessor): """Image pre-processor for pose estimation tasks.""" + + def __init__(self, + mean: Sequence[float] = None, + std: Sequence[float] = None, + pad_size_divisor: int = 1, + pad_value: Union[float, int] = 0, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False, + non_blocking: Optional[bool] = False, + batch_augments: Optional[List[dict]] = None): + super().__init__( + mean=mean, + std=std, + pad_size_divisor=pad_size_divisor, + pad_value=pad_value, + bgr_to_rgb=bgr_to_rgb, + rgb_to_bgr=rgb_to_bgr, + non_blocking=non_blocking) + if batch_augments is not None: + self.batch_augments = nn.ModuleList( + [MODELS.build(aug) for aug in batch_augments]) + else: + self.batch_augments = None + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding and bgr2rgb conversion based on + ``BaseDataPreprocessor``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + batch_pad_shape = self._get_pad_shape(data) + data = super().forward(data=data, training=training) + inputs, data_samples = data['inputs'], data['data_samples'] + batch_input_shape = tuple(inputs[0].size()[-2:]) + for data_sample, pad_shape in zip(data_samples, batch_pad_shape): + data_sample.set_metainfo({ + 'batch_input_shape': batch_input_shape, + 'pad_shape': pad_shape + }) + + if training and self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + return {'inputs': inputs, 'data_samples': data_samples} + + def _get_pad_shape(self, data: dict) -> List[tuple]: + """Get the pad_shape of each image based on data and + pad_size_divisor.""" + _batch_inputs = data['inputs'] + # Process data with `pseudo_collate`. + if is_seq_of(_batch_inputs, torch.Tensor): + batch_pad_shape = [] + for ori_input in _batch_inputs: + pad_h = int( + np.ceil(ori_input.shape[1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(ori_input.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape.append((pad_h, pad_w)) + # Process data with `default_collate`. + elif isinstance(_batch_inputs, torch.Tensor): + assert _batch_inputs.dim() == 4, ( + 'The input of `ImgDataPreprocessor` should be a NCHW tensor ' + 'or a list of tensor, but got a tensor with shape: ' + f'{_batch_inputs.shape}') + pad_h = int( + np.ceil(_batch_inputs.shape[1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(_batch_inputs.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0] + else: + raise TypeError('Output of `cast_data` should be a dict ' + 'or a tuple with inputs and data_samples, but got' + f'{type(data)}: {data}') + return batch_pad_shape diff --git a/mmpose/models/heads/hybrid_heads/__init__.py b/mmpose/models/heads/hybrid_heads/__init__.py index 6431b6a2c2..ff026ce855 100644 --- a/mmpose/models/heads/hybrid_heads/__init__.py +++ b/mmpose/models/heads/hybrid_heads/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .dekr_head import DEKRHead from .vis_head import VisPredictHead +from .yoloxpose_head import YOLOXPoseHead -__all__ = ['DEKRHead', 'VisPredictHead'] +__all__ = ['DEKRHead', 'VisPredictHead', 'YOLOXPoseHead'] diff --git a/mmpose/models/heads/hybrid_heads/yoloxpose_head.py b/mmpose/models/heads/hybrid_heads/yoloxpose_head.py new file mode 100644 index 0000000000..bdd25f7851 --- /dev/null +++ b/mmpose/models/heads/hybrid_heads/yoloxpose_head.py @@ -0,0 +1,752 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmpose.evaluation.functional import nms_torch +from mmpose.models.utils import filter_scores_and_topk +from mmpose.registry import MODELS, TASK_UTILS +from mmpose.structures import PoseDataSample +from mmpose.utils import reduce_mean +from mmpose.utils.typing import (ConfigType, Features, OptSampleList, + Predictions, SampleList) + + +class YOLOXPoseHeadModule(BaseModule): + """YOLOXPose head module for one-stage human pose estimation. + + This module predicts classification scores, bounding boxes, keypoint + offsets and visibilities from multi-level feature maps. + + Args: + num_classes (int): Number of categories excluding the background + category. + num_keypoints (int): Number of keypoints defined for one instance. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + feat_channels (int): Number of channels in the classification score + and objectness prediction branch. Defaults to 256. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_groups (int): Group number of group convolution layers in keypoint + regression branch. Defaults to 8. + channels_per_group (int): Number of channels for each group of group + convolution layers in keypoint regression branch. Defaults to 32. + featmap_strides (Sequence[int]): Downsample factor of each feature + map. Defaults to [8, 16, 32]. + conv_bias (bool or str): If specified as `auto`, it will be decided + by the norm_cfg. Bias of conv will be set as True if `norm_cfg` + is None, otherwise False. Defaults to "auto". + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
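+
+    Example:
+        A forward shape sketch on three dummy feature levels; the
+        channel and stride settings are illustrative:
+
+        >>> import torch
+        >>> self = YOLOXPoseHeadModule(
+        ...     num_keypoints=17, in_channels=256, feat_channels=256)
+        >>> feats = [torch.rand(1, 256, 256 // s, 256 // s)
+        ...          for s in [8, 16, 32]]
+        >>> outs = self.forward(feats)
+        >>> outs[0][0].shape  # classification scores at stride 8
+        torch.Size([1, 1, 32, 32])
+        >>> outs[3][0].shape  # keypoint offsets at stride 8
+        torch.Size([1, 34, 32, 32])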
+ """ + + def __init__( + self, + num_keypoints: int, + in_channels: Union[int, Sequence], + num_classes: int = 1, + widen_factor: float = 1.0, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + conv_bias: Union[bool, str] = 'auto', + conv_cfg: Optional[ConfigType] = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: Optional[ConfigType] = None, + ): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + if isinstance(in_channels, int): + in_channels = int(in_channels * widen_factor) + self.in_channels = in_channels + self.num_keypoints = num_keypoints + + self._init_layers() + + def _init_layers(self): + """Initialize heads for all level feature maps.""" + self._init_cls_branch() + self._init_reg_branch() + self._init_pose_branch() + + def _init_cls_branch(self): + """Initialize classification branch for all level feature maps.""" + self.conv_cls = nn.ModuleList() + for _ in self.featmap_strides: + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + stacked_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + self.conv_cls.append(nn.Sequential(*stacked_convs)) + + # output layers + self.out_cls = nn.ModuleList() + self.out_obj = nn.ModuleList() + for _ in self.featmap_strides: + self.out_cls.append( + nn.Conv2d(self.feat_channels, self.num_classes, 1)) + + def _init_reg_branch(self): + """Initialize classification branch for all level feature maps.""" + self.conv_reg = nn.ModuleList() + for _ in self.featmap_strides: + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + stacked_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + self.conv_reg.append(nn.Sequential(*stacked_convs)) + + # output layers + self.out_bbox = nn.ModuleList() + self.out_obj = nn.ModuleList() + for _ in self.featmap_strides: + self.out_bbox.append(nn.Conv2d(self.feat_channels, 4, 1)) + self.out_obj.append(nn.Conv2d(self.feat_channels, 1, 1)) + + def _init_pose_branch(self): + self.conv_pose = nn.ModuleList() + + for _ in self.featmap_strides: + stacked_convs = [] + for i in range(self.stacked_convs * 2): + in_chn = self.in_channels if i == 0 else self.feat_channels + stacked_convs.append( + ConvModule( + in_chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + self.conv_pose.append(nn.Sequential(*stacked_convs)) + + # output layers + self.out_kpt = nn.ModuleList() + self.out_kpt_vis = nn.ModuleList() + for _ in self.featmap_strides: + self.out_kpt.append( + nn.Conv2d(self.feat_channels, self.num_keypoints * 2, 1)) + self.out_kpt_vis.append( + nn.Conv2d(self.feat_channels, self.num_keypoints, 1)) + + def init_weights(self): + """Initialize weights of the 
head.""" + # Use prior in model initialization to improve stability + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(self.out_cls, self.out_obj): + conv_cls.bias.data.fill_(bias_init) + conv_obj.bias.data.fill_(bias_init) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + cls_scores (List[Tensor]): Classification scores for each level. + objectnesses (List[Tensor]): Objectness scores for each level. + bbox_preds (List[Tensor]): Bounding box predictions for each level. + kpt_offsets (List[Tensor]): Keypoint offsets for each level. + kpt_vis (List[Tensor]): Keypoint visibilities for each level. + """ + + cls_scores, bbox_preds, objectnesses = [], [], [] + kpt_offsets, kpt_vis = [], [] + + for i in range(len(x)): + + cls_feat = self.conv_cls[i](x[i]) + reg_feat = self.conv_reg[i](x[i]) + pose_feat = self.conv_pose[i](x[i]) + + cls_scores.append(self.out_cls[i](cls_feat)) + objectnesses.append(self.out_obj[i](reg_feat)) + bbox_preds.append(self.out_bbox[i](reg_feat)) + kpt_offsets.append(self.out_kpt[i](pose_feat)) + kpt_vis.append(self.out_kpt_vis[i](pose_feat)) + + return cls_scores, objectnesses, bbox_preds, kpt_offsets, kpt_vis + + +@MODELS.register_module() +class YOLOXPoseHead(BaseModule): + + def __init__( + self, + num_keypoints: int, + head_module_cfg: Optional[ConfigType] = None, + featmap_strides: Sequence[int] = [8, 16, 32], + num_classes: int = 1, + use_aux_loss: bool = False, + assigner: ConfigType = None, + prior_generator: ConfigType = None, + loss_cls: Optional[ConfigType] = None, + loss_obj: Optional[ConfigType] = None, + loss_bbox: Optional[ConfigType] = None, + loss_oks: Optional[ConfigType] = None, + loss_vis: Optional[ConfigType] = None, + loss_bbox_aux: Optional[ConfigType] = None, + loss_kpt_aux: Optional[ConfigType] = None, + overlaps_power: float = 1.0, + ): + super().__init__() + + self.featmap_sizes = None + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.use_aux_loss = use_aux_loss + self.num_keypoints = num_keypoints + self.overlaps_power = overlaps_power + + self.prior_generator = TASK_UTILS.build(prior_generator) + if head_module_cfg is not None: + head_module_cfg['featmap_strides'] = featmap_strides + head_module_cfg['num_keypoints'] = num_keypoints + self.head_module = YOLOXPoseHeadModule(**head_module_cfg) + self.assigner = TASK_UTILS.build(assigner) + + # build losses + self.loss_cls = MODELS.build(loss_cls) + self.loss_obj = MODELS.build(loss_obj) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_oks = MODELS.build(loss_oks) + self.loss_vis = MODELS.build(loss_vis) + if loss_bbox_aux is not None: + self.loss_bbox_aux = MODELS.build(loss_bbox_aux) + if loss_kpt_aux is not None: + self.loss_kpt_aux = MODELS.build(loss_kpt_aux) + + def forward(self, feats: Features): + assert isinstance(feats, (tuple, list)) + return self.head_module(feats) + + def loss(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + feats (Tuple[Tensor]): The multi-stage features + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + train_cfg (dict): The runtime config for training process. + Defaults to {} + + Returns: + dict: A dictionary of losses. + """ + + # 1. 
collect & reform predictions + cls_scores, objectnesses, bbox_preds, kpt_offsets, \ + kpt_vis = self.forward(feats) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + flatten_priors = torch.cat(mlvl_priors) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = self._flatten_predictions(cls_scores) + flatten_bbox_preds = self._flatten_predictions(bbox_preds) + flatten_objectness = self._flatten_predictions(objectnesses) + flatten_kpt_offsets = self._flatten_predictions(kpt_offsets) + flatten_kpt_vis = self._flatten_predictions(kpt_vis) + flatten_bbox_decoded = self.decode_bbox(flatten_bbox_preds, + flatten_priors[..., :2], + flatten_priors[..., -1]) + flatten_kpt_decoded = self.decode_kpt_reg(flatten_kpt_offsets, + flatten_priors[..., :2], + flatten_priors[..., -1]) + + # 2. generate targets + targets = self._get_targets(flatten_priors, + flatten_cls_scores.detach(), + flatten_objectness.detach(), + flatten_bbox_decoded.detach(), + flatten_kpt_decoded.detach(), + flatten_kpt_vis.detach(), + batch_data_samples) + pos_masks, cls_targets, obj_targets, obj_weights, \ + bbox_targets, bbox_aux_targets, kpt_targets, kpt_aux_targets, \ + vis_targets, vis_weights, pos_areas, pos_priors, group_indices, \ + num_fg_imgs = targets + + num_pos = torch.tensor( + sum(num_fg_imgs), + dtype=torch.float, + device=flatten_cls_scores.device) + num_total_samples = max(reduce_mean(num_pos), 1.0) + + # 3. calculate loss + # 3.1 objectness loss + losses = dict() + + obj_preds = flatten_objectness.view(-1, 1) + losses['loss_obj'] = self.loss_obj(obj_preds, obj_targets, + obj_weights) / num_total_samples + + if num_pos > 0: + # 3.2 bbox loss + bbox_preds = flatten_bbox_decoded.view(-1, 4)[pos_masks] + losses['loss_bbox'] = self.loss_bbox( + bbox_preds, bbox_targets) / num_total_samples + + # 3.3 keypoint loss + kpt_preds = flatten_kpt_decoded.view(-1, self.num_keypoints, + 2)[pos_masks] + losses['loss_kpt'] = self.loss_oks(kpt_preds, kpt_targets, + vis_targets, pos_areas) + + # 3.4 keypoint visibility loss + kpt_vis_preds = flatten_kpt_vis.view(-1, + self.num_keypoints)[pos_masks] + losses['loss_vis'] = self.loss_vis(kpt_vis_preds, vis_targets, + vis_weights) + + # 3.5 classification loss + cls_preds = flatten_cls_scores.view(-1, + self.num_classes)[pos_masks] + cls_targets = cls_targets.pow(self.overlaps_power).detach() + losses['loss_cls'] = self.loss_cls(cls_preds, + cls_targets) / num_total_samples + + if self.use_aux_loss: + if hasattr(self, 'loss_bbox_aux'): + # 3.6 auxiliary bbox regression loss + bbox_preds_raw = flatten_bbox_preds.view(-1, 4)[pos_masks] + losses['loss_bbox_aux'] = self.loss_bbox_aux( + bbox_preds_raw, bbox_aux_targets) / num_total_samples + + if hasattr(self, 'loss_kpt_aux'): + # 3.7 auxiliary keypoint regression loss + kpt_preds_raw = flatten_kpt_offsets.view( + -1, self.num_keypoints, 2)[pos_masks] + kpt_weights = vis_targets / vis_targets.size(-1) + losses['loss_kpt_aux'] = self.loss_kpt_aux( + kpt_preds_raw, kpt_aux_targets, kpt_weights) + + return losses + + @torch.no_grad() + def _get_targets( + self, + priors: Tensor, + batch_cls_scores: Tensor, + batch_objectness: Tensor, + batch_decoded_bboxes: Tensor, + batch_decoded_kpts: Tensor, + batch_kpt_vis: Tensor, + batch_data_samples: SampleList, + ): + num_imgs = len(batch_data_samples) + + # use clip to avoid nan + batch_cls_scores = 
+            batch_cls_scores.clip(min=-1e4, max=1e4).sigmoid()
+        batch_objectness = batch_objectness.clip(min=-1e4, max=1e4).sigmoid()
+        batch_kpt_vis = batch_kpt_vis.clip(min=-1e4, max=1e4).sigmoid()
+        batch_cls_scores[torch.isnan(batch_cls_scores)] = 0
+        batch_objectness[torch.isnan(batch_objectness)] = 0
+
+        targets_each = []
+        for i in range(num_imgs):
+            target = self._get_targets_single(priors, batch_cls_scores[i],
+                                              batch_objectness[i],
+                                              batch_decoded_bboxes[i],
+                                              batch_decoded_kpts[i],
+                                              batch_kpt_vis[i],
+                                              batch_data_samples[i])
+            targets_each.append(target)
+
+        targets = list(zip(*targets_each))
+        for i, target in enumerate(targets):
+            if torch.is_tensor(target[0]):
+                target = tuple(filter(lambda x: x.size(0) > 0, target))
+                targets[i] = torch.cat(target)
+
+        foreground_masks, cls_targets, obj_targets, obj_weights, \
+            bbox_targets, kpt_targets, vis_targets, vis_weights, pos_areas, \
+            pos_priors, group_indices, num_pos_per_img = targets
+
+        # post-processing for targets
+        if self.use_aux_loss:
+            bbox_cxcy = (bbox_targets[:, :2] + bbox_targets[:, 2:]) / 2.0
+            bbox_wh = bbox_targets[:, 2:] - bbox_targets[:, :2]
+            bbox_aux_targets = torch.cat([
+                (bbox_cxcy - pos_priors[:, :2]) / pos_priors[:, 2:],
+                torch.log(bbox_wh / pos_priors[:, 2:] + 1e-8)
+            ],
+                                         dim=-1)
+
+            kpt_aux_targets = (kpt_targets - pos_priors[:, None, :2]) \
+                / pos_priors[:, None, 2:]
+        else:
+            bbox_aux_targets, kpt_aux_targets = None, None
+
+        return (foreground_masks, cls_targets, obj_targets, obj_weights,
+                bbox_targets, bbox_aux_targets, kpt_targets, kpt_aux_targets,
+                vis_targets, vis_weights, pos_areas, pos_priors, group_indices,
+                num_pos_per_img)
+
+    @torch.no_grad()
+    def _get_targets_single(
+        self,
+        priors: Tensor,
+        cls_scores: Tensor,
+        objectness: Tensor,
+        decoded_bboxes: Tensor,
+        decoded_kpts: Tensor,
+        kpt_vis: Tensor,
+        data_sample: PoseDataSample,
+    ) -> tuple:
+        """Compute classification, bbox, keypoints and objectness targets for
+        priors in a single image.
+
+        Args:
+            priors (Tensor): All priors of one image, a 2D-Tensor with shape
+                [num_priors, 4] in [cx, cy, stride_w, stride_h] format.
+            cls_scores (Tensor): Classification predictions of one image,
+                a 2D-Tensor with shape [num_priors, num_classes].
+            objectness (Tensor): Objectness predictions of one image,
+                a 1D-Tensor with shape [num_priors].
+            decoded_bboxes (Tensor): Decoded bboxes predictions of one image,
+                a 2D-Tensor with shape [num_priors, 4] in xyxy format.
+            decoded_kpts (Tensor): Decoded keypoints predictions of one image,
+                a 3D-Tensor with shape [num_priors, num_keypoints, 2].
+            kpt_vis (Tensor): Keypoints visibility predictions of one image,
+                a 2D-Tensor with shape [num_priors, num_keypoints].
+            data_sample (PoseDataSample): Data sample that contains the
+                ground truth annotations for the current image. Its
+                ``gt_instance_labels`` field should include ``bboxes``,
+                ``labels``, ``keypoints``, ``keypoints_visible`` and
+                ``areas`` attributes.
+
+        Returns:
+            # TODO: modify the description of returned values
+            tuple:
+                foreground_mask (list[Tensor]): Binary mask of foreground
+                targets.
+                cls_target (list[Tensor]): Classification targets of an image.
+                obj_target (list[Tensor]): Objectness targets of an image.
+                bbox_target (list[Tensor]): BBox targets of an image.
+                bbox_aux_target (int): BBox aux targets of an image.
+                num_pos_per_img (int): Number of positive samples in an image.
+ """ + # TODO: change the shape of objectness to [num_priors] + num_priors = priors.size(0) + gt_instances = data_sample.gt_instance_labels + num_gts = len(gt_instances) + + # No target + if num_gts == 0: + cls_target = cls_scores.new_zeros((0, self.num_classes)) + bbox_target = cls_scores.new_zeros((0, 4)) + obj_target = cls_scores.new_zeros((num_priors, 1)) + obj_weight = cls_scores.new_ones((num_priors, 1)) + kpt_target = cls_scores.new_zeros((0, self.num_keypoints, 2)) + vis_target = cls_scores.new_zeros((0, self.num_keypoints)) + vis_weight = cls_scores.new_zeros((0, self.num_keypoints)) + pos_areas = cls_scores.new_zeros((0, )) + pos_priors = priors[:0] + foreground_mask = cls_scores.new_zeros(num_priors).bool() + return (foreground_mask, cls_target, obj_target, obj_weight, + bbox_target, kpt_target, vis_target, vis_weight, pos_areas, + pos_priors, [], 0) + + # assign positive samples + scores = cls_scores * objectness + pred_instances = InstanceData( + bboxes=decoded_bboxes, + scores=scores.sqrt_(), + priors=priors, + keypoints=decoded_kpts, + keypoints_visible=kpt_vis, + ) + assign_result = self.assigner.assign( + pred_instances=pred_instances, gt_instances=gt_instances) + + # sampling + pos_inds = torch.nonzero( + assign_result['gt_inds'] > 0, as_tuple=False).squeeze(-1).unique() + num_pos_per_img = pos_inds.size(0) + pos_gt_labels = assign_result['labels'][pos_inds] + pos_assigned_gt_inds = assign_result['gt_inds'][pos_inds] - 1 + + # bbox target + bbox_target = gt_instances.bboxes[pos_assigned_gt_inds.long()] + + # cls target + max_overlaps = assign_result['max_overlaps'][pos_inds] + cls_target = F.one_hot(pos_gt_labels, + self.num_classes) * max_overlaps.unsqueeze(-1) + + # pose targets + kpt_target = gt_instances.keypoints[pos_assigned_gt_inds] + vis_target = gt_instances.keypoints_visible[pos_assigned_gt_inds] + if 'keypoints_visible_weights' in gt_instances: + vis_weight = gt_instances.keypoints_visible_weights[ + pos_assigned_gt_inds] + else: + vis_weight = vis_target.new_ones(vis_target.shape) + pos_areas = gt_instances.areas[pos_assigned_gt_inds] + + # obj target + obj_target = torch.zeros_like(objectness) + obj_target[pos_inds] = 1 + obj_weight = obj_target.new_ones(obj_target.shape) + + # misc + foreground_mask = torch.zeros_like(objectness.squeeze()).to(torch.bool) + foreground_mask[pos_inds] = 1 + pos_priors = priors[pos_inds] + group_index = [ + torch.where(pos_assigned_gt_inds == num)[0] + for num in torch.unique(pos_assigned_gt_inds) + ] + + return (foreground_mask, cls_target, obj_target, obj_weight, + bbox_target, kpt_target, vis_target, vis_weight, pos_areas, + pos_priors, group_index, num_pos_per_img) + + def predict(self, + feats: Features, + batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from features. + + Args: + feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage + features (or multiple multi-scale features in TTA) + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + test_cfg (dict): The runtime config for testing process. Defaults + to {} + + Returns: + Union[InstanceList | Tuple[InstanceList | PixelDataList]]: If + ``test_cfg['output_heatmap']==True``, return both pose and heatmap + prediction; otherwise only return the pose prediction. 
+ + The pose prediction is a list of ``InstanceData``, each contains + the following fields: + + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + + The heatmap prediction is a list of ``PixelData``, each contains + the following fields: + + - heatmaps (Tensor): The predicted heatmaps in shape (1, h, w) + or (K+1, h, w) if keypoint heatmaps are predicted + - displacements (Tensor): The predicted displacement fields + in shape (K*2, h, w) + """ + + cls_scores, objectnesses, bbox_preds, kpt_offsets, \ + kpt_vis = self.forward(feats) + + cfg = copy.deepcopy(test_cfg) + + batch_img_metas = [d.metainfo for d in batch_data_samples] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full((featmap_size.numel(), ), + stride) for featmap_size, stride in zip( + featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = self._flatten_predictions(cls_scores).sigmoid() + flatten_bbox_preds = self._flatten_predictions(bbox_preds) + flatten_objectness = self._flatten_predictions(objectnesses).sigmoid() + flatten_kpt_offsets = self._flatten_predictions(kpt_offsets) + flatten_kpt_vis = self._flatten_predictions(kpt_vis).sigmoid() + flatten_bbox_preds = self.decode_bbox(flatten_bbox_preds, + flatten_priors, flatten_stride) + flatten_kpt_reg = self.decode_kpt_reg(flatten_kpt_offsets, + flatten_priors, flatten_stride) + + results_list = [] + for (bboxes, scores, objectness, kpt_reg, kpt_vis, + img_meta) in zip(flatten_bbox_preds, flatten_cls_scores, + flatten_objectness, flatten_kpt_reg, + flatten_kpt_vis, batch_img_metas): + + score_thr = cfg.get('score_thr', 0.01) + scores *= objectness + + nms_pre = cfg.get('nms_pre', 100000) + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs_score, results = filter_scores_and_topk( + scores, score_thr, nms_pre, results=dict(labels=labels[:, 0])) + labels = results['labels'] + + bboxes = bboxes[keep_idxs_score] + kpt_vis = kpt_vis[keep_idxs_score] + stride = flatten_stride[keep_idxs_score] + keypoints = kpt_reg[keep_idxs_score] + + if bboxes.numel() > 0: + nms_thr = cfg.get('nms_thr', 1.0) + if nms_thr < 1.0: + keep_idxs_nms = nms_torch(bboxes, scores, nms_thr) + bboxes = bboxes[keep_idxs_nms] + stride = stride[keep_idxs_nms] + labels = labels[keep_idxs_nms] + kpt_vis = kpt_vis[keep_idxs_nms] + keypoints = keypoints[keep_idxs_nms] + scores = scores[keep_idxs_nms] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes, + bbox_scores=scores, + keypoints=keypoints, + keypoint_scores=kpt_vis, + keypoints_visible=kpt_vis) + + input_size = img_meta['input_size'] + results.bboxes[:, 0::2].clamp_(0, input_size[0]) + results.bboxes[:, 1::2].clamp_(0, input_size[1]) + + results_list.append(results.numpy()) + + return results_list + + def decode_bbox(self, pred_bboxes: torch.Tensor, priors: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode 
regression results (delta_x, delta_y, log_w, log_h) to + bounding boxes (tl_x, tl_y, br_x, br_y). + + Note: + - batch size: B + - token number: N + + Args: + pred_bboxes (torch.Tensor): Encoded boxes with shape (B, N, 4), + representing (delta_x, delta_y, log_w, log_h) for each box. + priors (torch.Tensor): Anchors coordinates, with shape (N, 2). + stride (torch.Tensor | int): Strides of the bboxes. It can be a + single value if the same stride applies to all boxes, or it + can be a tensor of shape (N, ) if different strides are used + for each box. + + Returns: + torch.Tensor: Decoded bounding boxes with shape (N, 4), + representing (tl_x, tl_y, br_x, br_y) for each box. + """ + stride = stride.view(1, stride.size(0), 1) + priors = priors.view(1, priors.size(0), 2) + + xys = (pred_bboxes[..., :2] * stride) + priors + whs = pred_bboxes[..., 2:].exp() * stride + + # Calculate bounding box corners + tl_x = xys[..., 0] - whs[..., 0] / 2 + tl_y = xys[..., 1] - whs[..., 1] / 2 + br_x = xys[..., 0] + whs[..., 0] / 2 + br_y = xys[..., 1] + whs[..., 1] / 2 + + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes + + def decode_kpt_reg(self, pred_kpt_offsets: torch.Tensor, + priors: torch.Tensor, + stride: torch.Tensor) -> torch.Tensor: + """Decode regression results (delta_x, delta_y) to keypoints + coordinates (x, y). + + Args: + pred_kpt_offsets (torch.Tensor): Encoded keypoints offsets with + shape (batch_size, num_anchors, num_keypoints, 2). + priors (torch.Tensor): Anchors coordinates with shape + (num_anchors, 2). + stride (torch.Tensor): Strides of the anchors. + + Returns: + torch.Tensor: Decoded keypoints coordinates with shape + (batch_size, num_boxes, num_keypoints, 2). + """ + stride = stride.view(1, stride.size(0), 1, 1) + priors = priors.view(1, priors.size(0), 1, 2) + pred_kpt_offsets = pred_kpt_offsets.reshape( + *pred_kpt_offsets.shape[:-1], self.num_keypoints, 2) + + decoded_kpts = pred_kpt_offsets * stride + priors + return decoded_kpts + + def _flatten_predictions(self, preds: List[Tensor]): + """Flattens the predictions from a list of tensors to a single + tensor.""" + preds = [x.permute(0, 2, 3, 1).flatten(1, 2) for x in preds] + return torch.cat(preds, dim=1) diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py index 57ba98fe46..92ed569bab 100644 --- a/mmpose/models/losses/__init__.py +++ b/mmpose/models/losses/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
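# Illustrative sketch (not part of the patch): the box decoding rule used by
# `decode_bbox` above, with concrete numbers. The rule is
# center = offset * stride + prior and size = exp(pred) * stride, so an
# all-zero prediction decodes to a stride-sized box centred on its prior.
import torch

priors = torch.tensor([[8.0, 8.0]])   # one prior point (x, y)
stride = torch.tensor([8.0])          # its stride
pred = torch.zeros(1, 1, 4)           # (B=1, N=1, 4) raw prediction

xys = pred[..., :2] * stride.view(1, -1, 1) + priors.view(1, -1, 2)
whs = pred[..., 2:].exp() * stride.view(1, -1, 1)
box = torch.cat([xys - whs / 2, xys + whs / 2], dim=-1)
print(box)  # tensor([[[ 4.,  4., 12., 12.]]])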
 from .ae_loss import AssociativeEmbeddingLoss
+from .bbox_loss import IoULoss
 from .classification_loss import BCELoss, JSDiscretLoss, KLDiscretLoss
 from .fea_dis_loss import FeaLoss
 from .heatmap_loss import (AdaptiveWingLoss, KeypointMSELoss,
@@ -7,8 +8,8 @@
 from .logit_dis_loss import KDLoss
 from .loss_wrappers import CombinedLoss, MultipleLossWrapper
 from .regression_loss import (BoneLoss, L1Loss, MPJPELoss,
-                              MPJPEVelocityJointLoss, MSELoss, RLELoss,
-                              SemiSupervisionLoss, SmoothL1Loss,
+                              MPJPEVelocityJointLoss, MSELoss, OKSLoss,
+                              RLELoss, SemiSupervisionLoss, SmoothL1Loss,
                               SoftWeightSmoothL1Loss, SoftWingLoss, WingLoss)

 __all__ = [
@@ -17,5 +18,5 @@
     'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss',
     'KLDiscretLoss', 'MultipleLossWrapper', 'JSDiscretLoss', 'CombinedLoss',
     'AssociativeEmbeddingLoss', 'SoftWeightSmoothL1Loss',
-    'MPJPEVelocityJointLoss', 'FeaLoss', 'KDLoss'
+    'MPJPEVelocityJointLoss', 'FeaLoss', 'KDLoss', 'OKSLoss', 'IoULoss'
 ]
diff --git a/mmpose/models/losses/bbox_loss.py b/mmpose/models/losses/bbox_loss.py
new file mode 100644
index 0000000000..b216dcdb4a
--- /dev/null
+++ b/mmpose/models/losses/bbox_loss.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+
+from mmpose.registry import MODELS
+from mmpose.structures.bbox import bbox_overlaps
+
+
+@MODELS.register_module()
+class IoULoss(nn.Module):
+    """IoU loss for bounding box regression.
+
+    Args:
+        reduction (str): Options are "none", "mean" and "sum".
+        eps (float): Epsilon to avoid log(0).
+        loss_weight (float): Weight of the loss. Default: 1.0.
+        mode (str): Loss scaling mode, including "linear", "square", and "log".
+            Default: 'log'
+    """
+
+    def __init__(self,
+                 reduction='mean',
+                 mode='log',
+                 eps: float = 1e-16,
+                 loss_weight=1.):
+        super().__init__()
+
+        assert reduction in ('mean', 'sum', 'none'), f'the argument ' \
+            f'`reduction` should be either \'mean\', \'sum\' or \'none\', ' \
+            f'but got {reduction}'
+
+        assert mode in ('linear', 'square', 'log'), f'the argument ' \
+            f'`mode` should be either \'linear\', \'square\' or ' \
+            f'\'log\', but got {mode}'
+
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.mode = mode
+        self.eps = eps
+
+    def forward(self, output, target):
+        """Forward function.
+
+        Note:
+            - batch_size: N
+
+        Args:
+            output (torch.Tensor[N, 4]): Predicted bboxes in (x1, y1, x2, y2)
+                format.
+            target (torch.Tensor[N, 4]): Target bboxes in the same format.
+        """
+        ious = bbox_overlaps(
+            output, target, is_aligned=True).clamp(min=self.eps)
+
+        if self.mode == 'linear':
+            loss = 1 - ious
+        elif self.mode == 'square':
+            loss = 1 - ious.pow(2)
+        elif self.mode == 'log':
+            loss = -ious.log()
+        else:
+            raise NotImplementedError
+
+        if self.reduction == 'sum':
+            loss = loss.sum()
+        elif self.reduction == 'mean':
+            loss = loss.mean()
+
+        return loss * self.loss_weight
diff --git a/mmpose/models/losses/classification_loss.py b/mmpose/models/losses/classification_loss.py
index 5d2a2c7a58..2421e74819 100644
--- a/mmpose/models/losses/classification_loss.py
+++ b/mmpose/models/losses/classification_loss.py
@@ -1,4 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from functools import partial
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -13,6 +15,7 @@ class BCELoss(nn.Module):

     Args:
         use_target_weight (bool): Option to use weighted loss.
Different joint types may have different target weights. + reduction (str): Options are "none", "mean" and "sum". loss_weight (float): Weight of the loss. Default: 1.0. use_sigmoid (bool, optional): Whether the prediction uses sigmoid before output. Defaults to False. @@ -21,11 +24,19 @@ class BCELoss(nn.Module): def __init__(self, use_target_weight=False, loss_weight=1., + reduction='mean', use_sigmoid=False): super().__init__() + + assert reduction in ('mean', 'sum', 'none'), f'the argument ' \ + f'`reduction` should be either \'mean\', \'sum\' or \'none\', ' \ + f'but got {reduction}' + + self.reduction = reduction self.use_sigmoid = use_sigmoid - self.criterion = F.binary_cross_entropy if use_sigmoid \ + criterion = F.binary_cross_entropy if use_sigmoid \ else F.binary_cross_entropy_with_logits + self.criterion = partial(criterion, reduction='none') self.use_target_weight = use_target_weight self.loss_weight = loss_weight @@ -45,13 +56,18 @@ def forward(self, output, target, target_weight=None): if self.use_target_weight: assert target_weight is not None - loss = self.criterion(output, target, reduction='none') + loss = self.criterion(output, target) if target_weight.dim() == 1: target_weight = target_weight[:, None] - loss = (loss * target_weight).mean() + loss = (loss * target_weight) else: loss = self.criterion(output, target) + if self.reduction == 'sum': + loss = loss.sum() + elif self.reduction == 'mean': + loss = loss.mean() + return loss * self.loss_weight diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py index b50ad99f04..948d65bae7 100644 --- a/mmpose/models/losses/regression_loss.py +++ b/mmpose/models/losses/regression_loss.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. import math from functools import partial +from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F +from mmpose.datasets.datasets.utils import parse_pose_metainfo from mmpose.registry import MODELS from ..utils.realnvp import RealNVP @@ -485,11 +487,19 @@ def forward(self, output, target, target_weight=None): @MODELS.register_module() class L1Loss(nn.Module): - """L1Loss loss .""" + """L1Loss loss.""" - def __init__(self, use_target_weight=False, loss_weight=1.): + def __init__(self, + reduction='mean', + use_target_weight=False, + loss_weight=1.): super().__init__() - self.criterion = F.l1_loss + + assert reduction in ('mean', 'sum', 'none'), f'the argument ' \ + f'`reduction` should be either \'mean\', \'sum\' or \'none\', ' \ + f'but got {reduction}' + + self.criterion = partial(F.l1_loss, reduction=reduction) self.use_target_weight = use_target_weight self.loss_weight = loss_weight @@ -508,6 +518,8 @@ def forward(self, output, target, target_weight=None): """ if self.use_target_weight: assert target_weight is not None + for _ in range(target.ndim - target_weight.ndim): + target_weight = target_weight.unsqueeze(-1) loss = self.criterion(output * target_weight, target * target_weight) else: @@ -694,3 +706,108 @@ def forward(self, output, target): losses['bone_loss'] = loss_bone return losses + + +@MODELS.register_module() +class OKSLoss(nn.Module): + """A PyTorch implementation of the Object Keypoint Similarity (OKS) loss as + described in the paper "YOLO-Pose: Enhancing YOLO for Multi Person Pose + Estimation Using Object Keypoint Similarity Loss" by Debapriya et al. + (2022). 
+
+    The OKS loss is used for keypoint-based object recognition and consists
+    of a measure of the similarity between predicted and ground truth
+    keypoint locations, adjusted by the size of the object in the image.
+
+    The loss function takes as input the predicted keypoint locations, the
+    ground truth keypoint locations, a mask indicating which keypoints are
+    valid, and bounding boxes for the objects.
+
+    Args:
+        metainfo (Optional[str]): Path to a dataset meta information file,
+            e.g. a config under ``configs/_base_/datasets``.
+        reduction (str): Options are "none", "mean" and "sum".
+        eps (float): Epsilon to avoid log(0).
+        loss_weight (float): Weight of the loss. Default: 1.0.
+        mode (str): Loss scaling mode, including "linear", "square", and "log".
+            Default: 'linear'
+        norm_target_weight (bool): whether to normalize the target weight
+            with number of visible keypoints. Defaults to False.
+    """
+
+    def __init__(self,
+                 metainfo: Optional[str] = None,
+                 reduction='mean',
+                 mode='linear',
+                 eps=1e-8,
+                 norm_target_weight=False,
+                 loss_weight=1.):
+        super().__init__()
+
+        assert reduction in ('mean', 'sum', 'none'), f'the argument ' \
+            f'`reduction` should be either \'mean\', \'sum\' or \'none\', ' \
+            f'but got {reduction}'
+
+        assert mode in ('linear', 'square', 'log'), f'the argument ' \
+            f'`mode` should be either \'linear\', \'square\' or ' \
+            f'\'log\', but got {mode}'
+
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.mode = mode
+        self.norm_target_weight = norm_target_weight
+        self.eps = eps
+
+        if metainfo is not None:
+            metainfo = parse_pose_metainfo(dict(from_file=metainfo))
+            sigmas = metainfo.get('sigmas', None)
+            if sigmas is not None:
+                self.register_buffer('sigmas', torch.as_tensor(sigmas))
+
+    def forward(self, output, target, target_weight=None, areas=None):
+        """Forward function.
+
+        Note:
+            - batch_size: N
+            - num_labels: K
+
+        Args:
+            output (torch.Tensor[N, K, 2]): Output keypoints coordinates.
+            target (torch.Tensor[N, K, 2]): Target keypoints coordinates.
+            target_weight (torch.Tensor[N, K]): Loss weight for each keypoint.
+            areas (torch.Tensor[N]): Instance size which is adopted as
+                normalization factor.
+        """
+        dist = torch.norm(output - target, dim=-1)
+        if areas is not None:
+            dist = dist / areas.pow(0.5).clip(min=self.eps).unsqueeze(-1)
+        if hasattr(self, 'sigmas'):
+            sigmas = self.sigmas.reshape(*((1, ) * (dist.ndim - 1)), -1)
+            dist = dist / (sigmas * 2)
+
+        oks = torch.exp(-dist.pow(2) / 2)
+
+        if target_weight is not None:
+            if self.norm_target_weight:
+                target_weight = target_weight / target_weight.sum(
+                    dim=-1, keepdims=True).clip(min=self.eps)
+            else:
+                target_weight = target_weight / target_weight.size(-1)
+            oks = oks * target_weight
+        oks = oks.sum(dim=-1)
+
+        if self.mode == 'linear':
+            loss = 1 - oks
+        elif self.mode == 'square':
+            loss = 1 - oks.pow(2)
+        elif self.mode == 'log':
+            loss = -oks.log()
+        else:
+            raise NotImplementedError()
+
+        if self.reduction == 'sum':
+            loss = loss.sum()
+        elif self.reduction == 'mean':
+            loss = loss.mean()
+
+        return loss * self.loss_weight
diff --git a/mmpose/models/necks/__init__.py b/mmpose/models/necks/__init__.py
index b4f9105cb3..c9d14cefc8 100644
--- a/mmpose/models/necks/__init__.py
+++ b/mmpose/models/necks/__init__.py
@@ -1,9 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
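# Illustrative sketch (not part of the patch): the OKS computation performed
# by `OKSLoss.forward` above, for the case where no dataset `sigmas` buffer
# is registered (distances are then normalised by sqrt(area) only). The
# values below are hypothetical.
import torch

output = torch.tensor([[[1.0, 1.0], [4.0, 4.0]]])  # (N=1, K=2, 2) predictions
target = torch.tensor([[[1.0, 1.0], [1.0, 4.0]]])  # ground-truth keypoints
weight = torch.tensor([[1.0, 1.0]])                # both keypoints annotated
areas = torch.tensor([9.0])                        # instance area

dist = torch.norm(output - target, dim=-1)         # tensor([[0., 3.]])
dist = dist / areas.pow(0.5).clip(min=1e-8).unsqueeze(-1)
oks = torch.exp(-dist.pow(2) / 2)                  # per-keypoint similarity
oks = (oks * (weight / weight.size(-1))).sum(dim=-1)
print(1 - oks)  # linear-mode loss, tensor([0.1967])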
+from .channel_mapper import ChannelMapper from .fmap_proc_neck import FeatureMapProcessor from .fpn import FPN from .gap_neck import GlobalAveragePooling from .posewarper_neck import PoseWarperNeck +from .yolox_pafpn import YOLOXPAFPN __all__ = [ - 'GlobalAveragePooling', 'PoseWarperNeck', 'FPN', 'FeatureMapProcessor' + 'GlobalAveragePooling', 'PoseWarperNeck', 'FPN', 'FeatureMapProcessor', + 'ChannelMapper', 'YOLOXPAFPN' ] diff --git a/mmpose/models/necks/channel_mapper.py b/mmpose/models/necks/channel_mapper.py new file mode 100644 index 0000000000..246ed363d8 --- /dev/null +++ b/mmpose/models/necks/channel_mapper.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmpose.registry import MODELS +from mmpose.utils.typing import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class ChannelMapper(BaseModule): + """Channel Mapper to reduce/increase channels of backbone features. + + This is used to reduce/increase channels of backbone features. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + kernel_size (int, optional): kernel_size for reducing channels (used + at each scale). Default: 3. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Default: None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Default: None. + act_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + activation layer in ConvModule. Default: dict(type='ReLU'). + num_outs (int, optional): Number of output feature maps. There would + be extra_convs when num_outs larger than the length of in_channels. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or dict], + optional): Initialization config dict. + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = ChannelMapper(in_channels, 11, 3).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + kernel_size: int = 3, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + act_cfg: OptConfigType = dict(type='ReLU'), + num_outs: int = None, + init_cfg: OptMultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, list) + self.extra_convs = None + if num_outs is None: + num_outs = len(in_channels) + self.convs = nn.ModuleList() + for in_channel in in_channels: + self.convs.append( + ConvModule( + in_channel, + out_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + if num_outs > len(in_channels): + self.extra_convs = nn.ModuleList() + for i in range(len(in_channels), num_outs): + if i == len(in_channels): + in_channel = in_channels[-1] + else: + in_channel = out_channels + self.extra_convs.append( + ConvModule( + in_channel, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function.""" + assert len(inputs) == len(self.convs) + outs = [self.convs[i](inputs[i]) for i in range(len(inputs))] + if self.extra_convs: + for i in range(len(self.extra_convs)): + if i == 0: + outs.append(self.extra_convs[0](inputs[-1])) + else: + outs.append(self.extra_convs[i](outs[-1])) + return tuple(outs) diff --git a/mmpose/models/necks/yolox_pafpn.py b/mmpose/models/necks/yolox_pafpn.py new file mode 100644 index 0000000000..adc4cfffa3 --- /dev/null +++ b/mmpose/models/necks/yolox_pafpn.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule + +from mmpose.registry import MODELS +from ..utils import CSPLayer + + +@MODELS.register_module() +class YOLOXPAFPN(BaseModule): + """Path Aggregation Network used in YOLOX. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3 + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. Default: False + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(scale_factor=2, mode='nearest')` + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish') + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ """ + + def __init__(self, + in_channels, + out_channels, + num_csp_blocks=3, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu')): + super(YOLOXPAFPN, self).__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + # build top-down blocks + self.upsample = nn.Upsample(**upsample_cfg) + self.reduce_layers = nn.ModuleList() + self.top_down_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.reduce_layers.append( + ConvModule( + in_channels[idx], + in_channels[idx - 1], + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.top_down_blocks.append( + CSPLayer( + in_channels[idx - 1] * 2, + in_channels[idx - 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # build bottom-up blocks + self.downsamples = nn.ModuleList() + self.bottom_up_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv( + in_channels[idx], + in_channels[idx], + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottom_up_blocks.append( + CSPLayer( + in_channels[idx] * 2, + in_channels[idx + 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.out_convs = nn.ModuleList() + for i in range(len(in_channels)): + self.out_convs.append( + ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + + Returns: + tuple[Tensor]: YOLOXPAFPN features. 
+ """ + assert len(inputs) == len(self.in_channels) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + feat_heigh = self.reduce_layers[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = self.upsample(feat_heigh) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + torch.cat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx]( + torch.cat([downsample_feat, feat_height], 1)) + outs.append(out) + + # out convs + for idx, conv in enumerate(self.out_convs): + outs[idx] = conv(outs[idx]) + + return tuple(outs) diff --git a/mmpose/models/pose_estimators/base.py b/mmpose/models/pose_estimators/base.py index 0ae921d0ec..e98b2caeb8 100644 --- a/mmpose/models/pose_estimators/base.py +++ b/mmpose/models/pose_estimators/base.py @@ -3,6 +3,8 @@ from typing import Tuple, Union import torch +from mmengine.dist import get_world_size +from mmengine.logging import print_log from mmengine.model import BaseModel from torch import Tensor @@ -22,6 +24,7 @@ class BasePoseEstimator(BaseModel, metaclass=ABCMeta): config of :class:`BaseDataPreprocessor`. Defaults to ``None`` init_cfg (dict | ConfigDict): The model initialization config. Defaults to ``None`` + use_syncbn (bool): whether to use SyncBatchNorm. Defaults to False. metainfo (dict): Meta information for dataset, such as keypoints definition and properties. If set, the metainfo of the input data batch will be overridden. For more details, please refer to @@ -38,11 +41,14 @@ def __init__(self, train_cfg: OptConfigType = None, test_cfg: OptConfigType = None, data_preprocessor: OptConfigType = None, + use_syncbn: bool = False, init_cfg: OptMultiConfig = None, metainfo: Optional[dict] = None): super().__init__( data_preprocessor=data_preprocessor, init_cfg=init_cfg) self.metainfo = self._load_metainfo(metainfo) + self.train_cfg = train_cfg if train_cfg else {} + self.test_cfg = test_cfg if test_cfg else {} self.backbone = MODELS.build(backbone) @@ -57,13 +63,16 @@ def __init__(self, if head is not None: self.head = MODELS.build(head) - - self.train_cfg = train_cfg if train_cfg else {} - self.test_cfg = test_cfg if test_cfg else {} + self.head.test_cfg = self.test_cfg.copy() # Register the hook to automatically convert old version state dicts self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook) + # TODO: Waiting for mmengine support + if use_syncbn and get_world_size() > 1: + torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) + print_log('Using SyncBatchNorm()', 'current') + @property def with_neck(self) -> bool: """bool: whether the pose estimator has a neck.""" diff --git a/mmpose/models/pose_estimators/bottomup.py b/mmpose/models/pose_estimators/bottomup.py index e7d2aaef88..7b82980a13 100644 --- a/mmpose/models/pose_estimators/bottomup.py +++ b/mmpose/models/pose_estimators/bottomup.py @@ -23,6 +23,7 @@ class BottomupPoseEstimator(BasePoseEstimator): Defaults to ``None`` test_cfg (dict, optional): The runtime config for testing process. Defaults to ``None`` + use_syncbn (bool): whether to use SyncBatchNorm. Defaults to False. 
data_preprocessor (dict, optional): The data preprocessing config to build the instance of :class:`BaseDataPreprocessor`. Defaults to ``None``. @@ -36,6 +37,7 @@ def __init__(self, head: OptConfigType = None, train_cfg: OptConfigType = None, test_cfg: OptConfigType = None, + use_syncbn: bool = False, data_preprocessor: OptConfigType = None, init_cfg: OptMultiConfig = None): super().__init__( @@ -44,6 +46,7 @@ def __init__(self, head=head, train_cfg=train_cfg, test_cfg=test_cfg, + use_syncbn=use_syncbn, data_preprocessor=data_preprocessor, init_cfg=init_cfg) @@ -162,17 +165,25 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, for pred_instances, pred_fields, data_sample in zip_longest( batch_pred_instances, batch_pred_fields, batch_data_samples): - # convert keypoint coordinates from input space to image space input_size = data_sample.metainfo['input_size'] input_center = data_sample.metainfo['input_center'] input_scale = data_sample.metainfo['input_scale'] + # convert keypoint coordinates from input space to image space pred_instances.keypoints = pred_instances.keypoints / input_size \ * input_scale + input_center - 0.5 * input_scale if 'keypoints_visible' not in pred_instances: pred_instances.keypoints_visible = \ pred_instances.keypoint_scores + # convert bbox coordinates from input space to image space + if 'bboxes' in pred_instances: + bboxes = pred_instances.bboxes.reshape( + pred_instances.bboxes.shape[0], 2, 2) + bboxes = bboxes / input_size * input_scale + input_center \ + - 0.5 * input_scale + pred_instances.bboxes = bboxes.reshape(bboxes.shape[0], 4) + data_sample.pred_instances = pred_instances if pred_fields is not None: diff --git a/mmpose/models/task_modules/__init__.py b/mmpose/models/task_modules/__init__.py new file mode 100644 index 0000000000..caecfb9d33 --- /dev/null +++ b/mmpose/models/task_modules/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigners import * # noqa +from .prior_generators import * # noqa diff --git a/mmpose/models/task_modules/assigners/__init__.py b/mmpose/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000000..7b6b006e38 --- /dev/null +++ b/mmpose/models/task_modules/assigners/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .metric_calculators import BBoxOverlaps2D, PoseOKS +from .sim_ota_assigner import SimOTAAssigner + +__all__ = ['SimOTAAssigner', 'PoseOKS', 'BBoxOverlaps2D'] diff --git a/mmpose/models/task_modules/assigners/metric_calculators.py b/mmpose/models/task_modules/assigners/metric_calculators.py new file mode 100644 index 0000000000..ebf4333b66 --- /dev/null +++ b/mmpose/models/task_modules/assigners/metric_calculators.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from torch import Tensor + +from mmpose.datasets.datasets.utils import parse_pose_metainfo +from mmpose.registry import TASK_UTILS +from mmpose.structures.bbox import bbox_overlaps + + +def cast_tensor_type(x, scale=1., dtype=None): + if dtype == 'fp16': + # scale is for preventing overflows + x = (x / scale).half() + return x + + +@TASK_UTILS.register_module() +class BBoxOverlaps2D: + """2D Overlaps (e.g. IoUs, GIoUs) Calculator.""" + + def __init__(self, scale=1., dtype=None): + self.scale = scale + self.dtype = dtype + + @torch.no_grad() + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. 
+
+        Args:
+            bboxes1 (Tensor or :obj:`BaseBoxes`): bboxes have shape (m, 4)
+                in <x1, y1, x2, y2> format, or shape (m, 5) in <x1, y1, x2, y2, score> format.
+            bboxes2 (Tensor or :obj:`BaseBoxes`): bboxes have shape (n, 4)
+                in <x1, y1, x2, y2> format, shape (n, 5) in <x1, y1, x2, y2, score> format, or be empty. If ``is_aligned`` is ``True``,
+                then m and n must be equal.
+            mode (str): "iou" (intersection over union), "iof" (intersection
+                over foreground), or "giou" (generalized intersection over
+                union).
+            is_aligned (bool, optional): If True, then m and n must be equal.
+                Default False.
+
+        Returns:
+            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+        """
+        assert bboxes1.size(-1) in [0, 4, 5]
+        assert bboxes2.size(-1) in [0, 4, 5]
+        if bboxes2.size(-1) == 5:
+            bboxes2 = bboxes2[..., :4]
+        if bboxes1.size(-1) == 5:
+            bboxes1 = bboxes1[..., :4]
+
+        if self.dtype == 'fp16':
+            # change tensor type to save cpu and cuda memory and keep speed
+            bboxes1 = cast_tensor_type(bboxes1, self.scale, self.dtype)
+            bboxes2 = cast_tensor_type(bboxes2, self.scale, self.dtype)
+            overlaps = bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+            if not overlaps.is_cuda and overlaps.dtype == torch.float16:
+                # resume cpu float32
+                overlaps = overlaps.float()
+            return overlaps
+
+        return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+
+    def __repr__(self):
+        """str: a string describing the module"""
+        repr_str = self.__class__.__name__ + f'(' \
+            f'scale={self.scale}, dtype={self.dtype})'
+        return repr_str
+
+
+@TASK_UTILS.register_module()
+class PoseOKS:
+    """OKS score Calculator."""
+
+    def __init__(self,
+                 metainfo: Optional[str] = 'configs/_base_/datasets/coco.py'):
+
+        if metainfo is not None:
+            metainfo = parse_pose_metainfo(dict(from_file=metainfo))
+            sigmas = metainfo.get('sigmas', None)
+            if sigmas is not None:
+                self.sigmas = torch.as_tensor(sigmas)
+
+    @torch.no_grad()
+    def __call__(self,
+                 output: Tensor,
+                 target: Tensor,
+                 target_weights: Tensor,
+                 areas: Tensor,
+                 eps: float = 1e-8) -> Tensor:
+
+        dist = torch.norm(output - target, dim=-1)
+        areas = areas.reshape(*((1, ) * (dist.ndim - 2)), -1, 1)
+        dist = dist / areas.pow(0.5).clip(min=eps)
+
+        if hasattr(self, 'sigmas'):
+            if self.sigmas.device != dist.device:
+                self.sigmas = self.sigmas.to(dist.device)
+            sigmas = self.sigmas.reshape(*((1, ) * (dist.ndim - 1)), -1)
+            dist = dist / (sigmas * 2)
+
+        target_weights = target_weights / target_weights.sum(
+            dim=-1, keepdims=True).clip(min=eps)
+        oks = (torch.exp(-dist.pow(2) / 2) * target_weights).sum(dim=-1)
+        return oks
diff --git a/mmpose/models/task_modules/assigners/sim_ota_assigner.py b/mmpose/models/task_modules/assigners/sim_ota_assigner.py
new file mode 100644
index 0000000000..69c7ed677e
--- /dev/null
+++ b/mmpose/models/task_modules/assigners/sim_ota_assigner.py
@@ -0,0 +1,284 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmpose.registry import TASK_UTILS
+from mmpose.utils.typing import ConfigType
+
+INF = 100000.0
+EPS = 1.0e-7
+
+
+@TASK_UTILS.register_module()
+class SimOTAAssigner:
+    """Computes matching between predictions and ground truth.
+
+    Args:
+        center_radius (float): Radius of center area to determine
+            if a prior is in the center of a gt. Defaults to 2.5.
+        candidate_topk (int): Top-k ious candidates to calculate dynamic-k.
+            Defaults to 10.
+        iou_weight (float): Weight of bbox iou cost. Defaults to 3.0.
+        cls_weight (float): Weight of classification cost. Defaults to 1.0.
+ oks_weight (float): Weight of keypoint OKS cost. Defaults to 3.0. + vis_weight (float): Weight of keypoint visibility cost. Defaults to 0.0 + dynamic_k_indicator (str): Cost type for calculating dynamic-k, + either 'iou' or 'oks'. Defaults to 'iou'. + iou_calculator (dict): Config of IoU calculation method. + Defaults to dict(type='BBoxOverlaps2D'). + oks_calculator (dict): Config of OKS calculation method. + Defaults to dict(type='PoseOKS'). + """ + + def __init__(self, + center_radius: float = 2.5, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0, + oks_weight: float = 3.0, + vis_weight: float = 0.0, + dynamic_k_indicator: str = 'iou', + iou_calculator: ConfigType = dict(type='BBoxOverlaps2D'), + oks_calculator: ConfigType = dict(type='PoseOKS')): + self.center_radius = center_radius + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + self.oks_weight = oks_weight + self.vis_weight = vis_weight + assert dynamic_k_indicator in ('iou', 'oks'), f'the argument ' \ + f'`dynamic_k_indicator` should be either \'iou\' or \'oks\', ' \ + f'but got {dynamic_k_indicator}' + self.dynamic_k_indicator = dynamic_k_indicator + + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.oks_calculator = TASK_UTILS.build(oks_calculator) + + def assign(self, pred_instances: InstanceData, gt_instances: InstanceData, + **kwargs) -> dict: + """Assign gt to priors using SimOTA. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + Returns: + dict: Assignment result containing assigned gt indices, + max iou overlaps, assigned labels, etc. 
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + gt_keypoints = gt_instances.keypoints + gt_keypoints_visible = gt_instances.keypoints_visible + gt_areas = gt_instances.areas + num_gt = gt_bboxes.size(0) + + decoded_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + priors = pred_instances.priors + keypoints = pred_instances.keypoints + keypoints_visible = pred_instances.keypoints_visible + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return dict( + num_gts=num_gt, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( + priors, gt_bboxes) + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + valid_pred_kpts = keypoints[valid_mask] + valid_pred_kpts_vis = keypoints_visible[valid_mask] + + num_valid = valid_decoded_bbox.size(0) + if num_valid == 0: + # No valid bboxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return dict( + num_gts=num_gt, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + cost_matrix = (~is_in_boxes_and_center) * INF + + # calculate iou + pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) + if self.iou_weight > 0: + iou_cost = -torch.log(pairwise_ious + EPS) + cost_matrix = cost_matrix + iou_cost * self.iou_weight + + # calculate oks + if self.oks_weight > 0 or self.dynamic_k_indicator == 'oks': + pairwise_oks = self.oks_calculator( + valid_pred_kpts.unsqueeze(1), # [num_valid, 1, k, 2] + target=gt_keypoints.unsqueeze(0), # [1, num_gt, k, 2] + target_weights=gt_keypoints_visible.unsqueeze( + 0), # [1, num_gt, k] + areas=gt_areas.unsqueeze(0), # [1, num_gt] + ) # -> [num_valid, num_gt] + + oks_cost = -torch.log(pairwise_oks + EPS) + cost_matrix = cost_matrix + oks_cost * self.oks_weight + + # calculate cls + if self.cls_weight > 0: + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat( + 1, num_gt, 1) + # disable AMP autocast to avoid overflow + with torch.cuda.amp.autocast(enabled=False): + cls_cost = ( + F.binary_cross_entropy( + valid_pred_scores.to(dtype=torch.float32), + gt_onehot_label, + reduction='none', + ).sum(-1).to(dtype=valid_pred_scores.dtype)) + cost_matrix = cost_matrix + cls_cost * self.cls_weight + # calculate vis + if self.vis_weight > 0: + valid_pred_kpts_vis = valid_pred_kpts_vis.unsqueeze(1).repeat( + 1, num_gt, 1) # [num_valid, 1, k] + gt_kpt_vis = gt_keypoints_visible.unsqueeze( + 0).float() # [1, num_gt, k] + with torch.cuda.amp.autocast(enabled=False): + vis_cost = ( + F.binary_cross_entropy( + valid_pred_kpts_vis.to(dtype=torch.float32), + gt_kpt_vis.repeat(num_valid, 1, 1), + reduction='none', + ).sum(-1).to(dtype=valid_pred_kpts_vis.dtype)) + cost_matrix = cost_matrix + vis_cost * self.vis_weight + + if self.dynamic_k_indicator == 'iou': + matched_pred_ious, matched_gt_inds = \ + self.dynamic_k_matching( 
+ cost_matrix, pairwise_ious, num_gt, valid_mask) + elif self.dynamic_k_indicator == 'oks': + matched_pred_ious, matched_gt_inds = \ + self.dynamic_k_matching( + cost_matrix, pairwise_oks, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_ious.to(max_overlaps) + return dict( + num_gts=num_gt, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + def get_in_gt_and_in_center_info(self, priors: Tensor, gt_bboxes: Tensor + ) -> Tuple[Tensor, Tensor]: + """Get the information of which prior is in gt bboxes and gt center + priors.""" + num_gt = gt_bboxes.size(0) + + repeated_x = priors[:, 0].unsqueeze(1).repeat(1, num_gt) + repeated_y = priors[:, 1].unsqueeze(1).repeat(1, num_gt) + repeated_stride_x = priors[:, 2].unsqueeze(1).repeat(1, num_gt) + repeated_stride_y = priors[:, 3].unsqueeze(1).repeat(1, num_gt) + + # is prior centers in gt bboxes, shape: [n_prior, n_gt] + l_ = repeated_x - gt_bboxes[:, 0] + t_ = repeated_y - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - repeated_x + b_ = gt_bboxes[:, 3] - repeated_y + + deltas = torch.stack([l_, t_, r_, b_], dim=1) + is_in_gts = deltas.min(dim=1).values > 0 + is_in_gts_all = is_in_gts.sum(dim=1) > 0 + + # is prior centers in gt centers + gt_cxs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 + gt_cys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 + ct_box_l = gt_cxs - self.center_radius * repeated_stride_x + ct_box_t = gt_cys - self.center_radius * repeated_stride_y + ct_box_r = gt_cxs + self.center_radius * repeated_stride_x + ct_box_b = gt_cys + self.center_radius * repeated_stride_y + + cl_ = repeated_x - ct_box_l + ct_ = repeated_y - ct_box_t + cr_ = ct_box_r - repeated_x + cb_ = ct_box_b - repeated_y + + ct_deltas = torch.stack([cl_, ct_, cr_, cb_], dim=1) + is_in_cts = ct_deltas.min(dim=1).values > 0 + is_in_cts_all = is_in_cts.sum(dim=1) > 0 + + # in boxes or in centers, shape: [num_priors] + is_in_gts_or_centers = is_in_gts_all | is_in_cts_all + + # both in boxes and centers, shape: [num_fg, num_gt] + is_in_boxes_and_centers = ( + is_in_gts[is_in_gts_or_centers, :] + & is_in_cts[is_in_gts_or_centers, :]) + return is_in_gts_or_centers, is_in_boxes_and_centers + + def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + num_gt: int, + valid_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets.""" + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + # get foreground mask inside box 
and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(1)[fg_mask_inboxes] + return matched_pred_ious, matched_gt_inds diff --git a/mmpose/models/task_modules/prior_generators/__init__.py b/mmpose/models/task_modules/prior_generators/__init__.py new file mode 100644 index 0000000000..e153da8447 --- /dev/null +++ b/mmpose/models/task_modules/prior_generators/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mlvl_point_generator import MlvlPointGenerator # noqa diff --git a/mmpose/models/task_modules/prior_generators/mlvl_point_generator.py b/mmpose/models/task_modules/prior_generators/mlvl_point_generator.py new file mode 100644 index 0000000000..7dc6a6199b --- /dev/null +++ b/mmpose/models/task_modules/prior_generators/mlvl_point_generator.py @@ -0,0 +1,245 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import numpy as np +import torch +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmpose.registry import TASK_UTILS + +DeviceType = Union[str, torch.device] + + +@TASK_UTILS.register_module() +class MlvlPointGenerator: + """Standard points generator for multi-level (Mlvl) feature maps in 2D + points-based detectors. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + offset (float): The offset of points, the value is normalized with + corresponding stride. Defaults to 0.5. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + offset: float = 0.5) -> None: + self.strides = [_pair(stride) for stride in strides] + self.offset = offset + + @property + def num_levels(self) -> int: + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + @property + def num_base_priors(self) -> List[int]: + """list[int]: The number of priors (points) at a point + on the feature grid""" + return [1 for _ in range(len(self.strides))] + + def _meshgrid(self, + x: Tensor, + y: Tensor, + row_major: bool = True) -> Tuple[Tensor, Tensor]: + yy, xx = torch.meshgrid(y, x) + if row_major: + # warning .flatten() would cause error in ONNX exporting + # have to use reshape here + return xx.reshape(-1), yy.reshape(-1) + + else: + return yy.reshape(-1), xx.reshape(-1) + + def grid_priors(self, + featmap_sizes: List[Tuple], + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda', + with_stride: bool = False) -> List[Tensor]: + """Generate grid points of multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32. + device (str | torch.device): The device where the anchors will be + put on. + with_stride (bool): Whether to concatenate the stride to + the last dimension of points. + + Return: + list[torch.Tensor]: Points of multiple feature levels. + The sizes of each tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). 
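+
+        Example (an illustrative sketch; assumes two levels with strides
+            8 and 16 and CPU tensors):
+
+            >>> generator = MlvlPointGenerator(strides=[8, 16])
+            >>> priors = generator.grid_priors([(2, 2), (1, 1)],
+            ...                                device='cpu')
+            >>> priors[0].shape, priors[1].shape
+            (torch.Size([4, 2]), torch.Size([1, 2]))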
+ """ + + assert self.num_levels == len(featmap_sizes) + multi_level_priors = [] + for i in range(self.num_levels): + priors = self.single_level_grid_priors( + featmap_sizes[i], + level_idx=i, + dtype=dtype, + device=device, + with_stride=with_stride) + multi_level_priors.append(priors) + return multi_level_priors + + def single_level_grid_priors(self, + featmap_size: Tuple[int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda', + with_stride: bool = False) -> Tensor: + """Generate grid Points of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps, arrange as + (h, w). + level_idx (int): The index of corresponding feature map level. + dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. + with_stride (bool): Concatenate the stride to the last dimension + of points. + + Return: + Tensor: Points of single feature levels. + The shape of tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = (torch.arange(0, feat_w, device=device) + + self.offset) * stride_w + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_x = shift_x.to(dtype) + + shift_y = (torch.arange(0, feat_h, device=device) + + self.offset) * stride_h + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_y = shift_y.to(dtype) + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + if not with_stride: + shifts = torch.stack([shift_xx, shift_yy], dim=-1) + else: + # use `shape[0]` instead of `len(shift_xx)` for ONNX export + stride_w = shift_xx.new_full((shift_xx.shape[0], ), + stride_w).to(dtype) + stride_h = shift_xx.new_full((shift_yy.shape[0], ), + stride_h).to(dtype) + shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h], + dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, + featmap_sizes: List[Tuple[int, int]], + pad_shape: Tuple[int], + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate valid flags of points of multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + pad_shape (tuple(int)): The padded shape of the image, + arrange as (h, w). + device (str | torch.device): The device where the anchors will be + put on. + + Return: + list(torch.Tensor): Valid flags of points of multiple levels. 
+ """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + point_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size: Tuple[int, int], + valid_size: Tuple[int, int], + device: DeviceType = 'cuda') -> Tensor: + """Generate the valid flags of points of a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange as + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + The size arrange as as (h, w). + device (str | torch.device): The device where the flags will be + put on. Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each points in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + def sparse_priors(self, + prior_idxs: Tensor, + featmap_size: Tuple[int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate sparse points according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int]): feature map size arrange as (w, h). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points. Defaults to + ``torch.float32``. + device (str | torch.device): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 2), N should be equal to + the length of ``prior_idxs``. And last dimension + 2 represent (coord_x, coord_y). + """ + height, width = featmap_size + x = (prior_idxs % width + self.offset) * self.strides[level_idx][0] + y = ((prior_idxs // width) % height + + self.offset) * self.strides[level_idx][1] + prioris = torch.stack([x, y], 1).to(dtype) + prioris = prioris.to(device) + return prioris diff --git a/mmpose/models/utils/__init__.py b/mmpose/models/utils/__init__.py index 22d8a89b41..545fc4c64d 100644 --- a/mmpose/models/utils/__init__.py +++ b/mmpose/models/utils/__init__.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. from .check_and_update_config import check_and_update_config from .ckpt_convert import pvt_convert +from .csp_layer import CSPLayer +from .misc import filter_scores_and_topk from .rtmcc_block import RTMCCBlock, rope from .transformer import PatchEmbed, nchw_to_nlc, nlc_to_nchw __all__ = [ 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'pvt_convert', 'RTMCCBlock', - 'rope', 'check_and_update_config' + 'rope', 'check_and_update_config', 'filter_scores_and_topk', 'CSPLayer' ] diff --git a/mmpose/models/utils/csp_layer.py b/mmpose/models/utils/csp_layer.py new file mode 100644 index 0000000000..071e1209a2 --- /dev/null +++ b/mmpose/models/utils/csp_layer.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+from mmengine.utils import digit_version
+from torch import Tensor
+
+from mmpose.utils.typing import ConfigType, OptConfigType, OptMultiConfig
+
+
+class ChannelAttention(BaseModule):
+    """Channel attention Module.
+
+    Args:
+        channels (int): The input (and output) channels of the attention
+            layer.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self, channels: int, init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
+        if digit_version(torch.__version__) < (1, 7, 0):
+            self.act = nn.Hardsigmoid()
+        else:
+            self.act = nn.Hardsigmoid(inplace=True)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function for ChannelAttention."""
+        # disable AMP autocast to avoid overflow
+        with torch.cuda.amp.autocast(enabled=False):
+            out = self.global_avgpool(x)
+        out = self.fc(out)
+        out = self.act(out)
+        return x * out
+
+
+class DarknetBottleneck(BaseModule):
+    """The basic bottleneck block used in Darknet.
+
+    Each ResBlock consists of two ConvModules and the input is added to the
+    final output. Each ConvModule is composed of a conv layer, a norm layer
+    and an activation layer. The first conv layer has a 1x1 kernel and the
+    second one a 3x3 kernel.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        expansion (float): The expand ratio of the hidden channels.
+            Defaults to 0.5.
+        add_identity (bool): Whether to add identity to the out.
+            Defaults to True.
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Defaults to False.
+        conv_cfg (dict): Config dict for convolution layer. Defaults to None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='Swish').
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 expansion: float = 0.5,
+                 add_identity: bool = True,
+                 use_depthwise: bool = False,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='BN', momentum=0.03, eps=0.001),
+                 act_cfg: ConfigType = dict(type='Swish'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        hidden_channels = int(out_channels * expansion)
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+        self.conv1 = ConvModule(
+            in_channels,
+            hidden_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.conv2 = conv(
+            hidden_channels,
+            out_channels,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.add_identity = \
+            add_identity and in_channels == out_channels
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function."""
+        identity = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+
+        if self.add_identity:
+            return out + identity
+        else:
+            return out
+
+
+class CSPNeXtBlock(BaseModule):
+    """The basic bottleneck block used in CSPNeXt.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        expansion (float): Expand ratio of the hidden channel. Defaults
+            to 0.5.
+        add_identity (bool): Whether to add identity to the out. Only works
+            when in_channels == out_channels.
Defaults to True. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + kernel_size (int): The kernel size of the second convolution layer. + Defaults to 5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU'). + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + add_identity: bool = True, + use_depthwise: bool = False, + kernel_size: int = 5, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.conv1 = conv( + in_channels, + hidden_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = DepthwiseSeparableConvModule( + hidden_channels, + out_channels, + kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + identity = x + out = self.conv1(x) + out = self.conv2(out) + + if self.add_identity: + return out + identity + else: + return out + + +class CSPLayer(BaseModule): + """Cross Stage Partial Layer. + + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + num_blocks (int): Number of blocks. Defaults to 1. + add_identity (bool): Whether to add identity in blocks. + Defaults to True. + use_cspnext_block (bool): Whether to use CSPNeXt block. + Defaults to False. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish') + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. 
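+
+    Example (an illustrative shape check; assumes the default
+        Darknet-style blocks):
+
+        >>> import torch
+        >>> layer = CSPLayer(32, 64, num_blocks=2)
+        >>> layer(torch.randn(1, 32, 16, 16)).shape
+        torch.Size([1, 64, 16, 16])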
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + num_blocks: int = 1, + add_identity: bool = True, + use_depthwise: bool = False, + use_cspnext_block: bool = False, + channel_attention: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck + mid_channels = int(out_channels * expand_ratio) + self.channel_attention = channel_attention + self.main_conv = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.short_conv = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.final_conv = ConvModule( + 2 * mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.Sequential(*[ + block( + mid_channels, + mid_channels, + 1.0, + add_identity, + use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) for _ in range(num_blocks) + ]) + if channel_attention: + self.attention = ChannelAttention(2 * mid_channels) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + x_short = self.short_conv(x) + + x_main = self.main_conv(x) + x_main = self.blocks(x_main) + + x_final = torch.cat((x_main, x_short), dim=1) + + if self.channel_attention: + x_final = self.attention(x_final) + return self.final_conv(x_final) diff --git a/mmpose/models/utils/misc.py b/mmpose/models/utils/misc.py new file mode 100644 index 0000000000..347c521709 --- /dev/null +++ b/mmpose/models/utils/misc.py @@ -0,0 +1,76 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial + +import torch +from six.moves import map, zip + + +def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. + + Args: + func (Function): A function that will be applied to a list of + arguments + + Returns: + tuple(list): A tuple containing multiple list, each list contains + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def filter_scores_and_topk(scores, score_thr, topk, results=None): + """Filter results using score threshold and topk candidates. + + Args: + scores (Tensor): The scores, shape (num_bboxes, K). + score_thr (float): The score filter threshold. + topk (int): The number of topk candidates. + results (dict or list or Tensor, Optional): The results to + which the filtering rule is to be applied. The shape + of each item is (num_bboxes, N). + + Returns: + tuple: Filtered results + + - scores (Tensor): The scores after being filtered, \ + shape (num_bboxes_filtered, ). + - labels (Tensor): The class labels, shape \ + (num_bboxes_filtered, ). + - anchor_idxs (Tensor): The anchor indexes, shape \ + (num_bboxes_filtered, ). + - filtered_results (dict or list or Tensor, Optional): \ + The filtered results. The shape of each item is \ + (num_bboxes_filtered, N). 
+ """ + valid_mask = scores > score_thr + scores = scores[valid_mask] + valid_idxs = torch.nonzero(valid_mask) + + num_topk = min(topk, valid_idxs.size(0)) + # torch.sort is actually faster than .topk (at least on GPUs) + scores, idxs = scores.sort(descending=True) + scores = scores[:num_topk] + topk_idxs = valid_idxs[idxs[:num_topk]] + keep_idxs, labels = topk_idxs.unbind(dim=1) + + filtered_results = None + if results is not None: + if isinstance(results, dict): + filtered_results = {k: v[keep_idxs] for k, v in results.items()} + elif isinstance(results, list): + filtered_results = [result[keep_idxs] for result in results] + elif isinstance(results, torch.Tensor): + filtered_results = results[keep_idxs] + else: + raise NotImplementedError(f'Only supports dict or list or Tensor, ' + f'but get {type(results)}.') + return scores, labels, keep_idxs, filtered_results diff --git a/mmpose/registry.py b/mmpose/registry.py index 3e8ab4f544..84903eaf2d 100644 --- a/mmpose/registry.py +++ b/mmpose/registry.py @@ -91,7 +91,7 @@ PARAM_SCHEDULERS = Registry( 'parameter scheduler', parent=MMENGINE_PARAM_SCHEDULERS, - locations=['mmpose.engine']) + locations=['mmpose.engine.schedulers']) # manage all kinds of metrics METRICS = Registry( @@ -104,7 +104,9 @@ # manage task-specific modules like anchor generators and box coders TASK_UTILS = Registry( - 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmpose.models']) + 'task util', + parent=MMENGINE_TASK_UTILS, + locations=['mmpose.models.task_modules']) # Registries For Visualizer and the related # manage visualizer diff --git a/mmpose/structures/__init__.py b/mmpose/structures/__init__.py index e4384af1cd..15c3e2d278 100644 --- a/mmpose/structures/__init__.py +++ b/mmpose/structures/__init__.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .bbox import (bbox_cs2xywh, bbox_cs2xyxy, bbox_xywh2cs, bbox_xywh2xyxy, - bbox_xyxy2cs, bbox_xyxy2xywh, flip_bbox, - get_udp_warp_matrix, get_warp_matrix) -from .keypoint import flip_keypoints +from .bbox import (bbox_clip_border, bbox_corner2xyxy, bbox_cs2xywh, + bbox_cs2xyxy, bbox_xywh2cs, bbox_xywh2xyxy, + bbox_xyxy2corner, bbox_xyxy2cs, bbox_xyxy2xywh, flip_bbox, + get_pers_warp_matrix, get_udp_warp_matrix, get_warp_matrix) +from .keypoint import flip_keypoints, keypoint_clip_border from .multilevel_pixel_data import MultilevelPixelData from .pose_data_sample import PoseDataSample from .utils import merge_data_samples, revert_heatmap, split_instances @@ -11,5 +12,7 @@ 'PoseDataSample', 'MultilevelPixelData', 'bbox_cs2xywh', 'bbox_cs2xyxy', 'bbox_xywh2cs', 'bbox_xywh2xyxy', 'bbox_xyxy2cs', 'bbox_xyxy2xywh', 'flip_bbox', 'get_udp_warp_matrix', 'get_warp_matrix', 'flip_keypoints', - 'merge_data_samples', 'revert_heatmap', 'split_instances' + 'merge_data_samples', 'revert_heatmap', 'split_instances', + 'keypoint_clip_border', 'bbox_clip_border', 'bbox_xyxy2corner', + 'bbox_corner2xyxy', 'get_pers_warp_matrix' ] diff --git a/mmpose/structures/bbox/__init__.py b/mmpose/structures/bbox/__init__.py index a3e723918c..abd3d5f2d9 100644 --- a/mmpose/structures/bbox/__init__.py +++ b/mmpose/structures/bbox/__init__.py @@ -1,10 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .transforms import (bbox_cs2xywh, bbox_cs2xyxy, bbox_xywh2cs, - bbox_xywh2xyxy, bbox_xyxy2cs, bbox_xyxy2xywh, - flip_bbox, get_udp_warp_matrix, get_warp_matrix) +from .bbox_overlaps import bbox_overlaps +from .transforms import (bbox_clip_border, bbox_corner2xyxy, bbox_cs2xywh, + bbox_cs2xyxy, bbox_xywh2cs, bbox_xywh2xyxy, + bbox_xyxy2corner, bbox_xyxy2cs, bbox_xyxy2xywh, + flip_bbox, get_pers_warp_matrix, get_udp_warp_matrix, + get_warp_matrix) __all__ = [ 'bbox_cs2xywh', 'bbox_cs2xyxy', 'bbox_xywh2cs', 'bbox_xywh2xyxy', 'bbox_xyxy2cs', 'bbox_xyxy2xywh', 'flip_bbox', 'get_udp_warp_matrix', - 'get_warp_matrix' + 'get_warp_matrix', 'bbox_overlaps', 'bbox_clip_border', 'bbox_xyxy2corner', + 'bbox_corner2xyxy', 'get_pers_warp_matrix' ] diff --git a/mmpose/structures/bbox/bbox_overlaps.py b/mmpose/structures/bbox/bbox_overlaps.py new file mode 100644 index 0000000000..682008c337 --- /dev/null +++ b/mmpose/structures/bbox/bbox_overlaps.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def fp16_clamp(x, min_val=None, max_val=None): + if not x.is_cuda and x.dtype == torch.float16: + return x.float().clamp(min_val, max_val).half() + return x.clamp(min_val, max_val) + + +def bbox_overlaps(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + eps=1e-6) -> torch.Tensor: + """Calculate overlap between two sets of bounding boxes. + + Args: + bboxes1 (torch.Tensor): Bounding boxes of shape (..., m, 4) or empty. + bboxes2 (torch.Tensor): Bounding boxes of shape (..., n, 4) or empty. + mode (str): "iou" (intersection over union), + "iof" (intersection over foreground), + or "giou" (generalized intersection over union). + Defaults to "iou". + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. + eps (float, optional): A small constant added to the denominator for + numerical stability. Default 1e-6. + + Returns: + torch.Tensor: Overlap values of shape (..., m, n) if is_aligned is + False, else shape (..., m). 
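+
+    Note:
+        ``'iou'`` and ``'iof'`` values lie in ``[0, 1]``, while ``'giou'``
+        values lie in ``[-1, 1]``.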
+ + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + """ + assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}' + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + + if bboxes1.ndim == 1: + bboxes1 = bboxes1.unsqueeze(0) + if bboxes2.ndim == 1: + bboxes2 = bboxes2.unsqueeze(0) + + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( + bboxes1[..., 3] - bboxes1[..., 1]) + area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( + bboxes2[..., 3] - bboxes2[..., 1]) + + if is_aligned: + lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) + rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) + wh = fp16_clamp(rb - lt, min_val=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2]) + enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:]) + else: + lt = torch.max(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) + rb = torch.min(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) + wh = fp16_clamp(rb - lt, min_val=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + else: + union = area1[..., None] + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) + enclosed_rb = torch.max(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) + + eps_tensor = union.new_tensor([eps]) + union = torch.max(union, eps_tensor) + ious = overlap / union + if mode in ['iou', 'iof']: + return ious + elif mode == 'giou': + enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min_val=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] + enclose_area = torch.max(enclose_area, eps_tensor) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git a/mmpose/structures/bbox/transforms.py b/mmpose/structures/bbox/transforms.py index 11524abc1e..7ddd821ace 100644 --- a/mmpose/structures/bbox/transforms.py +++ b/mmpose/structures/bbox/transforms.py @@ -63,9 +63,8 @@ def bbox_xyxy2cs(bbox: np.ndarray, if dim == 1: bbox = bbox[None, :] - x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3]) - center = np.hstack([x1 + x2, y1 + y2]) * 0.5 - scale = np.hstack([x2 - x1, y2 - y1]) * padding + scale = (bbox[..., 2:] - bbox[..., :2]) * padding + center = (bbox[..., 2:] + bbox[..., :2]) * 0.5 if dim == 1: center = center[0] @@ -172,6 +171,103 @@ def bbox_cs2xywh(center: np.ndarray, return bbox +def bbox_xyxy2corner(bbox: np.ndarray): + """Convert bounding boxes from xyxy format to corner format. 
+ + Given a numpy array containing bounding boxes in the format + (xmin, ymin, xmax, ymax), this function converts the bounding + boxes to the corner format, where each box is represented by four + corner points (top-left, top-right, bottom-right, bottom-left). + + Args: + bbox (numpy.ndarray): Input array of shape (N, 4) representing + N bounding boxes. + + Returns: + numpy.ndarray: An array of shape (N, 4, 2) containing the corner + points of the bounding boxes. + + Example: + bbox = np.array([[0, 0, 100, 50], [10, 20, 200, 150]]) + corners = bbox_xyxy2corner(bbox) + """ + dim = bbox.ndim + if dim == 1: + bbox = bbox[None] + + bbox = np.tile(bbox, 2).reshape(-1, 4, 2) + bbox[:, 1:3, 0] = bbox[:, 0:2, 0] + + if dim == 1: + bbox = bbox[0] + + return bbox + + +def bbox_corner2xyxy(bbox: np.ndarray): + """Convert bounding boxes from corner format to xyxy format. + + Given a numpy array containing bounding boxes in the corner + format (four corner points for each box), this function converts + the bounding boxes to the (xmin, ymin, xmax, ymax) format. + + Args: + bbox (numpy.ndarray): Input array of shape (N, 4, 2) representing + N bounding boxes. + + Returns: + numpy.ndarray: An array of shape (N, 4) containing the bounding + boxes in xyxy format. + + Example: + corners = np.array([[[0, 0], [100, 0], [100, 50], [0, 50]], + [[10, 20], [200, 20], [200, 150], [10, 150]]]) + bbox = bbox_corner2xyxy(corners) + """ + if bbox.shape[-1] == 8: + bbox = bbox.reshape(*bbox.shape[:-1], 4, 2) + + dim = bbox.ndim + if dim == 2: + bbox = bbox[None] + + bbox = np.concatenate((bbox.min(axis=1), bbox.max(axis=1)), axis=1) + + if dim == 2: + bbox = bbox[0] + + return bbox + + +def bbox_clip_border(bbox: np.ndarray, shape: Tuple[int, int]) -> np.ndarray: + """Clip bounding box coordinates to fit within a specified shape. + + Args: + bbox (np.ndarray): Bounding box coordinates of shape (..., 4) + or (..., 2). + shape (Tuple[int, int]): Shape of the image to which bounding + boxes are being clipped in the format of (w, h) + + Returns: + np.ndarray: Clipped bounding box coordinates. + + Example: + >>> bbox = np.array([[10, 20, 30, 40], [40, 50, 80, 90]]) + >>> shape = (50, 50) # Example image shape + >>> clipped_bbox = bbox_clip_border(bbox, shape) + """ + width, height = shape[:2] + + if bbox.shape[-1] == 2: + bbox[..., 0] = np.clip(bbox[..., 0], a_min=0, a_max=width) + bbox[..., 1] = np.clip(bbox[..., 1], a_min=0, a_max=height) + else: + bbox[..., ::2] = np.clip(bbox[..., ::2], a_min=0, a_max=width) + bbox[..., 1::2] = np.clip(bbox[..., 1::2], a_min=0, a_max=height) + + return bbox + + def flip_bbox(bbox: np.ndarray, image_size: Tuple[int, int], bbox_format: str = 'xywh', @@ -328,6 +424,61 @@ def get_warp_matrix(center: np.ndarray, return warp_mat +def get_pers_warp_matrix(center: np.ndarray, translate: np.ndarray, + scale: float, rot: float, + shear: np.ndarray) -> np.ndarray: + """Compute a perspective warp matrix based on specified transformations. + + Args: + center (np.ndarray): Center of the transformation. + translate (np.ndarray): Translation vector. + scale (float): Scaling factor. + rot (float): Rotation angle in degrees. + shear (np.ndarray): Shearing angles in degrees along x and y axes. + + Returns: + np.ndarray: Perspective warp matrix. 
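+
+    Note:
+        The returned matrix composes the individual transforms as
+        ``translate_mat @ shear_mat @ rotate_mat @ scale_mat @
+        recover_center_mat``, i.e. points are first shifted by ``-center``,
+        then scaled, rotated and sheared, and finally translated by
+        ``translate + center``.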
+ + Example: + >>> center = np.array([0, 0]) + >>> translate = np.array([10, 20]) + >>> scale = 1.2 + >>> rot = 30.0 + >>> shear = np.array([15.0, 0.0]) + >>> warp_matrix = get_pers_warp_matrix(center, translate, + scale, rot, shear) + """ + translate_mat = np.array([[1, 0, translate[0] + center[0]], + [0, 1, translate[1] + center[1]], [0, 0, 1]], + dtype=np.float32) + + shear_x = math.radians(shear[0]) + shear_y = math.radians(shear[1]) + shear_mat = np.array([[1, np.tan(shear_x), 0], [np.tan(shear_y), 1, 0], + [0, 0, 1]], + dtype=np.float32) + + rotate_angle = math.radians(rot) + rotate_mat = np.array([[np.cos(rotate_angle), -np.sin(rotate_angle), 0], + [np.sin(rotate_angle), + np.cos(rotate_angle), 0], [0, 0, 1]], + dtype=np.float32) + + scale_mat = np.array([[scale, 0, 0], [0, scale, 0], [0, 0, 1]], + dtype=np.float32) + + recover_center_mat = np.array([[1, 0, -center[0]], [0, 1, -center[1]], + [0, 0, 1]], + dtype=np.float32) + + warp_matrix = np.dot( + np.dot( + np.dot(np.dot(translate_mat, shear_mat), rotate_mat), scale_mat), + recover_center_mat) + + return warp_matrix + + def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray: """Rotate a point by an angle. diff --git a/mmpose/structures/keypoint/__init__.py b/mmpose/structures/keypoint/__init__.py index 12ee96cf7c..f4969d3283 100644 --- a/mmpose/structures/keypoint/__init__.py +++ b/mmpose/structures/keypoint/__init__.py @@ -1,5 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .transforms import flip_keypoints, flip_keypoints_custom_center +from .transforms import (flip_keypoints, flip_keypoints_custom_center, + keypoint_clip_border) -__all__ = ['flip_keypoints', 'flip_keypoints_custom_center'] +__all__ = [ + 'flip_keypoints', 'flip_keypoints_custom_center', 'keypoint_clip_border' +] diff --git a/mmpose/structures/keypoint/transforms.py b/mmpose/structures/keypoint/transforms.py index bd7274dadf..b4a2aff925 100644 --- a/mmpose/structures/keypoint/transforms.py +++ b/mmpose/structures/keypoint/transforms.py @@ -121,3 +121,33 @@ def flip_keypoints_custom_center(keypoints: np.ndarray, # Flip horizontally keypoints_flipped[..., 0] = x_c * 2 - keypoints_flipped[..., 0] return keypoints_flipped, keypoints_visible_flipped + + +def keypoint_clip_border(keypoints: np.ndarray, keypoints_visible: np.ndarray, + shape: Tuple[int, + int]) -> Tuple[np.ndarray, np.ndarray]: + """Set the visibility values for keypoints outside the image border. + + Args: + keypoints (np.ndarray): Input keypoints coordinates. + keypoints_visible (np.ndarray): Visibility values of keypoints. + shape (Tuple[int, int]): Shape of the image to which keypoints are + being clipped in the format of (w, h). + + Note: + This function sets the visibility values of keypoints that fall outside + the specified frame border to zero (0.0). 
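+
+    Example (an illustrative sketch with one instance and two keypoints):
+
+        >>> import numpy as np
+        >>> kpts = np.array([[[10., 10.], [60., 10.]]])
+        >>> vis = np.array([[1., 1.]])
+        >>> _, vis = keypoint_clip_border(kpts, vis, (50, 50))
+        >>> vis.tolist()
+        [[1.0, 0.0]]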
+ """ + width, height = shape[:2] + + # Create a mask for keypoints outside the frame + outside_mask = ((keypoints[..., 0] > width) | (keypoints[..., 0] < 0) | + (keypoints[..., 1] > height) | (keypoints[..., 1] < 0)) + + # Update visibility values for keypoints outside the frame + if keypoints_visible.ndim == 2: + keypoints_visible[outside_mask] = 0.0 + elif keypoints_visible.ndim == 3: + keypoints_visible[outside_mask, 0] = 0.0 + + return keypoints, keypoints_visible diff --git a/mmpose/utils/__init__.py b/mmpose/utils/__init__.py index c48ca01cea..fb9c018ed0 100644 --- a/mmpose/utils/__init__.py +++ b/mmpose/utils/__init__.py @@ -2,6 +2,7 @@ from .camera import SimpleCamera, SimpleCameraTorch from .collect_env import collect_env from .config_utils import adapt_mmdet_pipeline +from .dist_utils import reduce_mean from .logger import get_root_logger from .setup_env import register_all_modules, setup_multi_processes from .timer import StopWatch @@ -9,5 +10,5 @@ __all__ = [ 'get_root_logger', 'collect_env', 'StopWatch', 'setup_multi_processes', 'register_all_modules', 'SimpleCamera', 'SimpleCameraTorch', - 'adapt_mmdet_pipeline' + 'adapt_mmdet_pipeline', 'reduce_mean' ] diff --git a/mmpose/utils/dist_utils.py b/mmpose/utils/dist_utils.py new file mode 100644 index 0000000000..915f92585a --- /dev/null +++ b/mmpose/utils/dist_utils.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.distributed as dist + + +def reduce_mean(tensor): + """"Obtain the mean of tensor on different GPUs.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor diff --git a/mmpose/utils/tensor_utils.py b/mmpose/utils/tensor_utils.py index 1be73f8991..755e26854c 100644 --- a/mmpose/utils/tensor_utils.py +++ b/mmpose/utils/tensor_utils.py @@ -29,6 +29,9 @@ def to_numpy(x: Union[Tensor, Sequence[Tensor]], if isinstance(x, Tensor): arrays = x.detach().cpu().numpy() device = x.device + elif isinstance(x, np.ndarray) or is_seq_of(x, np.ndarray): + arrays = x + device = 'cpu' elif is_seq_of(x, Tensor): if unzip: # convert (A, B) -> [(A[0], B[0]), (A[1], B[1]), ...] diff --git a/model-index.yml b/model-index.yml index 52c4a1adb8..8dc3f25054 100644 --- a/model-index.yml +++ b/model-index.yml @@ -77,6 +77,7 @@ Import: - configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.yml - configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.yml - configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.yml +- configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml - configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.yml - configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml - configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.yml diff --git a/projects/yolox_pose/README.md b/projects/yolox_pose/README.md index 264b65fe9f..35a830487c 100644 --- a/projects/yolox_pose/README.md +++ b/projects/yolox_pose/README.md @@ -4,16 +4,24 @@ This project implements a YOLOX-based human pose estimator, utilizing the approa
+📌 For improved performance and compatibility, **consider using the YOLOX-Pose implementation built into MMPose**, which integrates seamlessly with MMPose's tools. To learn more about adopting YOLOX-Pose in your workflow, see the documentation: [YOLOX-Pose](/configs/body_2d_keypoint/yoloxpose/README.md).
+
 ## Usage

 ### Prerequisites

 - Python 3.7 or higher
+
 - PyTorch 1.6 or higher
+
 - [MMEngine](https://github.com/open-mmlab/mmengine) v0.6.0 or higher
+
 - [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 or higher
+
 - [MMDetection](https://github.com/open-mmlab/mmdetection) v3.0.0rc6 or higher
-- [MMYOLO](https://github.com/open-mmlab/mmyolo) v0.5.0 or higher
+
+- [MMYOLO](https://github.com/open-mmlab/mmyolo) **v0.5.0**
+
 - [MMPose](https://github.com/open-mmlab/mmpose) v1.0.0rc1 or higher

 All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. **In `yolox-pose/` root directory**, run the following line to add the current directory to `PYTHONPATH`:
diff --git a/projects/yolox_pose/datasets/__init__.py b/projects/yolox_pose/datasets/__init__.py
index 69bae9de53..abf0d11d23 100644
--- a/projects/yolox_pose/datasets/__init__.py
+++ b/projects/yolox_pose/datasets/__init__.py
@@ -1,3 +1,13 @@
+import mmengine
+import mmyolo
+
+compatible_version = '0.5.0'
+if mmengine.digit_version(mmyolo.__version__) > \
+        mmengine.digit_version(compatible_version):
+    print(f'This project is only compatible with mmyolo {compatible_version} '
+          f'or lower. Please install the required version via: '
+          f'pip install mmyolo=={compatible_version}')
+
 from .bbox_keypoint_structure import *  # noqa
 from .coco_dataset import *  # noqa
 from .transforms import *  # noqa
diff --git a/projects/yolox_pose/models/__init__.py b/projects/yolox_pose/models/__init__.py
index 0d4804e70a..c81450826d 100644
--- a/projects/yolox_pose/models/__init__.py
+++ b/projects/yolox_pose/models/__init__.py
@@ -1,3 +1,13 @@
+import mmengine
+import mmyolo
+
+compatible_version = '0.5.0'
+if mmengine.digit_version(mmyolo.__version__) > \
+        mmengine.digit_version(compatible_version):
+    print(f'This project is only compatible with mmyolo {compatible_version} '
+          f'or lower. Please install the required version via: '
+          f'pip install mmyolo=={compatible_version}')
+
 from .assigner import *  # noqa
 from .data_preprocessor import *  # noqa
 from .oks_loss import *  # noqa
diff --git a/tests/test_codecs/test_annotation_processors.py b/tests/test_codecs/test_annotation_processors.py
new file mode 100644
index 0000000000..4b67cf4f1a
--- /dev/null
+++ b/tests/test_codecs/test_annotation_processors.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
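+#
+# These tests cover the YOLOXPoseAnnotationProcessor codec: with
+# ``expand_bbox=True``, ``encode`` is expected to enlarge each bbox so it
+# also encloses the instance's visible keypoints, and to map the 1-based
+# COCO ``category_id`` values to 0-based ``bbox_labels``.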
+ +from unittest import TestCase + +import numpy as np + +from mmpose.codecs import YOLOXPoseAnnotationProcessor + + +class TestYOLOXPoseAnnotationProcessor(TestCase): + + def test_encode(self): + processor = YOLOXPoseAnnotationProcessor(expand_bbox=True) + + keypoints = np.array([[[0, 1], [2, 6], [4, 5]], [[5, 6], [7, 8], + [8, 9]]]) + keypoints_visible = np.array([[1, 1, 0], [1, 0, 1]]) + bbox = np.array([[0, 1, 3, 4], [1, 2, 5, 6]]) + category_id = [1, 2] + + encoded = processor.encode(keypoints, keypoints_visible, bbox, + category_id) + + self.assertTrue('bbox' in encoded) + self.assertTrue('bbox_labels' in encoded) + self.assertTrue( + np.array_equal(encoded['bbox'], + np.array([[0., 1., 3., 6.], [1., 2., 8., 9.]]))) + self.assertTrue( + np.array_equal(encoded['bbox_labels'], np.array([0, 1]))) + + def test_decode(self): + # make sure the `decode` method has been defined + processor = YOLOXPoseAnnotationProcessor() + _ = processor.decode(dict()) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py index ae00a64393..57031cdacd 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py index de78264dae..1706fba739 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py index 8d63925257..0525e35d02 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py index d7aa46b067..2f27e06698 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py index e93a524611..bdf5f3b807 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py @@ -45,6 +45,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py index f6431af429..2c35c4490a 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py @@ -44,6 +44,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py index ef3cd82dfb..8dabbaa0d5 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_transforms/test_common_transforms.py b/tests/test_datasets/test_transforms/test_common_transforms.py index ac221c2af5..fe81b9a94c 100644 --- a/tests/test_datasets/test_transforms/test_common_transforms.py +++ b/tests/test_datasets/test_transforms/test_common_transforms.py @@ -4,15 +4,17 @@ from copy import deepcopy from unittest import TestCase +import mmcv import numpy as np from mmcv.transforms import Compose, LoadImageFromFile from mmengine.utils import is_list_of -from mmpose.datasets.transforms import (Albumentation, GenerateTarget, - GetBBoxCenterScale, +from mmpose.datasets.transforms import (Albumentation, FilterAnnotations, + GenerateTarget, GetBBoxCenterScale, PhotometricDistortion, RandomBBoxTransform, RandomFlip, - RandomHalfBody, TopdownAffine) + RandomHalfBody, TopdownAffine, + YOLOXHSVRandomAug) from mmpose.testing import get_coco_sample @@ -600,3 +602,134 @@ def test_errors(self): with self.assertWarnsRegex(DeprecationWarning, '`target_type` is deprecated'): _ = GenerateTarget(encoder=encoder, target_type='heatmap') + + +class TestFilterAnnotations(TestCase): + + def setUp(self): + """Setup the model and optimizer which are used in every test + method.""" + self.results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'bbox': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]]), + 'bbox_score': + np.array([0.9, 0.8, 0.7]), + 'category_id': + np.array([1, 2, 3]), + 'keypoints': + np.array([[15, 15, 1], [25, 25, 1], [45, 45, 1]]), + 'keypoints_visible': + np.array([[1, 1, 0], [1, 1, 1], [1, 1, 1]]), + 'area': + np.array([300, 600, 1200]), + } + + def test_transform(self): + # Test keep_empty = True + transform = FilterAnnotations( + min_gt_bbox_wh=(50, 50), + keep_empty=True, + by_box=True, + ) + results = transform(copy.deepcopy(self.results)) + self.assertIsNone(results) + + # Test keep_empty = False + transform = FilterAnnotations( + min_gt_bbox_wh=(50, 50), + keep_empty=False, + ) + results = transform(copy.deepcopy(self.results)) + 
self.assertTrue(isinstance(results, dict)) + + # Test filter annotations by bbox + transform = FilterAnnotations(min_gt_bbox_wh=(15, 15), by_box=True) + results = transform(copy.deepcopy(self.results)) + print((results['bbox'] == np.array([[20, 20, 40, 40], [40, 40, 80, + 80]])).all()) + self.assertTrue((results['bbox'] == np.array([[20, 20, 40, 40], + [40, 40, 80, + 80]])).all()) + self.assertTrue((results['bbox_score'] == np.array([0.8, 0.7])).all()) + self.assertTrue((results['category_id'] == np.array([2, 3])).all()) + self.assertTrue((results['keypoints'] == np.array([[25, 25, 1], + [45, 45, + 1]])).all()) + self.assertTrue( + (results['keypoints_visible'] == np.array([[1, 1, 1], [1, 1, + 1]])).all()) + self.assertTrue((results['area'] == np.array([600, 1200])).all()) + + # Test filter annotations by area + transform = FilterAnnotations(min_gt_area=1000, by_area=True) + results = transform(copy.deepcopy(self.results)) + self.assertIsInstance(results, dict) + self.assertTrue((results['bbox'] == np.array([[40, 40, 80, + 80]])).all()) + self.assertTrue((results['bbox_score'] == np.array([0.7])).all()) + self.assertTrue((results['category_id'] == np.array([3])).all()) + self.assertTrue((results['keypoints'] == np.array([[45, 45, + 1]])).all()) + self.assertTrue( + (results['keypoints_visible'] == np.array([[1, 1, 1]])).all()) + self.assertTrue((results['area'] == np.array([1200])).all()) + + # Test filter annotations by keypoints visibility + transform = FilterAnnotations(min_kpt_vis=3, by_kpt=True) + results = transform(copy.deepcopy(self.results)) + self.assertIsInstance(results, dict) + self.assertTrue((results['bbox'] == np.array([[20, 20, 40, 40], + [40, 40, 80, + 80]])).all()) + self.assertTrue((results['bbox_score'] == np.array([0.8, 0.7])).all()) + self.assertTrue((results['category_id'] == np.array([2, 3])).all()) + self.assertTrue((results['keypoints'] == np.array([[25, 25, 1], + [45, 45, + 1]])).all()) + self.assertTrue( + (results['keypoints_visible'] == np.array([[1, 1, 1], [1, 1, + 1]])).all()) + self.assertTrue((results['area'] == np.array([600, 1200])).all()) + + +class TestYOLOXHSVRandomAug(TestCase): + + def setUp(self): + """Setup the model and optimizer which are used in every test method. + + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + img = mmcv.imread( + osp.join( + osp.dirname(__file__), '../../data/coco/000000000785.jpg'), + 'color') + self.results = { + 'img': + img, + 'img_shape': (640, 425), + 'category_id': + np.array([1, 2, 3], dtype=np.int64), + 'bbox': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + } + + def test_transform(self): + transform = YOLOXHSVRandomAug() + results = transform(copy.deepcopy(self.results)) + self.assertTrue( + results['img'].shape[:2] == self.results['img'].shape[:2]) + self.assertTrue( + results['category_id'].shape[0] == results['bbox'].shape[0]) + self.assertTrue(results['bbox'].dtype == np.float32) + + def test_repr(self): + transform = YOLOXHSVRandomAug() + self.assertEqual( + repr(transform), ('YOLOXHSVRandomAug(hue_delta=5, ' + 'saturation_delta=30, ' + 'value_delta=30)')) diff --git a/tests/test_datasets/test_transforms/test_mix_img_transform.py b/tests/test_datasets/test_transforms/test_mix_img_transform.py new file mode 100644 index 0000000000..bae26da83a --- /dev/null +++ b/tests/test_datasets/test_transforms/test_mix_img_transform.py @@ -0,0 +1,115 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
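+#
+# These tests exercise the mixed-image transforms used for YOLOX-Pose
+# training: ``Mosaic`` stitches the base sample and three mixed samples
+# into one double-sized canvas (1280x1280 for the default 640x640 scale),
+# while ``YOLOXMixUp`` blends a pair of samples and keeps the original
+# 480x640 resolution.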
+ +from unittest import TestCase + +import numpy as np + +from mmpose.datasets.transforms import Mosaic, YOLOXMixUp + + +class TestMosaic(TestCase): + + def setUp(self): + # Create a sample data dictionary for testing + sample_data = { + 'img': + np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8), + 'bbox': np.random.rand(2, 4), + 'bbox_score': np.random.rand(2, ), + 'category_id': [1, 2], + 'keypoints': np.random.rand(2, 3, 2), + 'keypoints_visible': np.random.rand(2, 3), + 'area': np.random.rand(2, ) + } + mixed_data_list = [sample_data.copy() for _ in range(3)] + sample_data.update({'mixed_data_list': mixed_data_list}) + + self.sample_data = sample_data + + def test_apply_mix(self): + mosaic = Mosaic() + transformed_data = mosaic.apply_mix(self.sample_data) + + # Check if the transformed data has the expected keys + self.assertTrue('img' in transformed_data) + self.assertTrue('img_shape' in transformed_data) + self.assertTrue('bbox' in transformed_data) + self.assertTrue('category_id' in transformed_data) + self.assertTrue('bbox_score' in transformed_data) + self.assertTrue('keypoints' in transformed_data) + self.assertTrue('keypoints_visible' in transformed_data) + self.assertTrue('area' in transformed_data) + + def test_create_mosaic_image(self): + mosaic = Mosaic() + mosaic_img, annos = mosaic._create_mosaic_image( + self.sample_data, self.sample_data['mixed_data_list']) + + # Check if the mosaic image and annotations are generated correctly + self.assertEqual(mosaic_img.shape, (1280, 1280, 3)) + self.assertTrue('bboxes' in annos) + self.assertTrue('bbox_scores' in annos) + self.assertTrue('category_id' in annos) + self.assertTrue('keypoints' in annos) + self.assertTrue('keypoints_visible' in annos) + self.assertTrue('area' in annos) + + def test_mosaic_combine(self): + mosaic = Mosaic() + center = (320, 240) + img_shape = (480, 640) + paste_coord, crop_coord = mosaic._mosaic_combine( + 'top_left', center, img_shape) + + # Check if the coordinates are calculated correctly + self.assertEqual(paste_coord, (0, 0, 320, 240)) + self.assertEqual(crop_coord, (160, 400, 480, 640)) + + +class TestYOLOXMixUp(TestCase): + + def setUp(self): + # Create a sample data dictionary for testing + sample_data = { + 'img': + np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8), + 'bbox': np.random.rand(2, 4), + 'bbox_score': np.random.rand(2, ), + 'category_id': [1, 2], + 'keypoints': np.random.rand(2, 3, 2), + 'keypoints_visible': np.random.rand(2, 3), + 'area': np.random.rand(2, ), + 'flip_indices': [0, 2, 1] + } + mixed_data_list = [sample_data.copy() for _ in range(1)] + sample_data.update({'mixed_data_list': mixed_data_list}) + + self.sample_data = sample_data + + def test_apply_mix(self): + mixup = YOLOXMixUp() + transformed_data = mixup.apply_mix(self.sample_data) + + # Check if the transformed data has the expected keys + self.assertTrue('img' in transformed_data) + self.assertTrue('img_shape' in transformed_data) + self.assertTrue('bbox' in transformed_data) + self.assertTrue('category_id' in transformed_data) + self.assertTrue('bbox_score' in transformed_data) + self.assertTrue('keypoints' in transformed_data) + self.assertTrue('keypoints_visible' in transformed_data) + self.assertTrue('area' in transformed_data) + + def test_create_mixup_image(self): + mixup = YOLOXMixUp() + mixup_img, annos = mixup._create_mixup_image( + self.sample_data, self.sample_data['mixed_data_list']) + + # Check if the mosaic image and annotations are generated correctly + 
+        self.assertEqual(mixup_img.shape, (480, 640, 3))
+        self.assertTrue('bboxes' in annos)
+        self.assertTrue('bbox_scores' in annos)
+        self.assertTrue('category_id' in annos)
+        self.assertTrue('keypoints' in annos)
+        self.assertTrue('keypoints_visible' in annos)
+        self.assertTrue('area' in annos)
diff --git a/tests/test_engine/test_hooks/test_mode_switch_hooks.py b/tests/test_engine/test_hooks/test_mode_switch_hooks.py
new file mode 100644
index 0000000000..fbf10bd3ef
--- /dev/null
+++ b/tests/test_engine/test_hooks/test_mode_switch_hooks.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+from unittest.mock import Mock
+
+import torch
+from mmengine.config import Config
+from mmengine.runner import Runner
+from torch.utils.data import Dataset
+
+from mmpose.engine.hooks import YOLOXPoseModeSwitchHook
+from mmpose.utils import register_all_modules
+
+
+class DummyDataset(Dataset):
+    METAINFO = dict()  # type: ignore
+    data = torch.randn(12, 2)
+    label = torch.ones(12)
+
+    @property
+    def metainfo(self):
+        return self.METAINFO
+
+    def __len__(self):
+        return self.data.size(0)
+
+    def __getitem__(self, index):
+        return dict(inputs=self.data[index], data_sample=self.label[index])
+
+
+pipeline1 = [
+    dict(type='RandomHalfBody'),
+]
+
+pipeline2 = [
+    dict(type='RandomFlip'),
+]
+register_all_modules()
+
+
+class TestYOLOXPoseModeSwitchHook(TestCase):
+
+    def test(self):
+        train_dataloader = dict(
+            dataset=DummyDataset(),
+            sampler=dict(type='DefaultSampler', shuffle=True),
+            batch_size=3,
+            num_workers=0)
+
+        runner = Mock()
+        runner.model = Mock()
+        runner.model.module = Mock()
+
+        runner.model.head.use_aux_loss = False
+        runner.cfg.train_dataloader = Config(train_dataloader)
+        runner.train_dataloader = Runner.build_dataloader(train_dataloader)
+        runner.train_dataloader.dataset.pipeline = pipeline1
+
+        hook = YOLOXPoseModeSwitchHook(
+            num_last_epochs=15, new_train_pipeline=pipeline2)
+
+        # Test that the switch turns on the aux loss and swaps the pipeline
+        runner.epoch = 284
+        runner.max_epochs = 300
+        hook.before_train_epoch(runner)
+        self.assertTrue(runner.model.head.use_aux_loss)
+        self.assertEqual(runner.train_loop.dataloader.dataset.pipeline,
+                         pipeline2)
diff --git a/tests/test_engine/test_hooks/test_sync_norm_hook.py b/tests/test_engine/test_hooks/test_sync_norm_hook.py
new file mode 100644
index 0000000000..f256127fa1
--- /dev/null
+++ b/tests/test_engine/test_hooks/test_sync_norm_hook.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
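+# Editorial note (assumption about intent): the cases below are smoke tests.
+# SyncNormHook synchronizes the buffers of norm layers across ranks before
+# validation, so with a patched world size of 1, of 2, or with a model that
+# contains no norm layers, `before_val_epoch` should simply run without
+# raising.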
+from unittest import TestCase
+from unittest.mock import Mock, patch
+
+import torch.nn as nn
+
+from mmpose.engine.hooks import SyncNormHook
+
+
+class TestSyncNormHook(TestCase):
+
+    @patch(
+        'mmpose.engine.hooks.sync_norm_hook.get_dist_info',
+        return_value=(0, 1))
+    def test_before_val_epoch_non_dist(self, mock):
+        model = nn.Sequential(
+            nn.Conv2d(1, 5, kernel_size=3), nn.BatchNorm2d(5, momentum=0.3),
+            nn.Linear(5, 10))
+        runner = Mock()
+        runner.model = model
+        hook = SyncNormHook()
+        hook.before_val_epoch(runner)
+
+    @patch(
+        'mmpose.engine.hooks.sync_norm_hook.get_dist_info',
+        return_value=(0, 2))
+    def test_before_val_epoch_dist(self, mock):
+        model = nn.Sequential(
+            nn.Conv2d(1, 5, kernel_size=3), nn.BatchNorm2d(5, momentum=0.3),
+            nn.Linear(5, 10))
+        runner = Mock()
+        runner.model = model
+        hook = SyncNormHook()
+        hook.before_val_epoch(runner)
+
+    @patch(
+        'mmpose.engine.hooks.sync_norm_hook.get_dist_info',
+        return_value=(0, 2))
+    def test_before_val_epoch_dist_no_norm(self, mock):
+        model = nn.Sequential(nn.Conv2d(1, 5, kernel_size=3), nn.Linear(5, 10))
+        runner = Mock()
+        runner.model = model
+        hook = SyncNormHook()
+        hook.before_val_epoch(runner)
diff --git a/tests/test_engine/test_schedulers/test_quadratic_warmup.py b/tests/test_engine/test_schedulers/test_quadratic_warmup.py
new file mode 100644
index 0000000000..9f0650b0c2
--- /dev/null
+++ b/tests/test_engine/test_schedulers/test_quadratic_warmup.py
@@ -0,0 +1,107 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from mmengine.optim.scheduler import _ParamScheduler
+from mmengine.testing import assert_allclose
+
+from mmpose.engine.schedulers import (QuadraticWarmupLR,
+                                      QuadraticWarmupMomentum,
+                                      QuadraticWarmupParamScheduler)
+
+
+class ToyModel(torch.nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.conv1 = torch.nn.Conv2d(1, 1, 1)
+        self.conv2 = torch.nn.Conv2d(1, 1, 1)
+
+    def forward(self, x):
+        return self.conv2(F.relu(self.conv1(x)))
+
+
+class TestQuadraticWarmupScheduler(TestCase):
+
+    def setUp(self):
+        """Setup the model and optimizer which are used in every test method.
+
+        TestCase calls functions in this order: setUp() -> testMethod() ->
+        tearDown() -> cleanUp()
+        """
+        self.model = ToyModel()
+        self.optimizer = optim.SGD(
+            self.model.parameters(), lr=0.05, momentum=0.01, weight_decay=5e-4)
+
+    def _test_scheduler_value(self,
+                              schedulers,
+                              targets,
+                              epochs=10,
+                              param_name='lr'):
+        if isinstance(schedulers, _ParamScheduler):
+            schedulers = [schedulers]
+        for epoch in range(epochs):
+            for param_group, target in zip(self.optimizer.param_groups,
+                                           targets):
+                assert_allclose(
+                    target[epoch],
+                    param_group[param_name],
+                    msg='{} is wrong in epoch {}: expected {}, got {}'.format(
+                        param_name, epoch, target[epoch],
+                        param_group[param_name]),
+                    atol=1e-5,
+                    rtol=0)
+            [scheduler.step() for scheduler in schedulers]
+
+    def test_quadratic_warmup_scheduler(self):
+        with self.assertRaises(ValueError):
+            QuadraticWarmupParamScheduler(self.optimizer, param_name='lr')
+        epochs = 10
+        iters = 5
+        warmup_factor = [pow((i + 1) / float(iters), 2) for i in range(iters)]
+        single_targets = [x * 0.05 for x in warmup_factor] + [0.05] * (
+            epochs - iters)
+        targets = [single_targets, [x * epochs for x in single_targets]]
+        scheduler = QuadraticWarmupParamScheduler(
+            self.optimizer, param_name='lr', end=iters)
+        self._test_scheduler_value(scheduler, targets, epochs)
+
+    def test_quadratic_warmup_scheduler_convert_iterbased(self):
+        epochs = 10
+        end = 5
+        epoch_length = 11
+
+        iters = end * epoch_length
+        warmup_factor = [pow((i + 1) / float(iters), 2) for i in range(iters)]
+        single_targets = [x * 0.05 for x in warmup_factor] + [0.05] * (
+            epochs * epoch_length - iters)
+        targets = [single_targets, [x * epochs for x in single_targets]]
+        scheduler = QuadraticWarmupParamScheduler.build_iter_from_epoch(
+            self.optimizer,
+            param_name='lr',
+            end=end,
+            epoch_length=epoch_length)
+        self._test_scheduler_value(scheduler, targets, epochs * epoch_length)
+
+    def test_quadratic_warmup_lr(self):
+        epochs = 10
+        iters = 5
+        warmup_factor = [pow((i + 1) / float(iters), 2) for i in range(iters)]
+        single_targets = [x * 0.05 for x in warmup_factor] + [0.05] * (
+            epochs - iters)
+        targets = [single_targets, [x * epochs for x in single_targets]]
+        scheduler = QuadraticWarmupLR(self.optimizer, end=iters)
+        self._test_scheduler_value(scheduler, targets, epochs)
+
+    def test_quadratic_warmup_momentum(self):
+        epochs = 10
+        iters = 5
+        warmup_factor = [pow((i + 1) / float(iters), 2) for i in range(iters)]
+        single_targets = [x * 0.01 for x in warmup_factor] + [0.01] * (
+            epochs - iters)
+        targets = [single_targets, [x * epochs for x in single_targets]]
+        scheduler = QuadraticWarmupMomentum(self.optimizer, end=iters)
+        self._test_scheduler_value(
+            scheduler, targets, epochs, param_name='momentum')
diff --git a/tests/test_evaluation/test_functional/test_nms.py b/tests/test_evaluation/test_functional/test_nms.py
index b29ed86ccb..34a2533b76 100644
--- a/tests/test_evaluation/test_functional/test_nms.py
+++ b/tests/test_evaluation/test_functional/test_nms.py
@@ -2,8 +2,9 @@
 from unittest import TestCase
 
 import numpy as np
+import torch
 
-from mmpose.evaluation.functional.nms import nearby_joints_nms
+from mmpose.evaluation.functional.nms import nearby_joints_nms, nms_torch
 
 
 class TestNearbyJointsNMS(TestCase):
@@ -38,3 +39,21 @@ def test_nearby_joints_nms(self):
 
         with self.assertRaises(AssertionError):
             _ = nearby_joints_nms(kpts_db, 0.05, num_nearby_joints_thr=3)
+
+
+class TestNMSTorch(TestCase):
+
+    def test_nms_torch(self):
+        bboxes = torch.tensor([[0, 0, 3, 3], [1, 0, 3, 3], [4, 4, 6, 6]],
+                              dtype=torch.float32)
+
+        scores = torch.tensor([0.9, 0.8, 0.7])
+
+        expected_result = torch.tensor([0, 2])
+        result = nms_torch(bboxes, scores, threshold=0.5)
+        self.assertTrue(torch.equal(result, expected_result))
+
+        expected_result = [torch.tensor([0, 1]), torch.tensor([2])]
+        result = nms_torch(bboxes, scores, threshold=0.5, return_group=True)
+        for res_out, res_expected in zip(result, expected_result):
+            self.assertTrue(torch.equal(res_out, res_expected))
diff --git a/tests/test_models/test_backbones/test_csp_darknet.py b/tests/test_models/test_backbones/test_csp_darknet.py
new file mode 100644
index 0000000000..61b200b749
--- /dev/null
+++ b/tests/test_models/test_backbones/test_csp_darknet.py
@@ -0,0 +1,125 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import torch
+from torch.nn.modules import GroupNorm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmpose.models.backbones.csp_darknet import CSPDarknet
+
+
+def is_norm(modules):
+    """Check if is one of the norms."""
+    if isinstance(modules, (GroupNorm, _BatchNorm)):
+        return True
+    return False
+
+
+def check_norm_state(modules, train_state):
+    """Check if norm layer is in correct train state."""
+    for mod in modules:
+        if isinstance(mod, _BatchNorm):
+            if mod.training != train_state:
+                return False
+    return True
+
+
+class TestCSPDarknetBackbone(unittest.TestCase):
+
+    def test_invalid_frozen_stages(self):
+        with self.assertRaises(ValueError):
+            CSPDarknet(frozen_stages=6)
+
+    def test_invalid_out_indices(self):
+        with self.assertRaises(AssertionError):
+            CSPDarknet(out_indices=[6])
+
+    def test_frozen_stages(self):
+        frozen_stages = 1
+        model = CSPDarknet(frozen_stages=frozen_stages)
+        model.train()
+
+        for mod in model.stem.modules():
+            for param in mod.parameters():
+                self.assertFalse(param.requires_grad)
+        for i in range(1, frozen_stages + 1):
+            layer = getattr(model, f'stage{i}')
+            for mod in layer.modules():
+                if isinstance(mod, _BatchNorm):
+                    self.assertFalse(mod.training)
+            for param in layer.parameters():
+                self.assertFalse(param.requires_grad)
+
+    def test_norm_eval(self):
+        model = CSPDarknet(norm_eval=True)
+        model.train()
+
+        self.assertFalse(check_norm_state(model.modules(), True))
+
+    def test_csp_darknet_p5_forward(self):
+        model = CSPDarknet(
+            arch='P5', widen_factor=0.25, out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        self.assertEqual(len(feat), 5)
+        self.assertEqual(feat[0].shape, torch.Size((1, 16, 32, 32)))
+        self.assertEqual(feat[1].shape, torch.Size((1, 32, 16, 16)))
+        self.assertEqual(feat[2].shape, torch.Size((1, 64, 8, 8)))
+        self.assertEqual(feat[3].shape, torch.Size((1, 128, 4, 4)))
+        self.assertEqual(feat[4].shape, torch.Size((1, 256, 2, 2)))
+
+    def test_csp_darknet_p6_forward(self):
+        model = CSPDarknet(
+            arch='P6',
+            widen_factor=0.25,
+            out_indices=range(0, 6),
+            spp_kernal_sizes=(3, 5, 7))
+        model.train()
+
+        imgs = torch.randn(1, 3, 128, 128)
+        feat = model(imgs)
+        self.assertEqual(feat[0].shape, torch.Size((1, 16, 64, 64)))
+        self.assertEqual(feat[1].shape, torch.Size((1, 32, 32, 32)))
+        self.assertEqual(feat[2].shape, torch.Size((1, 64, 16, 16)))
+        self.assertEqual(feat[3].shape, torch.Size((1, 128, 8, 8)))
+        self.assertEqual(feat[4].shape, torch.Size((1, 192, 4, 4)))
+        self.assertEqual(feat[5].shape, torch.Size((1, 256, 2, 2)))
+
+    def test_csp_darknet_custom_arch_forward(self):
+        arch_ovewrite = [[32, 56, 3, True, False], [56, 224, 2, True, False],
+                         [224, 512, 1, True, False]]
+        model = CSPDarknet(
+            arch_ovewrite=arch_ovewrite,
+            widen_factor=0.25,
+            out_indices=(0, 1, 2, 3))
+        model.train()
+
+        imgs = torch.randn(1, 3, 32, 32)
+        feat = model(imgs)
+        self.assertEqual(len(feat), 4)
+        self.assertEqual(feat[0].shape, torch.Size((1, 8, 16, 16)))
+        self.assertEqual(feat[1].shape, torch.Size((1, 14, 8, 8)))
+        self.assertEqual(feat[2].shape, torch.Size((1, 56, 4, 4)))
+        self.assertEqual(feat[3].shape, torch.Size((1, 128, 2, 2)))
+
+    def test_csp_darknet_custom_arch_norm(self):
+        model = CSPDarknet(widen_factor=0.125, out_indices=range(0, 5))
+        for m in model.modules():
+            if is_norm(m):
+                self.assertIsInstance(m, _BatchNorm)
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        self.assertEqual(len(feat), 5)
+        self.assertEqual(feat[0].shape, torch.Size((1, 8, 32, 32)))
+        self.assertEqual(feat[1].shape, torch.Size((1, 16, 16, 16)))
+        self.assertEqual(feat[2].shape, torch.Size((1, 32, 8, 8)))
+        self.assertEqual(feat[3].shape, torch.Size((1, 64, 4, 4)))
+        self.assertEqual(feat[4].shape, torch.Size((1, 128, 2, 2)))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_models/test_data_preprocessors/test_data_preprocessor.py b/tests/test_models/test_data_preprocessors/test_data_preprocessor.py
new file mode 100644
index 0000000000..6c669f55a2
--- /dev/null
+++ b/tests/test_models/test_data_preprocessors/test_data_preprocessor.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+from mmengine.logging import MessageHub
+
+from mmpose.models.data_preprocessors import (BatchSyncRandomResize,
+                                              PoseDataPreprocessor)
+from mmpose.structures import PoseDataSample
+
+
+class TestPoseDataPreprocessor(TestCase):
+
+    def test_init(self):
+        # test mean is None
+        processor = PoseDataPreprocessor()
+        self.assertTrue(not hasattr(processor, 'mean'))
+        self.assertTrue(processor._enable_normalize is False)
+
+        # test mean is not None
+        processor = PoseDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1])
+        self.assertTrue(hasattr(processor, 'mean'))
+        self.assertTrue(hasattr(processor, 'std'))
+        self.assertTrue(processor._enable_normalize)
+
+        # mean and std must be specified together
+        with self.assertRaises(AssertionError):
+            PoseDataPreprocessor(mean=[0, 0, 0])
+
+        # bgr2rgb and rgb2bgr cannot be set to True at the same time
+        with self.assertRaises(AssertionError):
+            PoseDataPreprocessor(bgr_to_rgb=True, rgb_to_bgr=True)
+
+    def test_forward(self):
+        processor = PoseDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1])
+
+        data = {
+            'inputs': [torch.randint(0, 256, (3, 11, 10))],
+            'data_samples': [PoseDataSample()]
+        }
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+
+        self.assertEqual(batch_inputs.shape, (1, 3, 11, 10))
+        self.assertEqual(len(batch_data_samples), 1)
+
+        # test channel_conversion
+        processor = PoseDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs.shape, (1, 3, 11, 10))
+        self.assertEqual(len(batch_data_samples), 1)
+
+        # test padding
+        data = {
+            'inputs': [
+                torch.randint(0, 256, (3, 10, 11)),
+                torch.randint(0, 256, (3, 9, 14))
+            ],
+            'data_samples': [PoseDataSample()] * 2
+        }
+        processor = PoseDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs.shape, (2, 3, 10, 14))
+        self.assertEqual(len(batch_data_samples), 2)
+
+        # test pad_size_divisor
+        data = {
+            'inputs': [
+                torch.randint(0, 256, (3, 10, 11)),
+                torch.randint(0, 256, (3, 9, 24))
+            ],
+            'data_samples': [PoseDataSample()] * 2
+        }
+        processor = PoseDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], pad_size_divisor=5)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs.shape, (2, 3, 10, 25))
+        self.assertEqual(len(batch_data_samples), 2)
+        for data_samples, expected_shape in zip(batch_data_samples,
+                                                [(10, 15), (10, 25)]):
+            self.assertEqual(data_samples.pad_shape, expected_shape)
+
+    def test_batch_sync_random_resize(self):
+        processor = PoseDataPreprocessor(batch_augments=[
+            dict(
+                type='BatchSyncRandomResize',
+                random_size_range=(320, 320),
+                size_divisor=32,
+                interval=1)
+        ])
+        self.assertTrue(
+            isinstance(processor.batch_augments[0], BatchSyncRandomResize))
+        message_hub = MessageHub.get_instance('test_batch_sync_random_resize')
+        message_hub.update_info('iter', 0)
+        packed_inputs = {
+            'inputs': [
+                torch.randint(0, 256, (3, 128, 128)),
+                torch.randint(0, 256, (3, 128, 128))
+            ],
+            'data_samples': [PoseDataSample()] * 2
+        }
+        batch_inputs = processor(packed_inputs, training=True)['inputs']
+        self.assertEqual(batch_inputs.shape, (2, 3, 128, 128))
+
+        # resize after one iter
+        message_hub.update_info('iter', 1)
+        packed_inputs = {
+            'inputs': [
+                torch.randint(0, 256, (3, 128, 128)),
+                torch.randint(0, 256, (3, 128, 128))
+            ],
+            'data_samples':
+            [PoseDataSample(metainfo=dict(img_shape=(128, 128)))] * 2
+        }
+        batch_inputs = processor(packed_inputs, training=True)['inputs']
+        self.assertEqual(batch_inputs.shape, (2, 3, 320, 320))
+
+        packed_inputs = {
+            'inputs': [
+                torch.randint(0, 256, (3, 128, 128)),
+                torch.randint(0, 256, (3, 128, 128))
+            ],
+            'data_samples': [PoseDataSample()] * 2
+        }
+        batch_inputs = processor(packed_inputs, training=False)['inputs']
+        self.assertEqual(batch_inputs.shape, (2, 3, 128, 128))
diff --git a/tests/test_models/test_necks/test_yolox_pafpn.py b/tests/test_models/test_necks/test_yolox_pafpn.py
new file mode 100644
index 0000000000..89eae39a6c
--- /dev/null
+++ b/tests/test_models/test_necks/test_yolox_pafpn.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+
+from mmpose.models.necks import YOLOXPAFPN
+
+
+class TestYOLOXPAFPN(TestCase):
+
+    def test_forward(self):
+        in_channels = [128, 256, 512]
+        out_channels = 256
+        num_csp_blocks = 3
+
+        model = YOLOXPAFPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            num_csp_blocks=num_csp_blocks)
+        model.train()
+
+        inputs = [
+            torch.randn(1, c, 64 // (2**i), 64 // (2**i))
+            for i, c in enumerate(in_channels)
+        ]
+        outputs = model(inputs)
+
+        self.assertEqual(len(outputs), len(in_channels))
+        for out in outputs:
+            self.assertEqual(out.shape[1], out_channels)
diff --git a/tests/test_structures/test_bbox/test_bbox_overlaps.py b/tests/test_structures/test_bbox/test_bbox_overlaps.py
new file mode 100644
index 0000000000..b3523c8af5
--- /dev/null
+++ b/tests/test_structures/test_bbox/test_bbox_overlaps.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
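+# The three modes exercised below, for boxes in (x1, y1, x2, y2) format:
+# 'iou' is intersection over union, 'iof' is intersection over the area of
+# the boxes in the first argument, and 'giou' is generalized IoU, which
+# penalizes the empty area of the smallest enclosing box and can be negative
+# for disjoint boxes, as the expected values illustrate.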
+from unittest import TestCase
+
+import torch
+
+from mmpose.structures.bbox import bbox_overlaps
+
+
+class TestBBoxOverlaps(TestCase):
+
+    def test_bbox_overlaps_iou(self):
+        bboxes1 = torch.FloatTensor([
+            [0, 0, 10, 10],
+            [10, 10, 20, 20],
+            [32, 32, 38, 42],
+        ])
+        bboxes2 = torch.FloatTensor([
+            [0, 0, 10, 20],
+            [0, 10, 10, 19],
+            [10, 10, 20, 20],
+        ])
+        overlaps = bbox_overlaps(bboxes1, bboxes2)
+
+        expected_overlaps = torch.FloatTensor([
+            [0.5000, 0.0000, 0.0000],
+            [0.0000, 0.0000, 1.0000],
+            [0.0000, 0.0000, 0.0000],
+        ])
+
+        self.assertTrue(
+            torch.allclose(overlaps, expected_overlaps, rtol=1e-4, atol=1e-4))
+
+    def test_bbox_overlaps_iof(self):
+        bboxes1 = torch.FloatTensor([
+            [0, 0, 10, 10],
+            [10, 10, 20, 20],
+            [32, 32, 38, 42],
+        ])
+        bboxes2 = torch.FloatTensor([
+            [0, 0, 10, 20],
+            [0, 10, 10, 19],
+            [10, 10, 20, 20],
+        ])
+        overlaps = bbox_overlaps(bboxes1, bboxes2, mode='iof')
+
+        expected_overlaps = torch.FloatTensor([
+            [1., 0., 0.],
+            [0., 0., 1.],
+            [0., 0., 0.],
+        ])
+
+        self.assertTrue(
+            torch.allclose(overlaps, expected_overlaps, rtol=1e-4, atol=1e-4))
+
+    def test_bbox_overlaps_giou(self):
+        bboxes1 = torch.FloatTensor([
+            [0, 0, 10, 10],
+            [10, 10, 20, 20],
+            [32, 32, 38, 42],
+        ])
+        bboxes2 = torch.FloatTensor([
+            [0, 0, 10, 20],
+            [0, 10, 10, 19],
+            [10, 10, 20, 20],
+        ])
+        overlaps = bbox_overlaps(bboxes1, bboxes2, mode='giou')
+
+        expected_overlaps = torch.FloatTensor([
+            [0.5000, 0.0000, -0.5000],
+            [-0.2500, -0.0500, 1.0000],
+            [-0.8371, -0.8766, -0.8214],
+        ])
+
+        self.assertTrue(
+            torch.allclose(overlaps, expected_overlaps, rtol=1e-4, atol=1e-4))
diff --git a/tests/test_structures/test_bbox/test_bbox_transforms.py b/tests/test_structures/test_bbox/test_bbox_transforms.py
new file mode 100644
index 0000000000..b2eb3da683
--- /dev/null
+++ b/tests/test_structures/test_bbox/test_bbox_transforms.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
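+# Conventions covered in this file (as implied by the expectations below):
+# bbox_xyxy2corner expands an (x1, y1, x2, y2) box into four corner points
+# ordered (x1, y1), (x1, y2), (x2, y1), (x2, y2); bbox_corner2xyxy is its
+# inverse; get_pers_warp_matrix composes translation, scale, rotation and
+# shear into a single 3x3 warp matrix.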
+from unittest import TestCase
+
+import numpy as np
+
+from mmpose.structures.bbox import (bbox_clip_border, bbox_corner2xyxy,
+                                    bbox_xyxy2corner, get_pers_warp_matrix)
+
+
+class TestBBoxClipBorder(TestCase):
+
+    def test_bbox_clip_border_2D(self):
+        bbox = np.array([[10, 20], [60, 80], [-5, 25], [100, 120]])
+        shape = (50, 50)  # Example image shape
+        clipped_bbox = bbox_clip_border(bbox, shape)
+
+        expected_bbox = np.array([[10, 20], [50, 50], [0, 25], [50, 50]])
+
+        self.assertTrue(np.array_equal(clipped_bbox, expected_bbox))
+
+    def test_bbox_clip_border_4D(self):
+        bbox = np.array([
+            [[10, 20, 30, 40], [40, 50, 80, 90]],
+            [[-5, 0, 30, 40], [70, 80, 120, 130]],
+        ])
+        shape = (50, 60)  # Example image shape
+        clipped_bbox = bbox_clip_border(bbox, shape)
+
+        expected_bbox = np.array([
+            [[10, 20, 30, 40], [40, 50, 50, 60]],
+            [[0, 0, 30, 40], [50, 60, 50, 60]],
+        ])
+
+        self.assertTrue(np.array_equal(clipped_bbox, expected_bbox))
+
+
+class TestBBoxXYXY2Corner(TestCase):
+
+    def test_bbox_xyxy2corner_single(self):
+        bbox = np.array([0, 0, 100, 50])
+        corners = bbox_xyxy2corner(bbox)
+
+        expected_corners = np.array([[0, 0], [0, 50], [100, 0], [100, 50]])
+
+        self.assertTrue(np.array_equal(corners, expected_corners))
+
+    def test_bbox_xyxy2corner_multiple(self):
+        bboxes = np.array([[0, 0, 100, 50], [10, 20, 200, 150]])
+        corners = bbox_xyxy2corner(bboxes)
+
+        expected_corners = np.array([[[0, 0], [0, 50], [100, 0], [100, 50]],
+                                     [[10, 20], [10, 150], [200, 20],
+                                      [200, 150]]])
+
+        self.assertTrue(np.array_equal(corners, expected_corners))
+
+
+class TestBBoxCorner2XYXY(TestCase):
+
+    def test_bbox_corner2xyxy_single(self):
+
+        corners = np.array([[0, 0], [0, 50], [100, 0], [100, 50]])
+        xyxy = bbox_corner2xyxy(corners)
+        expected_xyxy = np.array([0, 0, 100, 50])
+
+        self.assertTrue(np.array_equal(xyxy, expected_xyxy))
+
+    def test_bbox_corner2xyxy_multiple(self):
+
+        corners = np.array([[[0, 0], [0, 50], [100, 0], [100, 50]],
+                            [[10, 20], [10, 150], [200, 20], [200, 150]]])
+        xyxy = bbox_corner2xyxy(corners)
+        expected_xyxy = np.array([[0, 0, 100, 50], [10, 20, 200, 150]])
+
+        self.assertTrue(np.array_equal(xyxy, expected_xyxy))
+
+
+class TestGetPersWarpMatrix(TestCase):
+
+    def test_get_pers_warp_matrix_identity(self):
+        center = np.array([0, 0])
+        translate = np.array([0, 0])
+        scale = 1.0
+        rot = 0.0
+        shear = np.array([0.0, 0.0])
+        warp_matrix = get_pers_warp_matrix(center, translate, scale, rot,
+                                           shear)
+
+        expected_matrix = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+                                   dtype=np.float32)
+
+        self.assertTrue(np.array_equal(warp_matrix, expected_matrix))
+
+    def test_get_pers_warp_matrix_translation(self):
+        center = np.array([0, 0])
+        translate = np.array([10, 20])
+        scale = 1.0
+        rot = 0.0
+        shear = np.array([0.0, 0.0])
+        warp_matrix = get_pers_warp_matrix(center, translate, scale, rot,
+                                           shear)
+
+        expected_matrix = np.array([[1, 0, 10], [0, 1, 20], [0, 0, 1]],
+                                   dtype=np.float32)
+
+        self.assertTrue(np.array_equal(warp_matrix, expected_matrix))
+
+    def test_get_pers_warp_matrix_scale_rotation_shear(self):
+        center = np.array([0, 0])
+        translate = np.array([0, 0])
+        scale = 1.5
+        rot = 45.0
+        shear = np.array([15.0, 30.0])
+        warp_matrix = get_pers_warp_matrix(center, translate, scale, rot,
+                                           shear)
+
+        expected_matrix = np.array([
+            [1.3448632, -0.77645713, 0.],
+            [1.6730325, 0.44828773, 0.],
+            [0., 0., 1.],
+        ],
+                                   dtype=np.float32)
+
+        # Use np.allclose to compare floating-point arrays within a tolerance
+        self.assertTrue(
+            np.allclose(warp_matrix, expected_matrix, rtol=1e-3, atol=1e-3))
diff --git a/tests/test_structures/test_keypoint/test_keypoint_transforms.py b/tests/test_structures/test_keypoint/test_keypoint_transforms.py
new file mode 100644
index 0000000000..5384ce2b14
--- /dev/null
+++ b/tests/test_structures/test_keypoint/test_keypoint_transforms.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import numpy as np
+
+from mmpose.structures import keypoint_clip_border
+
+
+class TestKeypointClipBorder(TestCase):
+
+    def test_keypoint_clip_border(self):
+        keypoints = np.array([[[10, 20], [30, 40], [-5, 25], [50, 60]]])
+        keypoints_visible = np.array([[1.0, 0.8, 0.5, 1.0]])
+        shape = (50, 50)  # Example frame shape
+
+        clipped_keypoints, clipped_keypoints_visible = keypoint_clip_border(
+            keypoints, keypoints_visible, shape)
+
+        # Check if keypoints outside the frame have visibility set to 0.0
+        self.assertEqual(clipped_keypoints_visible[0, 2], 0.0)
+        self.assertEqual(clipped_keypoints_visible[0, 3], 0.0)
+
+        # Check if keypoints inside the frame have unchanged visibility values
+        self.assertEqual(clipped_keypoints_visible[0, 0], 1.0)
+        self.assertEqual(clipped_keypoints_visible[0, 1], 0.8)
+
+        # Check if keypoints array shapes remain unchanged
+        self.assertEqual(keypoints.shape, clipped_keypoints.shape)
+        self.assertEqual(keypoints_visible.shape,
+                         clipped_keypoints_visible.shape)
+
+        keypoints = np.array([[[10, 20], [30, 40], [-5, 25], [50, 60]]])
+        keypoints_visible = np.array([[1.0, 0.8, 0.5, 1.0]])
+        keypoints_visible_weight = np.array([[1.0, 0.0, 1.0, 1.0]])
+        keypoints_visible = np.stack(
+            (keypoints_visible, keypoints_visible_weight), axis=-1)
+        shape = (50, 50)  # Example frame shape
+
+        clipped_keypoints, clipped_keypoints_visible = keypoint_clip_border(
+            keypoints, keypoints_visible, shape)
+
+        # Check if keypoints array shapes remain unchanged
+        self.assertEqual(keypoints.shape, clipped_keypoints.shape)
+        self.assertEqual(keypoints_visible.shape,
+                         clipped_keypoints_visible.shape)
+
+        # Check if keypoints outside the frame have visibility set to 0.0
+        self.assertEqual(clipped_keypoints_visible[0, 2, 0], 0.0)
+        self.assertEqual(clipped_keypoints_visible[0, 3, 0], 0.0)
+
+        # Check if keypoints inside the frame have unchanged visibility values
+        self.assertEqual(clipped_keypoints_visible[0, 0, 0], 1.0)
+        self.assertEqual(clipped_keypoints_visible[0, 1, 0], 0.8)
+
+        # Check if the visibility weights remain unchanged
+        self.assertSequenceEqual(clipped_keypoints_visible[..., 1].tolist(),
+                                 keypoints_visible[..., 1].tolist())

From ab39ce7505b0cc74cc76355b766987d460c07e73 Mon Sep 17 00:00:00 2001
From: Peng Lu
Date: Thu, 14 Sep 2023 17:44:54 +0800
Subject: [PATCH 3/4] [Fix] Fix typo in COCOMetric (#2691)

---
 mmpose/evaluation/metrics/coco_metric.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmpose/evaluation/metrics/coco_metric.py b/mmpose/evaluation/metrics/coco_metric.py
index d1c7191338..8fc32dd809 100644
--- a/mmpose/evaluation/metrics/coco_metric.py
+++ b/mmpose/evaluation/metrics/coco_metric.py
@@ -526,7 +526,7 @@ def results2json(self, keypoints: Dict[int, list],
                 'score': float(img_kpt['score']),
             }
             if 'bbox' in img_kpt:
-                res['bbox'] = img_kpt['bbox'].tolist(),
+                res['bbox'] = img_kpt['bbox'].tolist()
             result.append(res)
 
         cat_results.extend(result)

From 5530c3b3a1f044bacd4d403a9b7cccf7729539b0 Mon Sep 17 00:00:00 2001
From: Peng Lu
Date: Fri, 15 Sep 2023 16:27:58 +0800
Subject: [PATCH 4/4] [Fix] Fix bug raised by changing bbox_center to input_center (#2693)

---
 mmpose/structures/utils.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/mmpose/structures/utils.py b/mmpose/structures/utils.py
index 882cda8603..616b139c54 100644
--- a/mmpose/structures/utils.py
+++ b/mmpose/structures/utils.py
@@ -50,8 +50,7 @@ def merge_data_samples(data_samples: List[PoseDataSample]) -> PoseDataSample:
             0].pred_fields:
         reverted_heatmaps = [
             revert_heatmap(data_sample.pred_fields.heatmaps,
-                           data_sample.gt_instances.bbox_centers,
-                           data_sample.gt_instances.bbox_scales,
+                           data_sample.input_center, data_sample.input_scale,
                            data_sample.ori_shape)
             for data_sample in data_samples
         ]
@@ -65,8 +64,7 @@ def merge_data_samples(data_samples: List[PoseDataSample]) -> PoseDataSample:
             0].gt_fields:
         reverted_heatmaps = [
             revert_heatmap(data_sample.gt_fields.heatmaps,
-                           data_sample.gt_instances.bbox_centers,
-                           data_sample.gt_instances.bbox_scales,
+                           data_sample.input_center, data_sample.input_scale,
                            data_sample.ori_shape)
             for data_sample in data_samples
         ]
@@ -79,13 +77,13 @@ def merge_data_samples(data_samples: List[PoseDataSample]) -> PoseDataSample:
     return merged
 
 
-def revert_heatmap(heatmap, bbox_center, bbox_scale, img_shape):
+def revert_heatmap(heatmap, input_center, input_scale, img_shape):
     """Revert predicted heatmap on the original image.
 
     Args:
         heatmap (np.ndarray or torch.tensor): predicted heatmap.
-        bbox_center (np.ndarray): bounding box center coordinate.
-        bbox_scale (np.ndarray): bounding box scale.
+        input_center (np.ndarray): center of the model input region.
+        input_scale (np.ndarray): scale (size) of the model input region.
         img_shape (tuple or list): size of original image.
     """
    if torch.is_tensor(heatmap):
@@ -99,8 +97,8 @@ def revert_heatmap(heatmap, bbox_center, bbox_scale, img_shape):
     hm_h, hm_w = heatmap.shape[:2]
     img_h, img_w = img_shape
     warp_mat = get_warp_matrix(
-        bbox_center.reshape((2, )),
-        bbox_scale.reshape((2, )),
+        input_center.reshape((2, )),
+        input_scale.reshape((2, )),
         rot=0,
         output_size=(hm_w, hm_h),
         inv=True)
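
A closing note on the PATCH 4/4 fix: after the rename, revert_heatmap reads the model-input geometry (input_center, input_scale) recorded on the data sample during preprocessing, instead of the old gt_instances.bbox_centers / bbox_scales. A minimal usage sketch follows; the values are purely illustrative, not taken from a real pipeline, where these fields are filled in by the input-preparing transforms:

    import numpy as np
    from mmpose.structures.utils import revert_heatmap

    heatmap = np.random.rand(64, 48).astype(np.float32)  # (H, W) heatmap in model-input space
    input_center = np.array([160.0, 120.0])  # illustrative center (x, y) of the input region
    input_scale = np.array([192.0, 256.0])   # illustrative size (w, h) of the input region
    ori_shape = (240, 320)                   # original image shape (h, w)

    # Builds the inverse warp from heatmap space back to the original image
    # plane and applies it to the heatmap.
    reverted = revert_heatmap(heatmap, input_center, input_scale, ori_shape)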