From 9742491d926ca173fa2c3ffce06b0cc671e9d1ab Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Wed, 13 Sep 2023 19:06:07 +0800 Subject: [PATCH 1/4] [Fix] fix vitpose pretrained ckpts (#2687) --- .../coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py | 2 +- .../coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py index 9732371787..5a55780505 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_base.pth'), + 'v1/pretrained_models/mae_pretrain_vit_base_20230913.pth'), ), neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), head=dict( diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py index fc08c61dff..06522b7b91 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_base.pth'), + 'v1/pretrained_models/mae_pretrain_vit_base_20230913.pth'), ), head=dict( type='HeatmapHead', diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py index 7d94f97c1b..03ae669807 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_huge.pth'), + 'v1/pretrained_models/mae_pretrain_vit_huge_20230913.pth'), ), neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), head=dict( diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py index 4aa2c21c1f..6b8afcf0f4 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 
'v1/pretrained_models/mae_pretrain_vit_huge.pth'), + 'v1/pretrained_models/mae_pretrain_vit_huge_20230913.pth'), ), head=dict( type='HeatmapHead', diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py index cf875d5167..2035e786df 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_large.pth'), + 'v1/pretrained_models/mae_pretrain_vit_large_20230913.pth'), ), neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), head=dict( diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py index 5ba6eafb4b..f1d0e90578 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_large.pth'), + 'v1/pretrained_models/mae_pretrain_vit_large_20230913.pth'), ), head=dict( type='HeatmapHead', diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py index 88bd3e43e3..d8216089b7 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py @@ -76,7 +76,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_small.pth'), + 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'), ), neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), head=dict( diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py index 791f9b5945..5b77da96eb 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py @@ -76,7 +76,7 @@ init_cfg=dict( type='Pretrained', checkpoint='https://download.openmmlab.com/mmpose/' - 'v1/pretrained_models/mae_pretrain_vit_small.pth'), + 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'), ), head=dict( type='HeatmapHead', From 7bea17cdf362cdd731fc5e79b982e4b79be5cca9 Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Thu, 14 Sep 2023 14:05:03 +0800 Subject: [PATCH 2/4] [Refactor] Refactor YOLOX-Pose into mmpose core package (#2620) --- configs/body_2d_keypoint/yoloxpose/README.md | 22 + .../yoloxpose/coco/yoloxpose_coco.md | 59 ++ .../yoloxpose/coco/yoloxpose_coco.yml | 72 ++ .../coco/yoloxpose_l_8xb32-300e_coco-640.py | 17 + .../coco/yoloxpose_m_8xb32-300e_coco-640.py | 16 + 
.../coco/yoloxpose_s_8xb32-300e_coco-640.py | 266 +++++++ .../yoloxpose_tiny_4xb64-300e_coco-416.py | 77 ++ docs/src/papers/algorithms/yolopose.md | 30 + mmpose/codecs/__init__.py | 3 +- mmpose/codecs/annotation_processors.py | 92 +++ mmpose/datasets/dataset_wrappers.py | 8 + .../datasets/base/base_coco_style_dataset.py | 27 +- .../datasets/datasets/body/jhmdb_dataset.py | 4 + mmpose/datasets/datasets/body/mpii_dataset.py | 8 + mmpose/datasets/transforms/__init__.py | 12 +- .../transforms/bottomup_transforms.py | 184 +++-- .../datasets/transforms/common_transforms.py | 177 +++++ .../datasets/transforms/mix_img_transforms.py | 501 ++++++++++++ mmpose/engine/__init__.py | 1 + mmpose/engine/hooks/__init__.py | 7 +- mmpose/engine/hooks/mode_switch_hooks.py | 65 ++ mmpose/engine/hooks/sync_norm_hook.py | 41 + mmpose/engine/schedulers/__init__.py | 8 + mmpose/engine/schedulers/quadratic_warmup.py | 131 +++ mmpose/evaluation/functional/__init__.py | 5 +- mmpose/evaluation/functional/nms.py | 41 + mmpose/evaluation/metrics/coco_metric.py | 24 +- mmpose/models/backbones/__init__.py | 5 +- mmpose/models/backbones/csp_darknet.py | 286 +++++++ mmpose/models/backbones/cspnext.py | 195 +++++ mmpose/models/data_preprocessors/__init__.py | 3 +- .../data_preprocessors/batch_augmentation.py | 115 +++ .../data_preprocessors/data_preprocessor.py | 90 +++ mmpose/models/heads/hybrid_heads/__init__.py | 3 +- .../heads/hybrid_heads/yoloxpose_head.py | 752 ++++++++++++++++++ mmpose/models/losses/__init__.py | 7 +- mmpose/models/losses/bbox_loss.py | 72 ++ mmpose/models/losses/classification_loss.py | 22 +- mmpose/models/losses/regression_loss.py | 123 ++- mmpose/models/necks/__init__.py | 5 +- mmpose/models/necks/channel_mapper.py | 106 +++ mmpose/models/necks/yolox_pafpn.py | 156 ++++ mmpose/models/pose_estimators/base.py | 15 +- mmpose/models/pose_estimators/bottomup.py | 13 +- mmpose/models/task_modules/__init__.py | 3 + .../models/task_modules/assigners/__init__.py | 5 + .../assigners/metric_calculators.py | 108 +++ .../assigners/sim_ota_assigner.py | 284 +++++++ .../task_modules/prior_generators/__init__.py | 2 + .../prior_generators/mlvl_point_generator.py | 245 ++++++ mmpose/models/utils/__init__.py | 4 +- mmpose/models/utils/csp_layer.py | 273 +++++++ mmpose/models/utils/misc.py | 76 ++ mmpose/registry.py | 6 +- mmpose/structures/__init__.py | 13 +- mmpose/structures/bbox/__init__.py | 12 +- mmpose/structures/bbox/bbox_overlaps.py | 117 +++ mmpose/structures/bbox/transforms.py | 157 +++- mmpose/structures/keypoint/__init__.py | 7 +- mmpose/structures/keypoint/transforms.py | 30 + mmpose/utils/__init__.py | 3 +- mmpose/utils/dist_utils.py | 11 + mmpose/utils/tensor_utils.py | 3 + model-index.yml | 1 + projects/yolox_pose/README.md | 10 +- projects/yolox_pose/datasets/__init__.py | 10 + projects/yolox_pose/models/__init__.py | 10 + .../test_codecs/test_annotation_processors.py | 35 + .../test_body_datasets/test_aic_dataset.py | 1 + .../test_body_datasets/test_coco_dataset.py | 1 + .../test_crowdpose_dataset.py | 1 + .../test_body_datasets/test_jhmdb_dataset.py | 1 + .../test_body_datasets/test_mhp_dataset.py | 1 + .../test_body_datasets/test_mpii_dataset.py | 1 + .../test_posetrack18_dataset.py | 1 + .../test_transforms/test_common_transforms.py | 139 +++- .../test_transforms/test_mix_img_transform.py | 115 +++ .../test_hooks/test_mode_switch_hooks.py | 67 ++ .../test_hooks/test_sync_norm_hook.py | 44 + .../test_schedulers/test_quadratic_warmup.py | 108 +++ .../test_functional/test_nms.py | 21 +- 
.../test_backbones/test_csp_darknet.py | 125 +++ .../test_data_preprocessor.py | 135 ++++ .../test_necks/test_yolox_pafpn.py | 30 + .../test_bbox/test_bbox_overlaps.py | 75 ++ .../test_bbox/test_bbox_transforms.py | 126 +++ .../test_keypoint/test_keypoint_transforms.py | 57 ++ 87 files changed, 6236 insertions(+), 104 deletions(-) create mode 100644 configs/body_2d_keypoint/yoloxpose/README.md create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py create mode 100644 configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py create mode 100644 docs/src/papers/algorithms/yolopose.md create mode 100644 mmpose/codecs/annotation_processors.py create mode 100644 mmpose/datasets/transforms/mix_img_transforms.py create mode 100644 mmpose/engine/hooks/mode_switch_hooks.py create mode 100644 mmpose/engine/hooks/sync_norm_hook.py create mode 100644 mmpose/engine/schedulers/__init__.py create mode 100644 mmpose/engine/schedulers/quadratic_warmup.py create mode 100644 mmpose/models/backbones/csp_darknet.py create mode 100644 mmpose/models/backbones/cspnext.py create mode 100644 mmpose/models/data_preprocessors/batch_augmentation.py create mode 100644 mmpose/models/heads/hybrid_heads/yoloxpose_head.py create mode 100644 mmpose/models/losses/bbox_loss.py create mode 100644 mmpose/models/necks/channel_mapper.py create mode 100644 mmpose/models/necks/yolox_pafpn.py create mode 100644 mmpose/models/task_modules/__init__.py create mode 100644 mmpose/models/task_modules/assigners/__init__.py create mode 100644 mmpose/models/task_modules/assigners/metric_calculators.py create mode 100644 mmpose/models/task_modules/assigners/sim_ota_assigner.py create mode 100644 mmpose/models/task_modules/prior_generators/__init__.py create mode 100644 mmpose/models/task_modules/prior_generators/mlvl_point_generator.py create mode 100644 mmpose/models/utils/csp_layer.py create mode 100644 mmpose/models/utils/misc.py create mode 100644 mmpose/structures/bbox/bbox_overlaps.py create mode 100644 mmpose/utils/dist_utils.py create mode 100644 tests/test_codecs/test_annotation_processors.py create mode 100644 tests/test_datasets/test_transforms/test_mix_img_transform.py create mode 100644 tests/test_engine/test_hooks/test_mode_switch_hooks.py create mode 100644 tests/test_engine/test_hooks/test_sync_norm_hook.py create mode 100644 tests/test_engine/test_schedulers/test_quadratic_warmup.py create mode 100644 tests/test_models/test_backbones/test_csp_darknet.py create mode 100644 tests/test_models/test_data_preprocessors/test_data_preprocessor.py create mode 100644 tests/test_models/test_necks/test_yolox_pafpn.py create mode 100644 tests/test_structures/test_bbox/test_bbox_overlaps.py create mode 100644 tests/test_structures/test_bbox/test_bbox_transforms.py create mode 100644 tests/test_structures/test_keypoint/test_keypoint_transforms.py diff --git a/configs/body_2d_keypoint/yoloxpose/README.md b/configs/body_2d_keypoint/yoloxpose/README.md new file mode 100644 index 0000000000..8195b1e236 --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/README.md @@ -0,0 +1,22 @@ +# YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using 
Object Keypoint Similarity Loss + + + +
+YOLO-Pose (CVPRW'2022) + +```bibtex +@inproceedings{maji2022yolo, + title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss}, + author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2637--2646}, + year={2022} +} +``` + +
+ +YOLO-Pose is a bottom-up pose estimation approach that simultaneously detects all person instances and regresses keypoint locations in a single pass. + +We implement **YOLOX-Pose** based on the **YOLOX** object detection framework, inheriting the benefits of unified pose estimation and object detection from YOLO-Pose. To predict keypoint locations more accurately, separate branches with adaptive convolutions are used to regress the offsets for different joints. This allows the feature extraction to be optimized for each keypoint. diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md new file mode 100644 index 0000000000..264673d53d --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md @@ -0,0 +1,59 @@ + +
+YOLO-Pose (CVPRW'2022) + +```bibtex +@inproceedings{maji2022yolo, + title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss}, + author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2637--2646}, + year={2022} +} +``` + +
+ + + +
+YOLOX + +```bibtex +@article{ge2021yolox, + title={Yolox: Exceeding yolo series in 2021}, + author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2107.08430}, + year={2021} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [yoloxpose_tiny](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py) | 416x416 | 0.527 | 0.794 | 0.557 | 0.577 | 0.843 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-76eb44ca_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-20230829.json) | +| [yoloxpose_s](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py) | 640x640 | 0.642 | 0.873 | 0.702 | 0.688 | 0.912 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-20230829.json) | +| [yoloxpose_m](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py) | 640x640 | 0.697 | 0.903 | 0.766 | 0.739 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-84e9a538_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-20230829.json) | +| [yoloxpose_l](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py) | 640x640 | 0.714 | 0.906 | 0.785 | 0.756 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-de0f8dee_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-20230829.json) | diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml new file mode 100644 index 0000000000..cd745f39a2 --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml @@ -0,0 +1,72 @@ +Collections: +- Name: YOLOXPose + Paper: + Title: 'YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss' + URL: https://arxiv.org/abs/2204.06806 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/yolopose.md +Models: +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py + In Collection: YOLOXPose + Metadata: + Architecture: &id001 + - YOLOXPose + Training Data: COCO + Name: yoloxpose_tiny_4xb64-300e_coco-416 + Results: + - Dataset: COCO + Metrics: + AP: 0.527 + AP@0.5: 0.794 + AP@0.75: 0.557 + AR: 0.577 + AR@0.5: 0.843 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-76eb44ca_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_s_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.642 + AP@0.5: 0.873 + AP@0.75: 0.702 + AR: 0.688 + AR@0.5: 0.912 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_m_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.697 + AP@0.5: 0.903 + AP@0.75: 0.766 + AR: 0.739 + AR@0.5: 0.933 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-84e9a538_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_l_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.714 + AP@0.5: 0.906 + AP@0.75: 0.785 + AR: 0.756 + AR@0.5: 0.934 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-de0f8dee_20230829.pth diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py new file mode 100644 index 0000000000..95a012bd6b --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py @@ -0,0 +1,17 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +widen_factor = 1 +deepen_factor = 1 +checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_' \ 'l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth' + +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict( + in_channels=[256, 512, 1024], out_channels=256, num_csp_blocks=3), + head=dict(head_module_cfg=dict(widen_factor=widen_factor))) diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py new file mode 100644 index 0000000000..06eb0322e4 --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py @@ -0,0 +1,16 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +widen_factor = 0.75 +deepen_factor = 0.67 +checkpoint = 'https://download.openmmlab.com/mmpose/v1/pretrained_models/' \ 'yolox_m_8x8_300e_coco_20230829.pth' + +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + head=dict(head_module_cfg=dict(widen_factor=widen_factor))) diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py new file mode 100644 index 0000000000..635d243397 --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py @@ -0,0 +1,266 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict( + _delete_=True, + type='EpochBasedTrainLoop', + max_epochs=300, + val_interval=10, + dynamic_intervals=[(280, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.004, 
weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + ), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=280, end=300), +] + +# model +widen_factor = 0.5 +deepen_factor = 0.33 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_' + '20211121_095711-4592a793.pth', + prefix='backbone.', + )), + neck=dict( + type='YOLOXPAFPN', + in_channels=[128, 256, 512], + out_channels=128, + num_csp_blocks=1, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + head=dict( + type='YOLOXPoseHead', + num_keypoints=17, + featmap_strides=(8, 16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + feat_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + prior_generator=dict( + type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]), + assigner=dict(type='SimOTAAssigner', dynamic_k_indicator='oks'), + overlaps_power=0.5, + loss_cls=dict(type='BCELoss', reduction='sum', loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj=dict( + type='BCELoss', + use_target_weight=True, + reduction='sum', + loss_weight=1.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo='configs/_base_/datasets/coco.py', + norm_target_weight=True, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + score_thr=0.001, + nms_thr=0.65, + )) + +# data +input_size = (640, 640) +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', 
backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_coco) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] diff --git a/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py new file mode 100644 index 0000000000..f918e8b16f --- /dev/null +++ b/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py @@ -0,0 +1,77 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +# model settings +widen_factor = 0.375 +deepen_factor = 0.33 +checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_' \ 'tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth' + +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=1), + ]), + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict( + in_channels=[96, 192, 384], + out_channels=96, + ), + head=dict(head_module_cfg=dict(widen_factor=widen_factor), )) + +# 
dataset settings +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=_base_.input_size, + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=_base_.input_size, + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=_base_.codec), + dict( + type='PackPoseInputs', + extra_mapping_labels={ + 'bbox': 'bboxes', + 'bbox_labels': 'labels', + 'keypoints': 'keypoints', + 'keypoints_visible': 'keypoints_visible', + 'area': 'areas' + }), +] +train_dataloader = dict( + batch_size=64, dataset=dict(pipeline=train_pipeline_stage1)) + +input_size = (416, 416) +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict(dataset=dict(pipeline=val_pipeline, )) +test_dataloader = val_dataloader diff --git a/docs/src/papers/algorithms/yolopose.md b/docs/src/papers/algorithms/yolopose.md new file mode 100644 index 0000000000..fe1f41a804 --- /dev/null +++ b/docs/src/papers/algorithms/yolopose.md @@ -0,0 +1,30 @@ +# YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss + + + +
+YOLO-Pose (CVPRW'2022) + +```bibtex +@inproceedings{maji2022yolo, + title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss}, + author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2637--2646}, + year={2022} +} +``` + +
+ +## Abstract + + + +We introduce YOLO-pose, a novel heatmap-free approach for joint detection, and 2D multi-person pose estimation in an image based on the popular YOLO object detection framework. Existing heatmap based two-stage approaches are sub-optimal as they are not end-to-end trainable and training relies on a surrogate L1 loss that is not equivalent to maximizing the evaluation metric, i.e. Object Keypoint Similarity (OKS). Our framework allows us to train the model end-to-end and optimize the OKS metric itself. The proposed model learns to jointly detect bounding boxes for multiple persons and their corresponding 2D poses in a single forward pass and thus bringing in the best of both top-down and bottom-up approaches. Proposed approach doesn't require the postprocessing of bottom-up approaches to group detected keypoints into a skeleton as each bounding box has an associated pose, resulting in an inherent grouping of the keypoints. Unlike top-down approaches, multiple forward passes are done away with since all persons are localized along with their pose in a single inference. YOLO-pose achieves new state-of-the-art results on COCO validation (90.2% AP50) and test-dev set (90.3% AP50), surpassing all existing bottom-up approaches in a single forward pass without flip test, multi-scale testing, or any other test time augmentation. All experiments and results reported in this paper are without any test time augmentation, unlike traditional approaches that use flip-test and multi-scale testing to boost performance. + + + +
+ +
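As background for the OKS-based components this patch adds (`OKSLoss` and the `oks` dynamic-k indicator in `SimOTAAssigner`), the Object Keypoint Similarity that the paper optimizes directly can be written in a few lines. The following is an illustrative NumPy sketch of the COCO definition, not code from this patch; `object_keypoint_similarity` and its argument layout are assumptions, and `sigmas` stands in for the per-keypoint constants from the COCO keypoint metainfo:

```python
import numpy as np

def object_keypoint_similarity(pred, gt, visible, area, sigmas):
    """COCO-style OKS between one predicted and one ground-truth pose.

    pred, gt: (K, 2) keypoint coordinates; visible: (K,) visibility flags;
    area: ground-truth object area; sigmas: (K,) per-keypoint constants.
    """
    d2 = np.sum((pred - gt) ** 2, axis=-1)   # squared distance per keypoint
    vars_ = (2 * sigmas) ** 2                # per-keypoint tolerance
    sim = np.exp(-d2 / (2 * area * vars_ + np.spacing(1)))
    mask = visible > 0                       # average over labeled keypoints
    return float(sim[mask].mean()) if mask.any() else 0.0
```

Because this expression is differentiable in `pred`, a loss of the form `1 - OKS` can be minimized end-to-end, which is the core idea behind the `OKSLoss` used in the configs below.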
diff --git a/mmpose/codecs/__init__.py b/mmpose/codecs/__init__.py index 1a48b7f851..102a202e7d 100644 --- a/mmpose/codecs/__init__.py +++ b/mmpose/codecs/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .annotation_processors import YOLOXPoseAnnotationProcessor from .associative_embedding import AssociativeEmbedding from .decoupled_heatmap import DecoupledHeatmap from .image_pose_lifting import ImagePoseLifting @@ -16,5 +17,5 @@ 'MSRAHeatmap', 'MegviiHeatmap', 'UDPHeatmap', 'RegressionLabel', 'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR', 'DecoupledHeatmap', 'VideoPoseLifting', 'ImagePoseLifting', - 'MotionBERTLabel' + 'MotionBERTLabel', 'YOLOXPoseAnnotationProcessor' ] diff --git a/mmpose/codecs/annotation_processors.py b/mmpose/codecs/annotation_processors.py new file mode 100644 index 0000000000..7add52c420 --- /dev/null +++ b/mmpose/codecs/annotation_processors.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple + +import numpy as np + +from mmpose.registry import KEYPOINT_CODECS +from .base import BaseKeypointCodec + +INF = 1e6 +NEG_INF = -1e6 + + +class BaseAnnotationProcessor(BaseKeypointCodec): + """Base class for annotation processors.""" + + def decode(self, *args, **kwargs): + pass + + +@KEYPOINT_CODECS.register_module() +class YOLOXPoseAnnotationProcessor(BaseAnnotationProcessor): + """Convert dataset annotations to the input format of YOLOX-Pose. + + This processor expands bounding boxes and converts category IDs to labels. + + Args: + expand_bbox (bool, optional): Whether to expand the bounding box + to include all keypoints. Defaults to False. + input_size (tuple, optional): The size of the input image for the + model, formatted as (h, w). This argument is required by the + codec in deployment but is not actually used. + """ + + auxiliary_encode_keys = {'category_id', 'bbox'} + instance_mapping_table = dict( + bbox='bboxes', + bbox_labels='labels', + keypoints='keypoints', + keypoints_visible='keypoints_visible', + area='areas', + ) + + def __init__(self, + expand_bbox: bool = False, + input_size: Optional[Tuple] = None): + super().__init__() + self.expand_bbox = expand_bbox + + def encode(self, + keypoints: Optional[np.ndarray] = None, + keypoints_visible: Optional[np.ndarray] = None, + bbox: Optional[np.ndarray] = None, + category_id: Optional[List[int]] = None + ) -> Dict[str, np.ndarray]: + """Encode keypoints, bounding boxes, and category IDs. + + Args: + keypoints (np.ndarray, optional): Keypoints array. Defaults + to None. + keypoints_visible (np.ndarray, optional): Visibility array for + keypoints. Defaults to None. + bbox (np.ndarray, optional): Bounding box array. Defaults to None. + category_id (List[int], optional): List of category IDs. Defaults + to None. + + Returns: + Dict[str, np.ndarray]: Encoded annotations. 
+ """ + results = {} + + if self.expand_bbox and bbox is not None: + # Handle keypoints visibility + if keypoints_visible.ndim == 3: + keypoints_visible = keypoints_visible[..., 0] + + # Expand bounding box to include keypoints + kpts_min = keypoints.copy() + kpts_min[keypoints_visible == 0] = INF + bbox[..., :2] = np.minimum(bbox[..., :2], kpts_min.min(axis=1)) + + kpts_max = keypoints.copy() + kpts_max[keypoints_visible == 0] = NEG_INF + bbox[..., 2:] = np.maximum(bbox[..., 2:], kpts_max.max(axis=1)) + + results['bbox'] = bbox + + if category_id is not None: + # Convert category IDs to labels + bbox_labels = np.array(category_id).astype(np.int8) - 1 + results['bbox_labels'] = bbox_labels + + return results diff --git a/mmpose/datasets/dataset_wrappers.py b/mmpose/datasets/dataset_wrappers.py index 553191fd43..48bb3fc2a4 100644 --- a/mmpose/datasets/dataset_wrappers.py +++ b/mmpose/datasets/dataset_wrappers.py @@ -109,6 +109,11 @@ def prepare_data(self, idx: int) -> Any: data_info = self.get_data_info(idx) + # the assignment of 'dataset' should not be performed within the + # `get_data_info` function. Otherwise, it can lead to the mixed + # data augmentation process getting stuck. + data_info['dataset'] = self + return self.pipeline(data_info) def get_data_info(self, idx: int) -> dict: @@ -123,6 +128,9 @@ def get_data_info(self, idx: int) -> dict: # Get data sample processed by ``subset.pipeline`` data_info = self.datasets[subset_idx][sample_idx] + if 'dataset' in data_info: + data_info.pop('dataset') + # Add metainfo items that are required in the pipeline and the model metainfo_keys = [ 'upper_body_ids', 'lower_body_ids', 'flip_pairs', diff --git a/mmpose/datasets/datasets/base/base_coco_style_dataset.py b/mmpose/datasets/datasets/base/base_coco_style_dataset.py index f0032aef9e..e0f00de5dc 100644 --- a/mmpose/datasets/datasets/base/base_coco_style_dataset.py +++ b/mmpose/datasets/datasets/base/base_coco_style_dataset.py @@ -2,7 +2,7 @@ import copy import os.path as osp from copy import deepcopy -from itertools import filterfalse, groupby +from itertools import chain, filterfalse, groupby from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import numpy as np @@ -159,6 +159,14 @@ def prepare_data(self, idx) -> Any: """ data_info = self.get_data_info(idx) + # Mixed image transformations require multiple source images for + # effective blending. Therefore, we assign the 'dataset' field in + # `data_info` to provide these auxiliary images. + # Note: The 'dataset' assignment should not occur within the + # `get_data_info` function, as doing so may cause the mixed image + # transformations to stall or hang. 
+ data_info['dataset'] = self + return self.pipeline(data_info) def get_data_info(self, idx: int) -> dict: @@ -288,6 +296,12 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: else: num_keypoints = np.count_nonzero(keypoints.max(axis=2)) + if 'area' in ann: + area = np.array(ann['area'], dtype=np.float32) + else: + area = np.clip((x2 - x1) * (y2 - y1) * 0.53, a_min=1.0, a_max=None) + area = np.array(area, dtype=np.float32) + data_info = { 'img_id': ann['image_id'], 'img_path': img['img_path'], @@ -296,10 +310,11 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: 'num_keypoints': num_keypoints, 'keypoints': keypoints, 'keypoints_visible': keypoints_visible, + 'area': area, 'iscrowd': ann.get('iscrowd', 0), 'segmentation': ann.get('segmentation', None), 'id': ann['id'], - 'category_id': ann['category_id'], + 'category_id': np.array(ann['category_id']), # store the raw annotation of the instance # it is useful for evaluation without providing ann_file 'raw_ann_info': copy.deepcopy(ann), @@ -365,7 +380,13 @@ def _get_bottomup_data_infos(self, instance_list: List[Dict], if key not in data_info_bu: seq = [d[key] for d in data_infos] if isinstance(seq[0], np.ndarray): - seq = np.concatenate(seq, axis=0) + if seq[0].ndim > 0: + seq = np.concatenate(seq, axis=0) + else: + seq = np.stack(seq, axis=0) + elif isinstance(seq[0], (tuple, list)): + seq = list(chain.from_iterable(seq)) + data_info_bu[key] = seq # The segmentation annotation of invalid objects will be used diff --git a/mmpose/datasets/datasets/body/jhmdb_dataset.py b/mmpose/datasets/datasets/body/jhmdb_dataset.py index 7d72a7ddc5..940a4cd4dc 100644 --- a/mmpose/datasets/datasets/body/jhmdb_dataset.py +++ b/mmpose/datasets/datasets/body/jhmdb_dataset.py @@ -118,6 +118,8 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: keypoints_visible = np.minimum(1, _keypoints[..., 2]) num_keypoints = np.count_nonzero(keypoints.max(axis=2)) + area = np.clip((x2 - x1) * (y2 - y1) * 0.53, a_min=1.0, a_max=None) + category_id = ann.get('category_id', [1] * len(keypoints)) data_info = { 'img_id': ann['image_id'], @@ -127,9 +129,11 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: 'num_keypoints': num_keypoints, 'keypoints': keypoints, 'keypoints_visible': keypoints_visible, + 'area': np.array(area, dtype=np.float32), 'iscrowd': ann.get('iscrowd', 0), 'segmentation': ann.get('segmentation', None), 'id': ann['id'], + 'category_id': category_id, } return data_info diff --git a/mmpose/datasets/datasets/body/mpii_dataset.py b/mmpose/datasets/datasets/body/mpii_dataset.py index bdb3797a54..5490f6f0dd 100644 --- a/mmpose/datasets/datasets/body/mpii_dataset.py +++ b/mmpose/datasets/datasets/body/mpii_dataset.py @@ -184,6 +184,12 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: keypoints = np.array(ann['joints']).reshape(1, -1, 2) keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) + x1, y1, x2, y2 = np.split(bbox, axis=1, indices_or_sections=4) + area = np.clip((x2 - x1) * (y2 - y1) * 0.53, a_min=1.0, a_max=None) + area = area[..., 0].astype(np.float32) + + category_id = ann.get('category_id', [1] * len(bbox)) + instance_info = { 'id': ann_id, 'img_id': int(ann['image'].split('.')[0]), @@ -194,6 +200,8 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 'bbox_score': np.ones(1, dtype=np.float32), 'keypoints': keypoints, 'keypoints_visible': keypoints_visible, + 'area': area, + 'category_id': category_id, } if self.headbox_file: diff --git 
a/mmpose/datasets/transforms/__init__.py b/mmpose/datasets/transforms/__init__.py index 7ccbf7dac2..46ca6c749e 100644 --- a/mmpose/datasets/transforms/__init__.py +++ b/mmpose/datasets/transforms/__init__.py @@ -1,13 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. from .bottomup_transforms import (BottomupGetHeatmapMask, BottomupRandomAffine, BottomupResize) -from .common_transforms import (Albumentation, GenerateTarget, - GetBBoxCenterScale, PhotometricDistortion, - RandomBBoxTransform, RandomFlip, - RandomHalfBody) +from .common_transforms import (Albumentation, FilterAnnotations, + GenerateTarget, GetBBoxCenterScale, + PhotometricDistortion, RandomBBoxTransform, + RandomFlip, RandomHalfBody, YOLOXHSVRandomAug) from .converting import KeypointConverter from .formatting import PackPoseInputs from .loading import LoadImage +from .mix_img_transforms import Mosaic, YOLOXMixUp from .pose3d_transforms import RandomFlipAroundRoot from .topdown_transforms import TopdownAffine @@ -16,5 +17,6 @@ 'RandomHalfBody', 'TopdownAffine', 'Albumentation', 'PhotometricDistortion', 'PackPoseInputs', 'LoadImage', 'BottomupGetHeatmapMask', 'BottomupRandomAffine', 'BottomupResize', - 'GenerateTarget', 'KeypointConverter', 'RandomFlipAroundRoot' + 'GenerateTarget', 'KeypointConverter', 'RandomFlipAroundRoot', + 'FilterAnnotations', 'YOLOXHSVRandomAug', 'YOLOXMixUp', 'Mosaic' ] diff --git a/mmpose/datasets/transforms/bottomup_transforms.py b/mmpose/datasets/transforms/bottomup_transforms.py index c31e0ae17d..5ef2fa5838 100644 --- a/mmpose/datasets/transforms/bottomup_transforms.py +++ b/mmpose/datasets/transforms/bottomup_transforms.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional, Tuple +from functools import partial +from typing import Dict, List, Optional, Tuple, Union import cv2 import numpy as np @@ -10,7 +11,10 @@ from scipy.stats import truncnorm from mmpose.registry import TRANSFORMS -from mmpose.structures.bbox import get_udp_warp_matrix, get_warp_matrix +from mmpose.structures.bbox import (bbox_clip_border, bbox_corner2xyxy, + bbox_xyxy2corner, get_pers_warp_matrix, + get_udp_warp_matrix, get_warp_matrix) +from mmpose.structures.keypoint import keypoint_clip_border @TRANSFORMS.register_module() @@ -31,6 +35,10 @@ class BottomupGetHeatmapMask(BaseTransform): - heatmap_mask """ + def __init__(self, get_invalid: bool = False): + super().__init__() + self.get_invalid = get_invalid + def _segs_to_mask(self, segs: list, img_shape: Tuple[int, int]) -> np.ndarray: """Calculate mask from object segmentations. 
@@ -83,10 +91,12 @@ def transform(self, results: Dict) -> Optional[dict]: invalid_segs = results.get('invalid_segs', []) img_shape = results['img_shape'] # (img_h, img_w) input_size = results['input_size'] + mask = self._segs_to_mask(invalid_segs, img_shape) - # Calculate the mask of the valid region by negating the segmentation - # mask of invalid objects - mask = 1 - self._segs_to_mask(invalid_segs, img_shape) + if not self.get_invalid: + # Calculate the mask of the valid region by negating the + # segmentation mask of invalid objects + mask = np.logical_not(mask) # Apply an affine transform to the mask if the image has been # transformed @@ -176,7 +186,7 @@ class BottomupRandomAffine(BaseTransform): """ def __init__(self, - input_size: Tuple[int, int], + input_size: Optional[Tuple[int, int]] = None, shift_factor: float = 0.2, shift_prob: float = 1., scale_factor: Tuple[float, float] = (0.75, 1.5), @@ -184,9 +194,21 @@ def __init__(self, scale_type: str = 'short', rotate_factor: float = 30., rotate_prob: float = 1, - use_udp: bool = False) -> None: + shear_factor: float = 2.0, + shear_prob: float = 1.0, + use_udp: bool = False, + pad_val: Union[float, Tuple[float]] = 0, + border: Tuple[int, int] = (0, 0), + distribution='trunc_norm', + transform_mode='affine', + bbox_keep_corner: bool = True, + clip_border: bool = False) -> None: super().__init__() + assert transform_mode in ('affine', 'affine_udp', 'perspective'), \ f'the argument transform_mode should be either \'affine\', ' \ f'\'affine_udp\' or \'perspective\', but got \'{transform_mode}\'' + self.input_size = input_size self.shift_factor = shift_factor self.shift_prob = shift_prob @@ -195,14 +217,39 @@ def __init__(self, self.scale_type = scale_type self.rotate_factor = rotate_factor self.rotate_prob = rotate_prob + self.shear_factor = shear_factor + self.shear_prob = shear_prob + self.use_udp = use_udp + self.distribution = distribution + self.clip_border = clip_border + self.bbox_keep_corner = bbox_keep_corner - @staticmethod - def _truncnorm(low: float = -1., - high: float = 1., - size: tuple = ()) -> np.ndarray: - """Sample from a truncated normal distribution.""" - return truncnorm.rvs(low, high, size=size).astype(np.float32) + self.transform_mode = transform_mode + + if isinstance(pad_val, (int, float)): + pad_val = (pad_val, pad_val, pad_val) + + if 'affine' in transform_mode: + self._transform = partial( cv2.warpAffine, flags=cv2.INTER_LINEAR, borderValue=pad_val) + else: + self._transform = partial(cv2.warpPerspective, borderValue=pad_val) + + def _random(self, + low: float = -1., + high: float = 1., + size: tuple = ()) -> np.ndarray: + if self.distribution == 'trunc_norm': + # sample from a truncated normal distribution + return truncnorm.rvs(low, high, size=size).astype(np.float32) + elif self.distribution == 'uniform': + x = np.random.rand(*size) + return x * (high - low) + low + else: + raise ValueError(f'the argument `distribution` should be either ' f'\'trunc_norm\' or \'uniform\', but got ' f'{self.distribution}.') def _fix_aspect_ratio(self, scale: np.ndarray, aspect_ratio: float): """Extend the scale to match the given aspect ratio. 
@@ -243,7 +290,7 @@ def _get_transform_params(self) -> Tuple: """ # get offset if np.random.rand() < self.shift_prob: - offset = self._truncnorm(size=(2, )) * self.shift_factor + offset = self._random(size=(2, )) * self.shift_factor else: offset = np.zeros((2, ), dtype=np.float32) @@ -251,17 +298,24 @@ def _get_transform_params(self) -> Tuple: if np.random.rand() < self.scale_prob: scale_min, scale_max = self.scale_factor scale = scale_min + (scale_max - scale_min) * ( - self._truncnorm(size=(1, )) + 1) / 2 + self._random(size=(1, )) + 1) / 2 else: scale = np.ones(1, dtype=np.float32) # get rotation if np.random.rand() < self.rotate_prob: - rotate = self._truncnorm() * self.rotate_factor + rotate = self._random() * self.rotate_factor else: rotate = 0 - return offset, scale, rotate + # get shear + if 'perspective' in self.transform_mode and np.random.rand( + ) < self.shear_prob: + shear = self._random(size=(2, )) * self.shear_factor + else: + shear = np.zeros((2, ), dtype=np.float32) + + return offset, scale, rotate, shear def transform(self, results: Dict) -> Optional[dict]: """The transform function of :class:`BottomupRandomAffine` to perform @@ -277,45 +331,77 @@ def transform(self, results: Dict) -> Optional[dict]: dict: Result dict with images distorted. """ - img_h, img_w = results['img_shape'] + img_h, img_w = results['img_shape'][:2] w, h = self.input_size - offset_rate, scale_rate, rotate = self._get_transform_params() - offset = offset_rate * [img_w, img_h] - scale = scale_rate * [img_w, img_h] - # adjust the scale to match the target aspect ratio - scale = self._fix_aspect_ratio(scale, aspect_ratio=w / h) - - if self.use_udp: - center = np.array([(img_w - 1.0) / 2, (img_h - 1.0) / 2], - dtype=np.float32) - warp_mat = get_udp_warp_matrix( - center=center + offset, - scale=scale, - rot=rotate, - output_size=(w, h)) + offset_rate, scale_rate, rotate, shear = self._get_transform_params() + + if 'affine' in self.transform_mode: + offset = offset_rate * [img_w, img_h] + scale = scale_rate * [img_w, img_h] + # adjust the scale to match the target aspect ratio + scale = self._fix_aspect_ratio(scale, aspect_ratio=w / h) + + if self.transform_mode == 'affine_udp': + center = np.array([(img_w - 1.0) / 2, (img_h - 1.0) / 2], + dtype=np.float32) + warp_mat = get_udp_warp_matrix( + center=center + offset, + scale=scale, + rot=rotate, + output_size=(w, h)) + else: + center = np.array([img_w / 2, img_h / 2], dtype=np.float32) + warp_mat = get_warp_matrix( + center=center + offset, + scale=scale, + rot=rotate, + output_size=(w, h)) + else: - center = np.array([img_w / 2, img_h / 2], dtype=np.float32) - warp_mat = get_warp_matrix( - center=center + offset, - scale=scale, + offset = offset_rate * [w, h] + center = np.array([w / 2, h / 2], dtype=np.float32) + warp_mat = get_pers_warp_matrix( + center=center, + translate=offset, + scale=scale_rate[0], rot=rotate, - output_size=(w, h)) + shear=shear) # warp image and keypoints - results['img'] = cv2.warpAffine( - results['img'], warp_mat, (int(w), int(h)), flags=cv2.INTER_LINEAR) + results['img'] = self._transform(results['img'], warp_mat, + (int(w), int(h))) if 'keypoints' in results: # Only transform (x, y) coordinates - results['keypoints'][..., :2] = cv2.transform( - results['keypoints'][..., :2], warp_mat) + kpts = cv2.transform(results['keypoints'], warp_mat) + if kpts.shape[-1] == 3: + kpts = kpts[..., :2] / kpts[..., 2:3] + results['keypoints'] = kpts + + if self.clip_border: + results['keypoints'], results[ + 'keypoints_visible'] = 
keypoint_clip_border( + results['keypoints'], results['keypoints_visible'], + (w, h)) if 'bbox' in results: - bbox = np.tile(results['bbox'], 2).reshape(-1, 4, 2) - # corner order: left_top, left_bottom, right_top, right_bottom - bbox[:, 1:3, 0] = bbox[:, 0:2, 0] - results['bbox'] = cv2.transform(bbox, warp_mat).reshape(-1, 8) + bbox = bbox_xyxy2corner(results['bbox']) + bbox = cv2.transform(bbox, warp_mat) + if bbox.shape[-1] == 3: + bbox = bbox[..., :2] / bbox[..., 2:3] + if not self.bbox_keep_corner: + bbox = bbox_corner2xyxy(bbox) + if self.clip_border: + bbox = bbox_clip_border(bbox, (w, h)) + results['bbox'] = bbox + + if 'area' in results: + warp_mat_for_area = warp_mat + if warp_mat.shape[0] == 2: + aux_row = np.array([[0.0, 0.0, 1.0]], dtype=warp_mat.dtype) + warp_mat_for_area = np.concatenate((warp_mat, aux_row)) + results['area'] *= np.linalg.det(warp_mat_for_area) results['input_size'] = self.input_size results['warp_mat'] = warp_mat @@ -380,6 +466,7 @@ def __init__(self, aug_scales: Optional[List[float]] = None, size_factor: int = 32, resize_mode: str = 'fit', + pad_val: tuple = (0, 0, 0), use_udp: bool = False): super().__init__() @@ -388,6 +475,7 @@ def __init__(self, self.resize_mode = resize_mode self.size_factor = size_factor self.use_udp = use_udp + self.pad_val = pad_val @staticmethod def _ceil_to_multiple(size: Tuple[int, int], base: int): @@ -496,7 +584,11 @@ def transform(self, results: Dict) -> Optional[dict]: output_size=padded_input_size) _img = cv2.warpAffine( - img, warp_mat, padded_input_size, flags=cv2.INTER_LINEAR) + img, + warp_mat, + padded_input_size, + flags=cv2.INTER_LINEAR, + borderValue=self.pad_val) imgs.append(_img) diff --git a/mmpose/datasets/transforms/common_transforms.py b/mmpose/datasets/transforms/common_transforms.py index d8591ab094..98aed11683 100644 --- a/mmpose/datasets/transforms/common_transforms.py +++ b/mmpose/datasets/transforms/common_transforms.py @@ -3,6 +3,7 @@ from copy import deepcopy from typing import Dict, List, Optional, Sequence, Tuple, Union +import cv2 import mmcv import mmengine import numpy as np @@ -957,6 +958,7 @@ def transform(self, results: Dict) -> Optional[dict]: if keypoints_visible.ndim == 3 and keypoints_visible.shape[2] == 2: keypoints_visible, keypoints_visible_weights = \ keypoints_visible[..., 0], keypoints_visible[..., 1] + results['keypoints_visible'] = keypoints_visible results['keypoints_visible_weights'] = keypoints_visible_weights # Encoded items from the encoder(s) will be updated into the results. @@ -1074,3 +1076,178 @@ def __repr__(self) -> str: repr_str += ('use_dataset_keypoint_weights=' f'{self.use_dataset_keypoint_weights})') return repr_str + + +@TRANSFORMS.register_module() +class YOLOXHSVRandomAug(BaseTransform): + """Apply HSV augmentation to image sequentially. It is referenced from + https://github.com/Megvii- + BaseDetection/YOLOX/blob/main/yolox/data/data_augment.py#L21. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + hue_delta (int): delta of hue. Defaults to 5. + saturation_delta (int): delta of saturation. Defaults to 30. + value_delta (int): delta of value. Defaults to 30. 
+ """ + + def __init__(self, + hue_delta: int = 5, + saturation_delta: int = 30, + value_delta: int = 30) -> None: + self.hue_delta = hue_delta + self.saturation_delta = saturation_delta + self.value_delta = value_delta + + @cache_randomness + def _get_hsv_gains(self): + hsv_gains = np.random.uniform(-1, 1, 3) * [ + self.hue_delta, self.saturation_delta, self.value_delta + ] + # random selection of h, s, v + hsv_gains *= np.random.randint(0, 2, 3) + # prevent overflow + hsv_gains = hsv_gains.astype(np.int16) + return hsv_gains + + def transform(self, results: dict) -> dict: + img = results['img'] + hsv_gains = self._get_hsv_gains() + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16) + + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255) + cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img) + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class FilterAnnotations(BaseTransform): + """Eliminate undesirable annotations based on specific conditions. + + This class is designed to sift through annotations by examining multiple + factors such as the size of the bounding box, the visibility of keypoints, + and the overall area. Users can fine-tune the criteria to filter out + instances that have excessively small bounding boxes, insufficient area, + or an inadequate number of visible keypoints. + + Required Keys: + + - bbox (np.ndarray) (optional) + - area (np.int64) (optional) + - keypoints_visible (np.ndarray) (optional) + + Modified Keys: + + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Args: + min_gt_bbox_wh (tuple[float]): Minimum width and height of ground + truth boxes. Default: (1., 1.) + min_gt_area (int): Minimum foreground area of instances. + Default: 1 + min_kpt_vis (int): Minimum number of visible keypoints. Default: 1 + by_box (bool): Filter instances with bounding boxes not meeting the + min_gt_bbox_wh threshold. Default: False + by_area (bool): Filter instances with area less than min_gt_area + threshold. Default: False + by_kpt (bool): Filter instances with keypoints_visible not meeting the + min_kpt_vis threshold. Default: True + keep_empty (bool): Whether to return None when it + becomes an empty bbox after filtering. Defaults to True. + """ + + def __init__(self, + min_gt_bbox_wh: Tuple[int, int] = (1, 1), + min_gt_area: int = 1, + min_kpt_vis: int = 1, + by_box: bool = False, + by_area: bool = False, + by_kpt: bool = True, + keep_empty: bool = True) -> None: + + assert by_box or by_kpt or by_area + self.min_gt_bbox_wh = min_gt_bbox_wh + self.min_gt_area = min_gt_area + self.min_kpt_vis = min_kpt_vis + self.by_box = by_box + self.by_area = by_area + self.by_kpt = by_kpt + self.keep_empty = keep_empty + + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to filter annotations. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. 
+ """ + assert 'keypoints' in results + kpts = results['keypoints'] + if kpts.shape[0] == 0: + return results + + tests = [] + if self.by_box and 'bbox' in results: + bbox = results['bbox'] + tests.append( + ((bbox[..., 2] - bbox[..., 0] > self.min_gt_bbox_wh[0]) & + (bbox[..., 3] - bbox[..., 1] > self.min_gt_bbox_wh[1]))) + if self.by_area and 'area' in results: + area = results['area'] + tests.append(area >= self.min_gt_area) + if self.by_kpt: + kpts_vis = results['keypoints_visible'] + if kpts_vis.ndim == 3: + kpts_vis = kpts_vis[..., 0] + tests.append(kpts_vis.sum(axis=1) >= self.min_kpt_vis) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + if not keep.any(): + if self.keep_empty: + return None + + keys = ('bbox', 'bbox_score', 'category_id', 'keypoints', + 'keypoints_visible', 'area') + for key in keys: + if key in results: + results[key] = results[key][keep] + + return results + + def __repr__(self): + return (f'{self.__class__.__name__}(' + f'min_gt_bbox_wh={self.min_gt_bbox_wh}, ' + f'min_gt_area={self.min_gt_area}, ' + f'min_kpt_vis={self.min_kpt_vis}, ' + f'by_box={self.by_box}, ' + f'by_area={self.by_area}, ' + f'by_kpt={self.by_kpt}, ' + f'keep_empty={self.keep_empty})') diff --git a/mmpose/datasets/transforms/mix_img_transforms.py b/mmpose/datasets/transforms/mix_img_transforms.py new file mode 100644 index 0000000000..84d03ea5a2 --- /dev/null +++ b/mmpose/datasets/transforms/mix_img_transforms.py @@ -0,0 +1,501 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta +from collections import defaultdict +from typing import Optional, Sequence, Tuple + +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmengine.dataset.base_dataset import Compose +from numpy import random + +from mmpose.registry import TRANSFORMS +from mmpose.structures import (bbox_clip_border, flip_bbox, flip_keypoints, + keypoint_clip_border) + + +class MixImageTransform(BaseTransform, metaclass=ABCMeta): + """Abstract base class for mixup-style image data augmentation. + + Args: + pre_transform (Optional[Sequence[str]]): A sequence of transform + to be applied before mixup. Defaults to None. + prob (float): Probability of applying the mixup transformation. + Defaults to 1.0. + """ + + def __init__(self, + pre_transform: Optional[Sequence[str]] = None, + prob: float = 1.0): + + self.prob = prob + + if pre_transform is None: + self.pre_transform = None + else: + self.pre_transform = Compose(pre_transform) + + def transform(self, results: dict) -> dict: + """Transform the input data dictionary using mixup-style augmentation. + + Args: + results (dict): A dictionary containing input data. + """ + + if random.uniform(0, 1) < self.prob: + + dataset = results.pop('dataset', None) + + results['mixed_data_list'] = self._get_mixed_data_list(dataset) + results = self.apply_mix(results) + + if 'mixed_data_list' in results: + results.pop('mixed_data_list') + + results['dataset'] = dataset + + return results + + def _get_mixed_data_list(self, dataset): + """Get a list of mixed data samples from the dataset. + + Args: + dataset: The dataset from which to sample the mixed data. + + Returns: + List[dict]: A list of dictionaries containing mixed data samples. 
+ """ + indexes = [ + random.randint(0, len(dataset)) for _ in range(self.num_aux_image) + ] + + mixed_data_list = [ + copy.deepcopy(dataset.get_data_info(index)) for index in indexes + ] + + if self.pre_transform is not None: + for i, data in enumerate(mixed_data_list): + data.update({'dataset': dataset}) + _results = self.pre_transform(data) + _results.pop('dataset') + mixed_data_list[i] = _results + + return mixed_data_list + + +@TRANSFORMS.register_module() +class Mosaic(MixImageTransform): + """Mosaic augmentation. This transformation takes four input images and + combines them into a single output image using the mosaic technique. The + resulting image is composed of parts from each of the four sub-images. The + mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersection of the four images. + 2. Select the top-left image according to the index and randomly sample + three more images from the custom dataset. + 3. If an image is larger than the mosaic patch, it will be cropped. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | | + | +-----------+ pad | + | | | | + | | image1 +-----------+ + | | | | + | | | image2 | + center_y |----+-+-----------+-----------+ + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + Required Keys: + + - img + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Modified Keys: + + - img + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + pad_val (int): Pad value. Defaults to 114. + pre_transform (Optional[Sequence[str]]): A sequence of transform + to be applied before mixup. Defaults to None. + prob (float): Probability of applying the mixup transformation. + Defaults to 1.0. 
+ """ + + num_aux_image = 3 + + def __init__( + self, + img_scale: Tuple[int, int] = (640, 640), + center_range: Tuple[float, float] = (0.5, 1.5), + pad_val: float = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + ): + + super().__init__(pre_transform=pre_transform, prob=prob) + + self.img_scale = img_scale + self.center_range = center_range + self.pad_val = pad_val + + def apply_mix(self, results: dict) -> dict: + """Apply mosaic augmentation to the input data.""" + + assert 'mixed_data_list' in results + mixed_data_list = results.pop('mixed_data_list') + assert len(mixed_data_list) == self.num_aux_image + + img, annos = self._create_mosaic_image(results, mixed_data_list) + bboxes = annos['bboxes'] + kpts = annos['keypoints'] + kpts_vis = annos['keypoints_visible'] + + bboxes = bbox_clip_border(bboxes, (2 * self.img_scale[0], + 2 * self.img_scale[1])) + kpts, kpts_vis = keypoint_clip_border(kpts, kpts_vis, + (2 * self.img_scale[0], + 2 * self.img_scale[1])) + + results['img'] = img + results['img_shape'] = img.shape + results['bbox'] = bboxes + results['category_id'] = annos['category_id'] + results['bbox_score'] = annos['bbox_scores'] + results['keypoints'] = kpts + results['keypoints_visible'] = kpts_vis + results['area'] = annos['area'] + + return results + + def _create_mosaic_image(self, results, mixed_data_list): + """Create the mosaic image and corresponding annotations by combining + four input images.""" + + # init mosaic image + img_scale_w, img_scale_h = self.img_scale + mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + + # calculate mosaic center + center = (int(random.uniform(*self.center_range) * img_scale_w), + int(random.uniform(*self.center_range) * img_scale_h)) + + annos = defaultdict(list) + locs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for loc, data in zip(locs, (results, *mixed_data_list)): + + # process image + img = data['img'] + h, w = img.shape[:2] + scale_ratio = min(img_scale_h / h, img_scale_w / w) + img = mmcv.imresize(img, + (int(w * scale_ratio), int(h * scale_ratio))) + + # paste + paste_coord, crop_coord = self._mosaic_combine( + loc, center, img.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img[y1_c:y2_c, x1_c:x2_c] + padw = x1_p - x1_c + padh = y1_p - y1_c + + # merge annotations + if 'bbox' in data: + bboxes = data['bbox'] + + # rescale & translate + bboxes *= scale_ratio + bboxes[..., ::2] += padw + bboxes[..., 1::2] += padh + + annos['bboxes'].append(bboxes) + annos['bbox_scores'].append(data['bbox_score']) + annos['category_id'].append(data['category_id']) + + if 'keypoints' in data: + kpts = data['keypoints'] + + # rescale & translate + kpts *= scale_ratio + kpts[..., 0] += padw + kpts[..., 1] += padh + + annos['keypoints'].append(kpts) + annos['keypoints_visible'].append(data['keypoints_visible']) + + if 'area' in data: + annos['area'].append(data['area'] * scale_ratio**2) + + for key in annos: + annos[key] = np.concatenate(annos[key]) + return mosaic_img, annos + + def _mosaic_combine( + self, loc: str, center: Tuple[float, float], img_shape: Tuple[int, int] + ) -> Tuple[Tuple[int, int, int, int], Tuple[int, int, int, int]]: + """Determine the overall coordinates of the mosaic image and the + specific coordinates of the cropped sub-image.""" + + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + + x1, y1, 
x2, y2 = 0, 0, 0, 0 + cx, cy = center + w, h = img_shape + + if loc == 'top_left': + x1, y1, x2, y2 = max(cx - w, 0), max(cy - h, 0), cx, cy + crop_coord = w - (x2 - x1), h - (y2 - y1), w, h + elif loc == 'top_right': + x1, y1, x2, y2 = cx, max(cy - h, 0), min(cx + w, + self.img_scale[0] * 2), cy + crop_coord = 0, h - (y2 - y1), min(w, x2 - x1), h + elif loc == 'bottom_left': + x1, y1, x2, y2 = max(cx - w, + 0), cy, cx, min(self.img_scale[1] * 2, cy + h) + crop_coord = w - (x2 - x1), 0, w, min(y2 - y1, h) + else: + x1, y1, x2, y2 = cx, cy, min(cx + w, self.img_scale[0] * + 2), min(self.img_scale[1] * 2, cy + h) + crop_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h) + + return (x1, y1, x2, y2), crop_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_range={self.center_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOXMixUp(MixImageTransform): + """MixUp data augmentation for YOLOX. This transform combines two images + through mixup to enhance the dataset's diversity. + + Mixup Transform Steps: + + 1. A random image is chosen from the dataset and placed in the + top-left corner of the target image (after padding and resizing). + 2. The target of the mixup transform is obtained by taking the + weighted average of the mixup image and the original image. + + .. code:: text + + mixup transform + +---------------+--------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + +---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | +-----------------+ | + | pad | + +------------------------------+ + + Required Keys: + + - img + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Modified Keys: + + - img + - bbox (optional) + - bbox_score (optional) + - category_id (optional) + - keypoints (optional) + - keypoints_visible (optional) + - area (optional) + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + pre_transform (Optional[Sequence[str]]): A sequence of transform + to be applied before mixup. Defaults to None. + prob (float): Probability of applying the mixup transformation. + Defaults to 1.0. 
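+
+    Example:
+        A pipeline-config sketch; the values are illustrative and
+        ``LoadImage`` is assumed to be available as a pre-transform:
+
+        >>> train_pipeline_stage1 = [
+        ...     dict(
+        ...         type='YOLOXMixUp',
+        ...         img_scale=(640, 640),
+        ...         ratio_range=(0.8, 1.6),
+        ...         pad_val=114.0,
+        ...         pre_transform=[dict(type='LoadImage')]),
+        ... ]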
+ """ + num_aux_image = 1 + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + bbox_clip_border: bool = True, + pre_transform: Sequence[dict] = None, + prob: float = 1.0): + assert isinstance(img_scale, tuple) + super().__init__(pre_transform=pre_transform, prob=prob) + self.img_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.bbox_clip_border = bbox_clip_border + + def apply_mix(self, results: dict) -> dict: + """YOLOX MixUp transform function.""" + + assert 'mixed_data_list' in results + mixed_data_list = results.pop('mixed_data_list') + assert len(mixed_data_list) == self.num_aux_image + + if mixed_data_list[0]['keypoints'].shape[0] == 0: + return results + + img, annos = self._create_mixup_image(results, mixed_data_list) + bboxes = annos['bboxes'] + kpts = annos['keypoints'] + kpts_vis = annos['keypoints_visible'] + + h, w = img.shape[:2] + bboxes = bbox_clip_border(bboxes, (w, h)) + kpts, kpts_vis = keypoint_clip_border(kpts, kpts_vis, (w, h)) + + results['img'] = img.astype(np.uint8) + results['img_shape'] = img.shape + results['bbox'] = bboxes + results['category_id'] = annos['category_id'] + results['bbox_score'] = annos['bbox_scores'] + results['keypoints'] = kpts + results['keypoints_visible'] = kpts_vis + results['area'] = annos['area'] + + return results + + def _create_mixup_image(self, results, mixed_data_list): + """Create the mixup image and corresponding annotations by combining + two input images.""" + + aux_results = mixed_data_list[0] + aux_img = aux_results['img'] + + # init mixup image + out_img = np.ones((self.img_scale[1], self.img_scale[0], 3), + dtype=aux_img.dtype) * self.pad_val + annos = defaultdict(list) + + # Calculate scale ratio and resize aux_img + scale_ratio = min(self.img_scale[1] / aux_img.shape[0], + self.img_scale[0] / aux_img.shape[1]) + aux_img = mmcv.imresize(aux_img, (int(aux_img.shape[1] * scale_ratio), + int(aux_img.shape[0] * scale_ratio))) + + # Set the resized aux_img in the top-left of out_img + out_img[:aux_img.shape[0], :aux_img.shape[1]] = aux_img + + # random rescale + jit_factor = random.uniform(*self.ratio_range) + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # random flip + is_filp = random.uniform(0, 1) > self.flip_ratio + if is_filp: + out_img = out_img[:, ::-1, :] + + # random crop + ori_img = results['img'] + aux_h, aux_w = out_img.shape[:2] + h, w = ori_img.shape[:2] + padded_img = np.ones((max(aux_h, h), max(aux_w, w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:aux_h, :aux_w] = out_img + + dy = random.randint(0, max(0, padded_img.shape[0] - h) + 1) + dx = random.randint(0, max(0, padded_img.shape[1] - w) + 1) + padded_cropped_img = padded_img[dy:dy + h, dx:dx + w] + + # mix up + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img + + # merge annotations + # bboxes + bboxes = aux_results['bbox'].copy() + bboxes *= scale_ratio + bboxes = bbox_clip_border(bboxes, (aux_w, aux_h)) + if is_filp: + bboxes = flip_bbox(bboxes, [aux_w, aux_h], 'xyxy') + bboxes[..., ::2] -= dx + bboxes[..., 1::2] -= dy + annos['bboxes'] = [results['bbox'], bboxes] + annos['bbox_scores'] = [ + results['bbox_score'], aux_results['bbox_score'] + ] + annos['category_id'] = [ + results['category_id'], aux_results['category_id'] + ] + + # keypoints + kpts 
= aux_results['keypoints'] * scale_ratio + kpts, kpts_vis = keypoint_clip_border(kpts, + aux_results['keypoints_visible'], + (aux_w, aux_h)) + if is_filp: + kpts, kpts_vis = flip_keypoints(kpts, kpts_vis, (aux_w, aux_h), + aux_results['flip_indices']) + kpts[..., 0] -= dx + kpts[..., 1] -= dy + annos['keypoints'] = [results['keypoints'], kpts] + annos['keypoints_visible'] = [results['keypoints_visible'], kpts_vis] + annos['area'] = [results['area'], aux_results['area'] * scale_ratio**2] + + for key in annos: + annos[key] = np.concatenate(annos[key]) + + return mixup_img, annos + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str diff --git a/mmpose/engine/__init__.py b/mmpose/engine/__init__.py index ac85928986..44f7fa17bc 100644 --- a/mmpose/engine/__init__.py +++ b/mmpose/engine/__init__.py @@ -1,3 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. from .hooks import * # noqa: F401, F403 from .optim_wrappers import * # noqa: F401, F403 +from .schedulers import * # noqa: F401, F403 diff --git a/mmpose/engine/hooks/__init__.py b/mmpose/engine/hooks/__init__.py index abfe762881..2c31ca081c 100644 --- a/mmpose/engine/hooks/__init__.py +++ b/mmpose/engine/hooks/__init__.py @@ -1,6 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. from .badcase_hook import BadCaseAnalysisHook from .ema_hook import ExpMomentumEMA +from .mode_switch_hooks import YOLOXPoseModeSwitchHook +from .sync_norm_hook import SyncNormHook from .visualization_hook import PoseVisualizationHook -__all__ = ['PoseVisualizationHook', 'ExpMomentumEMA', 'BadCaseAnalysisHook'] +__all__ = [ + 'PoseVisualizationHook', 'ExpMomentumEMA', 'BadCaseAnalysisHook', + 'YOLOXPoseModeSwitchHook', 'SyncNormHook' +] diff --git a/mmpose/engine/hooks/mode_switch_hooks.py b/mmpose/engine/hooks/mode_switch_hooks.py new file mode 100644 index 0000000000..862e36dc0b --- /dev/null +++ b/mmpose/engine/hooks/mode_switch_hooks.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper +from mmengine.runner import Runner + +from mmpose.registry import HOOKS + + +@HOOKS.register_module() +class YOLOXPoseModeSwitchHook(Hook): + """Switch the mode of YOLOX-Pose during training. + + This hook: + 1) Turns off mosaic and mixup data augmentation. + 2) Uses instance mask to assist positive anchor selection. + 3) Uses auxiliary L1 loss in the head. + + Args: + num_last_epochs (int): The number of last epochs at the end of + training to close the data augmentation and switch to L1 loss. + Defaults to 20. + new_train_dataset (dict): New training dataset configuration that + will be used in place of the original training dataset. Defaults + to None. + new_train_pipeline (Sequence[dict]): New data augmentation pipeline + configuration that will be used in place of the original pipeline + during training. Defaults to None. 
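+
+    Example:
+        A config sketch registering the hook; ``train_pipeline_stage2``
+        is assumed to be a pipeline without Mosaic/MixUp defined
+        elsewhere in the config:
+
+        >>> custom_hooks = [
+        ...     dict(
+        ...         type='YOLOXPoseModeSwitchHook',
+        ...         num_last_epochs=20,
+        ...         new_train_pipeline=train_pipeline_stage2,
+        ...         priority=48),
+        ... ]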
+ """ + + def __init__(self, + num_last_epochs: int = 20, + new_train_dataset: dict = None, + new_train_pipeline: Sequence[dict] = None): + self.num_last_epochs = num_last_epochs + self.new_train_dataset = new_train_dataset + self.new_train_pipeline = new_train_pipeline + + def _modify_dataloader(self, runner: Runner): + """Modify dataloader with new dataset and pipeline configurations.""" + runner.logger.info(f'New Pipeline: {self.new_train_pipeline}') + + train_dataloader_cfg = copy.deepcopy(runner.cfg.train_dataloader) + if self.new_train_dataset: + train_dataloader_cfg.dataset = self.new_train_dataset + if self.new_train_pipeline: + train_dataloader_cfg.dataset.pipeline = self.new_train_pipeline + + new_train_dataloader = Runner.build_dataloader(train_dataloader_cfg) + runner.train_loop.dataloader = new_train_dataloader + runner.logger.info('Recreated the dataloader!') + + def before_train_epoch(self, runner: Runner): + """Close mosaic and mixup augmentation, switch to use L1 loss.""" + epoch = runner.epoch + model = runner.model + if is_model_wrapper(model): + model = model.module + + if epoch + 1 == runner.max_epochs - self.num_last_epochs: + self._modify_dataloader(runner) + runner.logger.info('Added additional reg loss now!') + model.head.use_aux_loss = True diff --git a/mmpose/engine/hooks/sync_norm_hook.py b/mmpose/engine/hooks/sync_norm_hook.py new file mode 100644 index 0000000000..053e4f92af --- /dev/null +++ b/mmpose/engine/hooks/sync_norm_hook.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict + +from mmengine.dist import all_reduce_dict, get_dist_info +from mmengine.hooks import Hook +from torch import nn + +from mmpose.registry import HOOKS + + +def get_norm_states(module: nn.Module) -> OrderedDict: + """Get the state_dict of batch norms in the module.""" + async_norm_states = OrderedDict() + for name, child in module.named_modules(): + if isinstance(child, nn.modules.batchnorm._NormBase): + for k, v in child.state_dict().items(): + async_norm_states['.'.join([name, k])] = v + return async_norm_states + + +@HOOKS.register_module() +class SyncNormHook(Hook): + """Synchronize Norm states before validation.""" + + def before_val_epoch(self, runner): + """Synchronize normalization statistics.""" + module = runner.model + rank, world_size = get_dist_info() + + if world_size == 1: + return + + norm_states = get_norm_states(module) + if len(norm_states) == 0: + return + + try: + norm_states = all_reduce_dict(norm_states, op='mean') + module.load_state_dict(norm_states, strict=True) + except Exception as e: + runner.logger.warn(f'SyncNormHook failed: {str(e)}') diff --git a/mmpose/engine/schedulers/__init__.py b/mmpose/engine/schedulers/__init__.py new file mode 100644 index 0000000000..01261646fa --- /dev/null +++ b/mmpose/engine/schedulers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .quadratic_warmup import (QuadraticWarmupLR, QuadraticWarmupMomentum, + QuadraticWarmupParamScheduler) + +__all__ = [ + 'QuadraticWarmupParamScheduler', 'QuadraticWarmupMomentum', + 'QuadraticWarmupLR' +] diff --git a/mmpose/engine/schedulers/quadratic_warmup.py b/mmpose/engine/schedulers/quadratic_warmup.py new file mode 100644 index 0000000000..1021797217 --- /dev/null +++ b/mmpose/engine/schedulers/quadratic_warmup.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.optim.scheduler.lr_scheduler import LRSchedulerMixin +from mmengine.optim.scheduler.momentum_scheduler import MomentumSchedulerMixin +from mmengine.optim.scheduler.param_scheduler import INF, _ParamScheduler +from torch.optim import Optimizer + +from mmpose.registry import PARAM_SCHEDULERS + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupParamScheduler(_ParamScheduler): + r"""Warm up the parameter value of each parameter group by quadratic + formula: + + .. math:: + + X_{t} = X_{t-1} + \frac{2t+1}{{(end-begin)}^{2}} \times X_{base} + + Args: + optimizer (Optimizer): Wrapped optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + def __init__(self, + optimizer: Optimizer, + param_name: str, + begin: int = 0, + end: int = INF, + last_step: int = -1, + by_epoch: bool = True, + verbose: bool = False): + if end >= INF: + raise ValueError('``end`` must be less than infinity,' + 'Please set ``end`` parameter of ' + '``QuadraticWarmupScheduler`` as the ' + 'number of warmup end.') + self.total_iters = end - begin + super().__init__( + optimizer=optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + @classmethod + def build_iter_from_epoch(cls, + *args, + begin=0, + end=INF, + by_epoch=True, + epoch_length=None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config.""" + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ + 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ + f'`epoch_length` must be a positive integer, ' \ + f'but got {epoch_length}.' + by_epoch = False + begin = begin * epoch_length + if end != INF: + end = end * epoch_length + return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + if self.last_step == 0: + return [ + base_value * (2 * self.last_step + 1) / self.total_iters**2 + for base_value in self.base_values + ] + + return [ + group[self.param_name] + base_value * + (2 * self.last_step + 1) / self.total_iters**2 + for base_value, group in zip(self.base_values, + self.optimizer.param_groups) + ] + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupLR(LRSchedulerMixin, QuadraticWarmupParamScheduler): + """Warm up the learning rate of each parameter group by quadratic formula. + + Args: + optimizer (Optimizer): Wrapped optimizer. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. 
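+
+    Example:
+        A config sketch warming up the learning rate over the first
+        epoch; ``convert_to_iter_based`` lets the epoch-based range be
+        converted via ``build_iter_from_epoch``:
+
+        >>> param_scheduler = [
+        ...     dict(
+        ...         type='QuadraticWarmupLR',
+        ...         by_epoch=True,
+        ...         begin=0,
+        ...         end=1,
+        ...         convert_to_iter_based=True),
+        ... ]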
+ """ + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupMomentum(MomentumSchedulerMixin, + QuadraticWarmupParamScheduler): + """Warm up the momentum value of each parameter group by quadratic formula. + + Args: + optimizer (Optimizer): Wrapped optimizer. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ diff --git a/mmpose/evaluation/functional/__init__.py b/mmpose/evaluation/functional/__init__.py index 49f243163c..f59f6f5a40 100644 --- a/mmpose/evaluation/functional/__init__.py +++ b/mmpose/evaluation/functional/__init__.py @@ -3,10 +3,11 @@ keypoint_nme, keypoint_pck_accuracy, multilabel_classification_accuracy, pose_pck_accuracy, simcc_pck_accuracy) -from .nms import nms, oks_nms, soft_oks_nms +from .nms import nms, nms_torch, oks_nms, soft_oks_nms __all__ = [ 'keypoint_pck_accuracy', 'keypoint_auc', 'keypoint_nme', 'keypoint_epe', 'pose_pck_accuracy', 'multilabel_classification_accuracy', - 'simcc_pck_accuracy', 'nms', 'oks_nms', 'soft_oks_nms', 'keypoint_mpjpe' + 'simcc_pck_accuracy', 'nms', 'oks_nms', 'soft_oks_nms', 'keypoint_mpjpe', + 'nms_torch' ] diff --git a/mmpose/evaluation/functional/nms.py b/mmpose/evaluation/functional/nms.py index eed4e5cf73..801fee7764 100644 --- a/mmpose/evaluation/functional/nms.py +++ b/mmpose/evaluation/functional/nms.py @@ -7,6 +7,10 @@ from typing import List, Optional import numpy as np +import torch +from torch import Tensor + +from mmpose.structures.bbox import bbox_overlaps def nms(dets: np.ndarray, thr: float) -> List[int]: @@ -325,3 +329,40 @@ def nearby_joints_nms( keep_pose_inds = [keep_pose_inds[i] for i in sub_inds] return keep_pose_inds + + +def nms_torch(bboxes: Tensor, + scores: Tensor, + threshold: float = 0.65, + iou_calculator=bbox_overlaps, + return_group: bool = False): + """Perform Non-Maximum Suppression (NMS) on a set of bounding boxes using + their corresponding scores. + + Args: + + bboxes (Tensor): list of bounding boxes (each containing 4 elements + for x1, y1, x2, y2). + scores (Tensor): scores associated with each bounding box. + threshold (float): IoU threshold to determine overlap. + iou_calculator (function): method to calculate IoU. + return_group (bool): if True, returns groups of overlapping bounding + boxes, otherwise returns the main bounding boxes. 
+ """ + + _, indices = scores.sort(descending=True) + groups = [] + while len(indices): + idx, indices = indices[0], indices[1:] + bbox = bboxes[idx] + ious = iou_calculator(bbox, bboxes[indices]) + close_indices = torch.where(ious > threshold)[0] + keep_indices = torch.ones_like(indices, dtype=torch.bool) + keep_indices[close_indices] = 0 + groups.append(torch.cat((idx[None], indices[close_indices]))) + indices = indices[keep_indices] + + if return_group: + return groups + else: + return torch.cat([g[:1] for g in groups]) diff --git a/mmpose/evaluation/metrics/coco_metric.py b/mmpose/evaluation/metrics/coco_metric.py index 8b5e80d954..d1c7191338 100644 --- a/mmpose/evaluation/metrics/coco_metric.py +++ b/mmpose/evaluation/metrics/coco_metric.py @@ -13,6 +13,7 @@ from xtcocotools.cocoeval import COCOeval from mmpose.registry import METRICS, TRANSFORMS +from mmpose.structures.bbox import bbox_xyxy2xywh from ..functional import oks_nms, soft_oks_nms @@ -213,9 +214,13 @@ def process(self, data_batch: Sequence[dict], pred = dict() pred['id'] = data_sample['id'] pred['img_id'] = data_sample['img_id'] + pred['keypoints'] = keypoints pred['keypoint_scores'] = keypoint_scores pred['category_id'] = data_sample.get('category_id', 1) + if 'bboxes' in data_sample['pred_instances']: + pred['bbox'] = bbox_xyxy2xywh( + data_sample['pred_instances']['bboxes']) if 'bbox_scores' in data_sample['pred_instances']: # some one-stage models will predict bboxes and scores @@ -405,6 +410,8 @@ def compute_metrics(self, results: list) -> Dict[str, float]: 'keypoint_scores': pred['keypoint_scores'][idx], 'bbox_score': pred['bbox_scores'][idx], } + if 'bbox' in pred: + instance['bbox'] = pred['bbox'][idx] if 'areas' in pred: instance['area'] = pred['areas'][idx] @@ -510,12 +517,17 @@ def results2json(self, keypoints: Dict[int, list], # collect all the person keypoints in current image _keypoints = _keypoints.reshape(-1, num_keypoints * 3) - result = [{ - 'image_id': img_kpt['img_id'], - 'category_id': img_kpt['category_id'], - 'keypoints': keypoint.tolist(), - 'score': float(img_kpt['score']), - } for img_kpt, keypoint in zip(img_kpts, _keypoints)] + result = [] + for img_kpt, keypoint in zip(img_kpts, _keypoints): + res = { + 'image_id': img_kpt['img_id'], + 'category_id': img_kpt['category_id'], + 'keypoints': keypoint.tolist(), + 'score': float(img_kpt['score']), + } + if 'bbox' in img_kpt: + res['bbox'] = img_kpt['bbox'].tolist(), + result.append(res) cat_results.extend(result) diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index 563264eecf..1559b6288b 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .alexnet import AlexNet from .cpm import CPM +from .csp_darknet import CSPDarknet +from .cspnext import CSPNeXt from .dstformer import DSTFormer from .hourglass import HourglassNet from .hourglass_ae import HourglassAENet @@ -34,5 +36,6 @@ 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', - 'PyramidVisionTransformerV2', 'SwinTransformer', 'DSTFormer' + 'PyramidVisionTransformerV2', 'SwinTransformer', 'DSTFormer', 'CSPDarknet', + 'CSPNeXt' ] diff --git a/mmpose/models/backbones/csp_darknet.py b/mmpose/models/backbones/csp_darknet.py new file mode 100644 index 0000000000..dbaba0cfd9 --- /dev/null +++ b/mmpose/models/backbones/csp_darknet.py @@ -0,0 +1,286 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.registry import MODELS +from ..utils import CSPLayer + + +class Focus(nn.Module): + """Focus width and height information into channel space. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_size (int): The kernel size of the convolution. Default: 1 + stride (int): The stride of the convolution. Default: 1 + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish'). + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')): + super().__init__() + self.conv = ConvModule( + in_channels * 4, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + +class SPPBottleneck(BaseModule): + """Spatial pyramid pooling layer used in YOLOv3-SPP. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling + layers. Default: (5, 9, 13). + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
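+
+    Example:
+        A shape-check sketch with randomly initialized weights:
+
+        >>> import torch
+        >>> self = SPPBottleneck(64, 128)
+        >>> x = torch.rand(1, 64, 32, 32)
+        >>> self(x).shape
+        torch.Size([1, 128, 32, 32])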
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=None): + super().__init__(init_cfg) + mid_channels = in_channels // 2 + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = mid_channels * (len(kernel_sizes) + 1) + self.conv2 = ConvModule( + conv2_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + x = self.conv1(x) + with torch.cuda.amp.autocast(enabled=False): + x = torch.cat( + [x] + [pooling(x) for pooling in self.poolings], dim=1) + x = self.conv2(x) + return x + + +@MODELS.register_module() +class CSPDarknet(BaseModule): + """CSP-Darknet backbone used in YOLOv5 and YOLOX. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Default: P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Default: 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Default: -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Default: False. + arch_ovewrite(list): Overwrite default arch settings. Default: None. + spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Default: (5, 9, 13). + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + Example: + >>> from mmpose.models import CSPDarknet + >>> import torch + >>> self = CSPDarknet(depth=53) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__(self, + arch='P5', + deepen_factor=1.0, + widen_factor=1.0, + out_indices=(2, 3, 4), + frozen_stages=-1, + use_depthwise=False, + arch_ovewrite=None, + spp_kernal_sizes=(5, 9, 13), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + norm_eval=False, + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu')): + super().__init__(init_cfg) + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('frozen_stages must be in range(-1, ' + 'len(arch_setting) + 1). But received ' + f'{frozen_stages}') + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.use_depthwise = use_depthwise + self.norm_eval = norm_eval + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + self.stem = Focus( + 3, + int(arch_setting[0][0] * widen_factor), + kernel_size=3, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.layers = ['stem'] + + for i, (in_channels, out_channels, num_blocks, add_identity, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * widen_factor) + out_channels = int(out_channels * widen_factor) + num_blocks = max(round(num_blocks * deepen_factor), 1) + stage = [] + conv_layer = conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPBottleneck( + out_channels, + out_channels, + kernel_sizes=spp_kernal_sizes, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(csp_layer) + self.add_module(f'stage{i + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{i + 1}') + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(CSPDarknet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmpose/models/backbones/cspnext.py b/mmpose/models/backbones/cspnext.py new file mode 100644 index 0000000000..5275bb255a --- /dev/null +++ b/mmpose/models/backbones/cspnext.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import math +from typing import Optional, Sequence, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch import Tensor +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.registry import MODELS +from mmpose.utils.typing import ConfigType +from ..utils import CSPLayer +from .csp_darknet import SPPBottleneck + + +@MODELS.register_module() +class CSPNeXt(BaseModule): + """CSPNeXt backbone used in RTMDet. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + spp_kernel_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Defaults to (5, 9, 13). + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
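+
+    Example:
+        A shape-check sketch mirroring the CSPDarknet example above;
+        weights are randomly initialized:
+
+        >>> from mmpose.models import CSPNeXt
+        >>> import torch
+        >>> self = CSPNeXt()
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)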
+ """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__( + self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + arch_ovewrite: dict = None, + spp_kernel_sizes: Sequence[int] = (5, 9, 13), + channel_attention: bool = True, + conv_cfg: Optional[ConfigType] = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + norm_eval: bool = False, + init_cfg: Optional[ConfigType] = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + super().__init__(init_cfg=init_cfg) + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('frozen_stages must be in range(-1, ' + 'len(arch_setting) + 1). But received ' + f'{frozen_stages}') + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.use_depthwise = use_depthwise + self.norm_eval = norm_eval + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.stem = nn.Sequential( + ConvModule( + 3, + int(arch_setting[0][0] * widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + int(arch_setting[0][0] * widen_factor // 2), + int(arch_setting[0][0] * widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + int(arch_setting[0][0] * widen_factor // 2), + int(arch_setting[0][0] * widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.layers = ['stem'] + + for i, (in_channels, out_channels, num_blocks, add_identity, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * widen_factor) + out_channels = int(out_channels * widen_factor) + num_blocks = max(round(num_blocks * deepen_factor), 1) + stage = [] + conv_layer = conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPBottleneck( + out_channels, + out_channels, + kernel_sizes=spp_kernel_sizes, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=use_depthwise, + use_cspnext_block=True, + expand_ratio=expand_ratio, + channel_attention=channel_attention, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(csp_layer) + self.add_module(f'stage{i + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{i + 1}') + + def _freeze_stages(self) -> None: + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True) -> None: + 
super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]: + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmpose/models/data_preprocessors/__init__.py b/mmpose/models/data_preprocessors/__init__.py index 7c9bd22e2b..7abf9a6af0 100644 --- a/mmpose/models/data_preprocessors/__init__.py +++ b/mmpose/models/data_preprocessors/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .batch_augmentation import BatchSyncRandomResize from .data_preprocessor import PoseDataPreprocessor -__all__ = ['PoseDataPreprocessor'] +__all__ = ['PoseDataPreprocessor', 'BatchSyncRandomResize'] diff --git a/mmpose/models/data_preprocessors/batch_augmentation.py b/mmpose/models/data_preprocessors/batch_augmentation.py new file mode 100644 index 0000000000..e4dcd568e5 --- /dev/null +++ b/mmpose/models/data_preprocessors/batch_augmentation.py @@ -0,0 +1,115 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine import MessageHub +from mmengine.dist import barrier, broadcast, get_dist_info +from mmengine.structures import PixelData +from torch import Tensor + +from mmpose.registry import MODELS +from mmpose.structures import PoseDataSample + + +@MODELS.register_module() +class BatchSyncRandomResize(nn.Module): + """Batch random resize which synchronizes the random size across ranks. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. 
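+
+    Example:
+        A config sketch wiring the batch augmentation into the data
+        preprocessor; the size range and interval are illustrative:
+
+        >>> data_preprocessor = dict(
+        ...     type='PoseDataPreprocessor',
+        ...     pad_size_divisor=32,
+        ...     batch_augments=[
+        ...         dict(
+        ...             type='BatchSyncRandomResize',
+        ...             random_size_range=(480, 800),
+        ...             size_divisor=32,
+        ...             interval=1)
+        ...     ])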
+ """ + + def __init__(self, + random_size_range: Tuple[int, int], + interval: int = 10, + size_divisor: int = 32) -> None: + super().__init__() + self.rank, self.world_size = get_dist_info() + self._input_size = None + self._random_size_range = (round(random_size_range[0] / size_divisor), + round(random_size_range[1] / size_divisor)) + self._interval = interval + self._size_divisor = size_divisor + + def forward(self, inputs: Tensor, data_samples: List[PoseDataSample] + ) -> Tuple[Tensor, List[PoseDataSample]]: + """resize a batch of images and bboxes to shape ``self._input_size``""" + h, w = inputs.shape[-2:] + if self._input_size is None: + self._input_size = (h, w) + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1 or scale_y != 1: + inputs = F.interpolate( + inputs, + size=self._input_size, + mode='bilinear', + align_corners=False) + for data_sample in data_samples: + img_shape = (int(data_sample.img_shape[0] * scale_y), + int(data_sample.img_shape[1] * scale_x)) + pad_shape = (int(data_sample.pad_shape[0] * scale_y), + int(data_sample.pad_shape[1] * scale_x)) + data_sample.set_metainfo({ + 'img_shape': img_shape, + 'pad_shape': pad_shape, + 'batch_input_shape': self._input_size + }) + + if 'gt_instance_labels' not in data_sample: + continue + + if 'bboxes' in data_sample.gt_instance_labels: + data_sample.gt_instance_labels.bboxes[..., 0::2] *= scale_x + data_sample.gt_instance_labels.bboxes[..., 1::2] *= scale_y + + if 'keypoints' in data_sample.gt_instance_labels: + data_sample.gt_instance_labels.keypoints[..., 0] *= scale_x + data_sample.gt_instance_labels.keypoints[..., 1] *= scale_y + + if 'areas' in data_sample.gt_instance_labels: + data_sample.gt_instance_labels.areas *= scale_x * scale_y + + if 'gt_fields' in data_sample \ + and 'heatmap_mask' in data_sample.gt_fields: + + mask = data_sample.gt_fields.heatmap_mask.unsqueeze(0) + gt_fields = PixelData() + gt_fields.set_field( + F.interpolate( + mask.float(), + size=self._input_size, + mode='bilinear', + align_corners=False).squeeze(0), 'heatmap_mask') + + data_sample.gt_fields = gt_fields + + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + self._input_size = self._get_random_size( + aspect_ratio=float(w / h), device=inputs.device) + return inputs, data_samples + + def _get_random_size(self, aspect_ratio: float, + device: torch.device) -> Tuple[int, int]: + """Randomly generate a shape in ``_random_size_range`` and broadcast to + all ranks.""" + tensor = torch.LongTensor(2).to(device) + if self.rank == 0: + size = random.randint(*self._random_size_range) + size = (self._size_divisor * size, + self._size_divisor * int(aspect_ratio * size)) + tensor[0] = size[0] + tensor[1] = size[1] + barrier() + broadcast(tensor, 0) + input_size = (tensor[0].item(), tensor[1].item()) + return input_size diff --git a/mmpose/models/data_preprocessors/data_preprocessor.py b/mmpose/models/data_preprocessors/data_preprocessor.py index bcfe54ab59..b5ce1e7fdd 100644 --- a/mmpose/models/data_preprocessors/data_preprocessor.py +++ b/mmpose/models/data_preprocessors/data_preprocessor.py @@ -1,5 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Optional, Sequence, Union + +import numpy as np +import torch +import torch.nn as nn from mmengine.model import ImgDataPreprocessor +from mmengine.utils import is_seq_of from mmpose.registry import MODELS @@ -7,3 +13,87 @@ @MODELS.register_module() class PoseDataPreprocessor(ImgDataPreprocessor): """Image pre-processor for pose estimation tasks.""" + + def __init__(self, + mean: Sequence[float] = None, + std: Sequence[float] = None, + pad_size_divisor: int = 1, + pad_value: Union[float, int] = 0, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False, + non_blocking: Optional[bool] = False, + batch_augments: Optional[List[dict]] = None): + super().__init__( + mean=mean, + std=std, + pad_size_divisor=pad_size_divisor, + pad_value=pad_value, + bgr_to_rgb=bgr_to_rgb, + rgb_to_bgr=rgb_to_bgr, + non_blocking=non_blocking) + if batch_augments is not None: + self.batch_augments = nn.ModuleList( + [MODELS.build(aug) for aug in batch_augments]) + else: + self.batch_augments = None + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding and bgr2rgb conversion based on + ``BaseDataPreprocessor``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + batch_pad_shape = self._get_pad_shape(data) + data = super().forward(data=data, training=training) + inputs, data_samples = data['inputs'], data['data_samples'] + batch_input_shape = tuple(inputs[0].size()[-2:]) + for data_sample, pad_shape in zip(data_samples, batch_pad_shape): + data_sample.set_metainfo({ + 'batch_input_shape': batch_input_shape, + 'pad_shape': pad_shape + }) + + if training and self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + return {'inputs': inputs, 'data_samples': data_samples} + + def _get_pad_shape(self, data: dict) -> List[tuple]: + """Get the pad_shape of each image based on data and + pad_size_divisor.""" + _batch_inputs = data['inputs'] + # Process data with `pseudo_collate`. + if is_seq_of(_batch_inputs, torch.Tensor): + batch_pad_shape = [] + for ori_input in _batch_inputs: + pad_h = int( + np.ceil(ori_input.shape[1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(ori_input.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape.append((pad_h, pad_w)) + # Process data with `default_collate`. + elif isinstance(_batch_inputs, torch.Tensor): + assert _batch_inputs.dim() == 4, ( + 'The input of `ImgDataPreprocessor` should be a NCHW tensor ' + 'or a list of tensor, but got a tensor with shape: ' + f'{_batch_inputs.shape}') + pad_h = int( + np.ceil(_batch_inputs.shape[1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(_batch_inputs.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0] + else: + raise TypeError('Output of `cast_data` should be a dict ' + 'or a tuple with inputs and data_samples, but got' + f'{type(data)}: {data}') + return batch_pad_shape diff --git a/mmpose/models/heads/hybrid_heads/__init__.py b/mmpose/models/heads/hybrid_heads/__init__.py index 6431b6a2c2..ff026ce855 100644 --- a/mmpose/models/heads/hybrid_heads/__init__.py +++ b/mmpose/models/heads/hybrid_heads/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .dekr_head import DEKRHead from .vis_head import VisPredictHead +from .yoloxpose_head import YOLOXPoseHead -__all__ = ['DEKRHead', 'VisPredictHead'] +__all__ = ['DEKRHead', 'VisPredictHead', 'YOLOXPoseHead'] diff --git a/mmpose/models/heads/hybrid_heads/yoloxpose_head.py b/mmpose/models/heads/hybrid_heads/yoloxpose_head.py new file mode 100644 index 0000000000..bdd25f7851 --- /dev/null +++ b/mmpose/models/heads/hybrid_heads/yoloxpose_head.py @@ -0,0 +1,752 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmpose.evaluation.functional import nms_torch +from mmpose.models.utils import filter_scores_and_topk +from mmpose.registry import MODELS, TASK_UTILS +from mmpose.structures import PoseDataSample +from mmpose.utils import reduce_mean +from mmpose.utils.typing import (ConfigType, Features, OptSampleList, + Predictions, SampleList) + + +class YOLOXPoseHeadModule(BaseModule): + """YOLOXPose head module for one-stage human pose estimation. + + This module predicts classification scores, bounding boxes, keypoint + offsets and visibilities from multi-level feature maps. + + Args: + num_classes (int): Number of categories excluding the background + category. + num_keypoints (int): Number of keypoints defined for one instance. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + feat_channels (int): Number of channels in the classification score + and objectness prediction branch. Defaults to 256. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_groups (int): Group number of group convolution layers in keypoint + regression branch. Defaults to 8. + channels_per_group (int): Number of channels for each group of group + convolution layers in keypoint regression branch. Defaults to 32. + featmap_strides (Sequence[int]): Downsample factor of each feature + map. Defaults to [8, 16, 32]. + conv_bias (bool or str): If specified as `auto`, it will be decided + by the norm_cfg. Bias of conv will be set as True if `norm_cfg` + is None, otherwise False. Defaults to "auto". + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
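+
+    Example:
+        A forward shape sketch on three dummy feature levels; the
+        channel and stride settings are illustrative:
+
+        >>> import torch
+        >>> self = YOLOXPoseHeadModule(
+        ...     num_keypoints=17, in_channels=256, feat_channels=256)
+        >>> feats = [torch.rand(1, 256, 256 // s, 256 // s)
+        ...          for s in [8, 16, 32]]
+        >>> outs = self.forward(feats)
+        >>> outs[0][0].shape  # classification scores at stride 8
+        torch.Size([1, 1, 32, 32])
+        >>> outs[3][0].shape  # keypoint offsets at stride 8
+        torch.Size([1, 34, 32, 32])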
+ """ + + def __init__( + self, + num_keypoints: int, + in_channels: Union[int, Sequence], + num_classes: int = 1, + widen_factor: float = 1.0, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + conv_bias: Union[bool, str] = 'auto', + conv_cfg: Optional[ConfigType] = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: Optional[ConfigType] = None, + ): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + if isinstance(in_channels, int): + in_channels = int(in_channels * widen_factor) + self.in_channels = in_channels + self.num_keypoints = num_keypoints + + self._init_layers() + + def _init_layers(self): + """Initialize heads for all level feature maps.""" + self._init_cls_branch() + self._init_reg_branch() + self._init_pose_branch() + + def _init_cls_branch(self): + """Initialize classification branch for all level feature maps.""" + self.conv_cls = nn.ModuleList() + for _ in self.featmap_strides: + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + stacked_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + self.conv_cls.append(nn.Sequential(*stacked_convs)) + + # output layers + self.out_cls = nn.ModuleList() + self.out_obj = nn.ModuleList() + for _ in self.featmap_strides: + self.out_cls.append( + nn.Conv2d(self.feat_channels, self.num_classes, 1)) + + def _init_reg_branch(self): + """Initialize classification branch for all level feature maps.""" + self.conv_reg = nn.ModuleList() + for _ in self.featmap_strides: + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + stacked_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + self.conv_reg.append(nn.Sequential(*stacked_convs)) + + # output layers + self.out_bbox = nn.ModuleList() + self.out_obj = nn.ModuleList() + for _ in self.featmap_strides: + self.out_bbox.append(nn.Conv2d(self.feat_channels, 4, 1)) + self.out_obj.append(nn.Conv2d(self.feat_channels, 1, 1)) + + def _init_pose_branch(self): + self.conv_pose = nn.ModuleList() + + for _ in self.featmap_strides: + stacked_convs = [] + for i in range(self.stacked_convs * 2): + in_chn = self.in_channels if i == 0 else self.feat_channels + stacked_convs.append( + ConvModule( + in_chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + self.conv_pose.append(nn.Sequential(*stacked_convs)) + + # output layers + self.out_kpt = nn.ModuleList() + self.out_kpt_vis = nn.ModuleList() + for _ in self.featmap_strides: + self.out_kpt.append( + nn.Conv2d(self.feat_channels, self.num_keypoints * 2, 1)) + self.out_kpt_vis.append( + nn.Conv2d(self.feat_channels, self.num_keypoints, 1)) + + def init_weights(self): + """Initialize weights of the 
head.""" + # Use prior in model initialization to improve stability + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(self.out_cls, self.out_obj): + conv_cls.bias.data.fill_(bias_init) + conv_obj.bias.data.fill_(bias_init) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + cls_scores (List[Tensor]): Classification scores for each level. + objectnesses (List[Tensor]): Objectness scores for each level. + bbox_preds (List[Tensor]): Bounding box predictions for each level. + kpt_offsets (List[Tensor]): Keypoint offsets for each level. + kpt_vis (List[Tensor]): Keypoint visibilities for each level. + """ + + cls_scores, bbox_preds, objectnesses = [], [], [] + kpt_offsets, kpt_vis = [], [] + + for i in range(len(x)): + + cls_feat = self.conv_cls[i](x[i]) + reg_feat = self.conv_reg[i](x[i]) + pose_feat = self.conv_pose[i](x[i]) + + cls_scores.append(self.out_cls[i](cls_feat)) + objectnesses.append(self.out_obj[i](reg_feat)) + bbox_preds.append(self.out_bbox[i](reg_feat)) + kpt_offsets.append(self.out_kpt[i](pose_feat)) + kpt_vis.append(self.out_kpt_vis[i](pose_feat)) + + return cls_scores, objectnesses, bbox_preds, kpt_offsets, kpt_vis + + +@MODELS.register_module() +class YOLOXPoseHead(BaseModule): + + def __init__( + self, + num_keypoints: int, + head_module_cfg: Optional[ConfigType] = None, + featmap_strides: Sequence[int] = [8, 16, 32], + num_classes: int = 1, + use_aux_loss: bool = False, + assigner: ConfigType = None, + prior_generator: ConfigType = None, + loss_cls: Optional[ConfigType] = None, + loss_obj: Optional[ConfigType] = None, + loss_bbox: Optional[ConfigType] = None, + loss_oks: Optional[ConfigType] = None, + loss_vis: Optional[ConfigType] = None, + loss_bbox_aux: Optional[ConfigType] = None, + loss_kpt_aux: Optional[ConfigType] = None, + overlaps_power: float = 1.0, + ): + super().__init__() + + self.featmap_sizes = None + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.use_aux_loss = use_aux_loss + self.num_keypoints = num_keypoints + self.overlaps_power = overlaps_power + + self.prior_generator = TASK_UTILS.build(prior_generator) + if head_module_cfg is not None: + head_module_cfg['featmap_strides'] = featmap_strides + head_module_cfg['num_keypoints'] = num_keypoints + self.head_module = YOLOXPoseHeadModule(**head_module_cfg) + self.assigner = TASK_UTILS.build(assigner) + + # build losses + self.loss_cls = MODELS.build(loss_cls) + self.loss_obj = MODELS.build(loss_obj) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_oks = MODELS.build(loss_oks) + self.loss_vis = MODELS.build(loss_vis) + if loss_bbox_aux is not None: + self.loss_bbox_aux = MODELS.build(loss_bbox_aux) + if loss_kpt_aux is not None: + self.loss_kpt_aux = MODELS.build(loss_kpt_aux) + + def forward(self, feats: Features): + assert isinstance(feats, (tuple, list)) + return self.head_module(feats) + + def loss(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + feats (Tuple[Tensor]): The multi-stage features + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + train_cfg (dict): The runtime config for training process. + Defaults to {} + + Returns: + dict: A dictionary of losses. + """ + + # 1. 
collect & reform predictions + cls_scores, objectnesses, bbox_preds, kpt_offsets, \ + kpt_vis = self.forward(feats) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + flatten_priors = torch.cat(mlvl_priors) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = self._flatten_predictions(cls_scores) + flatten_bbox_preds = self._flatten_predictions(bbox_preds) + flatten_objectness = self._flatten_predictions(objectnesses) + flatten_kpt_offsets = self._flatten_predictions(kpt_offsets) + flatten_kpt_vis = self._flatten_predictions(kpt_vis) + flatten_bbox_decoded = self.decode_bbox(flatten_bbox_preds, + flatten_priors[..., :2], + flatten_priors[..., -1]) + flatten_kpt_decoded = self.decode_kpt_reg(flatten_kpt_offsets, + flatten_priors[..., :2], + flatten_priors[..., -1]) + + # 2. generate targets + targets = self._get_targets(flatten_priors, + flatten_cls_scores.detach(), + flatten_objectness.detach(), + flatten_bbox_decoded.detach(), + flatten_kpt_decoded.detach(), + flatten_kpt_vis.detach(), + batch_data_samples) + pos_masks, cls_targets, obj_targets, obj_weights, \ + bbox_targets, bbox_aux_targets, kpt_targets, kpt_aux_targets, \ + vis_targets, vis_weights, pos_areas, pos_priors, group_indices, \ + num_fg_imgs = targets + + num_pos = torch.tensor( + sum(num_fg_imgs), + dtype=torch.float, + device=flatten_cls_scores.device) + num_total_samples = max(reduce_mean(num_pos), 1.0) + + # 3. calculate loss + # 3.1 objectness loss + losses = dict() + + obj_preds = flatten_objectness.view(-1, 1) + losses['loss_obj'] = self.loss_obj(obj_preds, obj_targets, + obj_weights) / num_total_samples + + if num_pos > 0: + # 3.2 bbox loss + bbox_preds = flatten_bbox_decoded.view(-1, 4)[pos_masks] + losses['loss_bbox'] = self.loss_bbox( + bbox_preds, bbox_targets) / num_total_samples + + # 3.3 keypoint loss + kpt_preds = flatten_kpt_decoded.view(-1, self.num_keypoints, + 2)[pos_masks] + losses['loss_kpt'] = self.loss_oks(kpt_preds, kpt_targets, + vis_targets, pos_areas) + + # 3.4 keypoint visibility loss + kpt_vis_preds = flatten_kpt_vis.view(-1, + self.num_keypoints)[pos_masks] + losses['loss_vis'] = self.loss_vis(kpt_vis_preds, vis_targets, + vis_weights) + + # 3.5 classification loss + cls_preds = flatten_cls_scores.view(-1, + self.num_classes)[pos_masks] + cls_targets = cls_targets.pow(self.overlaps_power).detach() + losses['loss_cls'] = self.loss_cls(cls_preds, + cls_targets) / num_total_samples + + if self.use_aux_loss: + if hasattr(self, 'loss_bbox_aux'): + # 3.6 auxiliary bbox regression loss + bbox_preds_raw = flatten_bbox_preds.view(-1, 4)[pos_masks] + losses['loss_bbox_aux'] = self.loss_bbox_aux( + bbox_preds_raw, bbox_aux_targets) / num_total_samples + + if hasattr(self, 'loss_kpt_aux'): + # 3.7 auxiliary keypoint regression loss + kpt_preds_raw = flatten_kpt_offsets.view( + -1, self.num_keypoints, 2)[pos_masks] + kpt_weights = vis_targets / vis_targets.size(-1) + losses['loss_kpt_aux'] = self.loss_kpt_aux( + kpt_preds_raw, kpt_aux_targets, kpt_weights) + + return losses + + @torch.no_grad() + def _get_targets( + self, + priors: Tensor, + batch_cls_scores: Tensor, + batch_objectness: Tensor, + batch_decoded_bboxes: Tensor, + batch_decoded_kpts: Tensor, + batch_kpt_vis: Tensor, + batch_data_samples: SampleList, + ): + num_imgs = len(batch_data_samples) + + # use clip to avoid nan + batch_cls_scores = 
+            batch_cls_scores.clip(min=-1e4, max=1e4).sigmoid()
+        batch_objectness = batch_objectness.clip(min=-1e4, max=1e4).sigmoid()
+        batch_kpt_vis = batch_kpt_vis.clip(min=-1e4, max=1e4).sigmoid()
+        batch_cls_scores[torch.isnan(batch_cls_scores)] = 0
+        batch_objectness[torch.isnan(batch_objectness)] = 0
+
+        targets_each = []
+        for i in range(num_imgs):
+            target = self._get_targets_single(priors, batch_cls_scores[i],
+                                              batch_objectness[i],
+                                              batch_decoded_bboxes[i],
+                                              batch_decoded_kpts[i],
+                                              batch_kpt_vis[i],
+                                              batch_data_samples[i])
+            targets_each.append(target)
+
+        targets = list(zip(*targets_each))
+        for i, target in enumerate(targets):
+            if torch.is_tensor(target[0]):
+                target = tuple(filter(lambda x: x.size(0) > 0, target))
+                targets[i] = torch.cat(target)
+
+        foreground_masks, cls_targets, obj_targets, obj_weights, \
+            bbox_targets, kpt_targets, vis_targets, vis_weights, pos_areas, \
+            pos_priors, group_indices, num_pos_per_img = targets
+
+        # post-processing for targets
+        if self.use_aux_loss:
+            bbox_cxcy = (bbox_targets[:, :2] + bbox_targets[:, 2:]) / 2.0
+            bbox_wh = bbox_targets[:, 2:] - bbox_targets[:, :2]
+            bbox_aux_targets = torch.cat([
+                (bbox_cxcy - pos_priors[:, :2]) / pos_priors[:, 2:],
+                torch.log(bbox_wh / pos_priors[:, 2:] + 1e-8)
+            ],
+                                         dim=-1)
+
+            kpt_aux_targets = (kpt_targets - pos_priors[:, None, :2]) \
+                / pos_priors[:, None, 2:]
+        else:
+            bbox_aux_targets, kpt_aux_targets = None, None
+
+        return (foreground_masks, cls_targets, obj_targets, obj_weights,
+                bbox_targets, bbox_aux_targets, kpt_targets, kpt_aux_targets,
+                vis_targets, vis_weights, pos_areas, pos_priors, group_indices,
+                num_pos_per_img)
+
+    @torch.no_grad()
+    def _get_targets_single(
+        self,
+        priors: Tensor,
+        cls_scores: Tensor,
+        objectness: Tensor,
+        decoded_bboxes: Tensor,
+        decoded_kpts: Tensor,
+        kpt_vis: Tensor,
+        data_sample: PoseDataSample,
+    ) -> tuple:
+        """Compute classification, bbox, keypoints and objectness targets for
+        priors in a single image.
+
+        Args:
+            priors (Tensor): All priors of one image, a 2D-Tensor with shape
+                [num_priors, 4] in [cx, cy, stride_w, stride_h] format.
+            cls_scores (Tensor): Classification predictions of one image,
+                a 2D-Tensor with shape [num_priors, num_classes].
+            objectness (Tensor): Objectness predictions of one image,
+                a 1D-Tensor with shape [num_priors].
+            decoded_bboxes (Tensor): Decoded bboxes predictions of one image,
+                a 2D-Tensor with shape [num_priors, 4] in xyxy format.
+            decoded_kpts (Tensor): Decoded keypoints predictions of one image,
+                a 3D-Tensor with shape [num_priors, num_keypoints, 2].
+            kpt_vis (Tensor): Keypoints visibility predictions of one image,
+                a 2D-Tensor with shape [num_priors, num_keypoints].
+            data_sample (PoseDataSample): Data sample that contains the
+                ground truth annotations for the current image. Its
+                ``gt_instance_labels`` field should include ``bboxes``,
+                ``labels``, ``keypoints``, ``keypoints_visible`` and
+                ``areas`` attributes.
+
+        Returns:
+            # TODO: modify the description of returned values
+            tuple:
+                foreground_mask (list[Tensor]): Binary mask of foreground
+                targets.
+                cls_target (list[Tensor]): Classification targets of an image.
+                obj_target (list[Tensor]): Objectness targets of an image.
+                bbox_target (list[Tensor]): BBox targets of an image.
+                bbox_aux_target (int): BBox aux targets of an image.
+                num_pos_per_img (int): Number of positive samples in an image.
+ """ + # TODO: change the shape of objectness to [num_priors] + num_priors = priors.size(0) + gt_instances = data_sample.gt_instance_labels + num_gts = len(gt_instances) + + # No target + if num_gts == 0: + cls_target = cls_scores.new_zeros((0, self.num_classes)) + bbox_target = cls_scores.new_zeros((0, 4)) + obj_target = cls_scores.new_zeros((num_priors, 1)) + obj_weight = cls_scores.new_ones((num_priors, 1)) + kpt_target = cls_scores.new_zeros((0, self.num_keypoints, 2)) + vis_target = cls_scores.new_zeros((0, self.num_keypoints)) + vis_weight = cls_scores.new_zeros((0, self.num_keypoints)) + pos_areas = cls_scores.new_zeros((0, )) + pos_priors = priors[:0] + foreground_mask = cls_scores.new_zeros(num_priors).bool() + return (foreground_mask, cls_target, obj_target, obj_weight, + bbox_target, kpt_target, vis_target, vis_weight, pos_areas, + pos_priors, [], 0) + + # assign positive samples + scores = cls_scores * objectness + pred_instances = InstanceData( + bboxes=decoded_bboxes, + scores=scores.sqrt_(), + priors=priors, + keypoints=decoded_kpts, + keypoints_visible=kpt_vis, + ) + assign_result = self.assigner.assign( + pred_instances=pred_instances, gt_instances=gt_instances) + + # sampling + pos_inds = torch.nonzero( + assign_result['gt_inds'] > 0, as_tuple=False).squeeze(-1).unique() + num_pos_per_img = pos_inds.size(0) + pos_gt_labels = assign_result['labels'][pos_inds] + pos_assigned_gt_inds = assign_result['gt_inds'][pos_inds] - 1 + + # bbox target + bbox_target = gt_instances.bboxes[pos_assigned_gt_inds.long()] + + # cls target + max_overlaps = assign_result['max_overlaps'][pos_inds] + cls_target = F.one_hot(pos_gt_labels, + self.num_classes) * max_overlaps.unsqueeze(-1) + + # pose targets + kpt_target = gt_instances.keypoints[pos_assigned_gt_inds] + vis_target = gt_instances.keypoints_visible[pos_assigned_gt_inds] + if 'keypoints_visible_weights' in gt_instances: + vis_weight = gt_instances.keypoints_visible_weights[ + pos_assigned_gt_inds] + else: + vis_weight = vis_target.new_ones(vis_target.shape) + pos_areas = gt_instances.areas[pos_assigned_gt_inds] + + # obj target + obj_target = torch.zeros_like(objectness) + obj_target[pos_inds] = 1 + obj_weight = obj_target.new_ones(obj_target.shape) + + # misc + foreground_mask = torch.zeros_like(objectness.squeeze()).to(torch.bool) + foreground_mask[pos_inds] = 1 + pos_priors = priors[pos_inds] + group_index = [ + torch.where(pos_assigned_gt_inds == num)[0] + for num in torch.unique(pos_assigned_gt_inds) + ] + + return (foreground_mask, cls_target, obj_target, obj_weight, + bbox_target, kpt_target, vis_target, vis_weight, pos_areas, + pos_priors, group_index, num_pos_per_img) + + def predict(self, + feats: Features, + batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from features. + + Args: + feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage + features (or multiple multi-scale features in TTA) + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + test_cfg (dict): The runtime config for testing process. Defaults + to {} + + Returns: + Union[InstanceList | Tuple[InstanceList | PixelDataList]]: If + ``test_cfg['output_heatmap']==True``, return both pose and heatmap + prediction; otherwise only return the pose prediction. 
+ + The pose prediction is a list of ``InstanceData``, each contains + the following fields: + + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + + The heatmap prediction is a list of ``PixelData``, each contains + the following fields: + + - heatmaps (Tensor): The predicted heatmaps in shape (1, h, w) + or (K+1, h, w) if keypoint heatmaps are predicted + - displacements (Tensor): The predicted displacement fields + in shape (K*2, h, w) + """ + + cls_scores, objectnesses, bbox_preds, kpt_offsets, \ + kpt_vis = self.forward(feats) + + cfg = copy.deepcopy(test_cfg) + + batch_img_metas = [d.metainfo for d in batch_data_samples] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full((featmap_size.numel(), ), + stride) for featmap_size, stride in zip( + featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = self._flatten_predictions(cls_scores).sigmoid() + flatten_bbox_preds = self._flatten_predictions(bbox_preds) + flatten_objectness = self._flatten_predictions(objectnesses).sigmoid() + flatten_kpt_offsets = self._flatten_predictions(kpt_offsets) + flatten_kpt_vis = self._flatten_predictions(kpt_vis).sigmoid() + flatten_bbox_preds = self.decode_bbox(flatten_bbox_preds, + flatten_priors, flatten_stride) + flatten_kpt_reg = self.decode_kpt_reg(flatten_kpt_offsets, + flatten_priors, flatten_stride) + + results_list = [] + for (bboxes, scores, objectness, kpt_reg, kpt_vis, + img_meta) in zip(flatten_bbox_preds, flatten_cls_scores, + flatten_objectness, flatten_kpt_reg, + flatten_kpt_vis, batch_img_metas): + + score_thr = cfg.get('score_thr', 0.01) + scores *= objectness + + nms_pre = cfg.get('nms_pre', 100000) + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs_score, results = filter_scores_and_topk( + scores, score_thr, nms_pre, results=dict(labels=labels[:, 0])) + labels = results['labels'] + + bboxes = bboxes[keep_idxs_score] + kpt_vis = kpt_vis[keep_idxs_score] + stride = flatten_stride[keep_idxs_score] + keypoints = kpt_reg[keep_idxs_score] + + if bboxes.numel() > 0: + nms_thr = cfg.get('nms_thr', 1.0) + if nms_thr < 1.0: + keep_idxs_nms = nms_torch(bboxes, scores, nms_thr) + bboxes = bboxes[keep_idxs_nms] + stride = stride[keep_idxs_nms] + labels = labels[keep_idxs_nms] + kpt_vis = kpt_vis[keep_idxs_nms] + keypoints = keypoints[keep_idxs_nms] + scores = scores[keep_idxs_nms] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes, + bbox_scores=scores, + keypoints=keypoints, + keypoint_scores=kpt_vis, + keypoints_visible=kpt_vis) + + input_size = img_meta['input_size'] + results.bboxes[:, 0::2].clamp_(0, input_size[0]) + results.bboxes[:, 1::2].clamp_(0, input_size[1]) + + results_list.append(results.numpy()) + + return results_list + + def decode_bbox(self, pred_bboxes: torch.Tensor, priors: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode 
regression results (delta_x, delta_y, log_w, log_h) to + bounding boxes (tl_x, tl_y, br_x, br_y). + + Note: + - batch size: B + - token number: N + + Args: + pred_bboxes (torch.Tensor): Encoded boxes with shape (B, N, 4), + representing (delta_x, delta_y, log_w, log_h) for each box. + priors (torch.Tensor): Anchors coordinates, with shape (N, 2). + stride (torch.Tensor | int): Strides of the bboxes. It can be a + single value if the same stride applies to all boxes, or it + can be a tensor of shape (N, ) if different strides are used + for each box. + + Returns: + torch.Tensor: Decoded bounding boxes with shape (N, 4), + representing (tl_x, tl_y, br_x, br_y) for each box. + """ + stride = stride.view(1, stride.size(0), 1) + priors = priors.view(1, priors.size(0), 2) + + xys = (pred_bboxes[..., :2] * stride) + priors + whs = pred_bboxes[..., 2:].exp() * stride + + # Calculate bounding box corners + tl_x = xys[..., 0] - whs[..., 0] / 2 + tl_y = xys[..., 1] - whs[..., 1] / 2 + br_x = xys[..., 0] + whs[..., 0] / 2 + br_y = xys[..., 1] + whs[..., 1] / 2 + + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes + + def decode_kpt_reg(self, pred_kpt_offsets: torch.Tensor, + priors: torch.Tensor, + stride: torch.Tensor) -> torch.Tensor: + """Decode regression results (delta_x, delta_y) to keypoints + coordinates (x, y). + + Args: + pred_kpt_offsets (torch.Tensor): Encoded keypoints offsets with + shape (batch_size, num_anchors, num_keypoints, 2). + priors (torch.Tensor): Anchors coordinates with shape + (num_anchors, 2). + stride (torch.Tensor): Strides of the anchors. + + Returns: + torch.Tensor: Decoded keypoints coordinates with shape + (batch_size, num_boxes, num_keypoints, 2). + """ + stride = stride.view(1, stride.size(0), 1, 1) + priors = priors.view(1, priors.size(0), 1, 2) + pred_kpt_offsets = pred_kpt_offsets.reshape( + *pred_kpt_offsets.shape[:-1], self.num_keypoints, 2) + + decoded_kpts = pred_kpt_offsets * stride + priors + return decoded_kpts + + def _flatten_predictions(self, preds: List[Tensor]): + """Flattens the predictions from a list of tensors to a single + tensor.""" + preds = [x.permute(0, 2, 3, 1).flatten(1, 2) for x in preds] + return torch.cat(preds, dim=1) diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py index 57ba98fe46..92ed569bab 100644 --- a/mmpose/models/losses/__init__.py +++ b/mmpose/models/losses/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
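# Illustrative sketch (not part of the patch): the box decoding rule used by
# `decode_bbox` above, with concrete numbers. The rule is
# center = offset * stride + prior and size = exp(pred) * stride, so an
# all-zero prediction decodes to a stride-sized box centred on its prior.
import torch

priors = torch.tensor([[8.0, 8.0]])   # one prior point (x, y)
stride = torch.tensor([8.0])          # its stride
pred = torch.zeros(1, 1, 4)           # (B=1, N=1, 4) raw prediction

xys = pred[..., :2] * stride.view(1, -1, 1) + priors.view(1, -1, 2)
whs = pred[..., 2:].exp() * stride.view(1, -1, 1)
box = torch.cat([xys - whs / 2, xys + whs / 2], dim=-1)
print(box)  # tensor([[[ 4.,  4., 12., 12.]]])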
 from .ae_loss import AssociativeEmbeddingLoss
+from .bbox_loss import IoULoss
 from .classification_loss import BCELoss, JSDiscretLoss, KLDiscretLoss
 from .fea_dis_loss import FeaLoss
 from .heatmap_loss import (AdaptiveWingLoss, KeypointMSELoss,
@@ -7,8 +8,8 @@
 from .logit_dis_loss import KDLoss
 from .loss_wrappers import CombinedLoss, MultipleLossWrapper
 from .regression_loss import (BoneLoss, L1Loss, MPJPELoss,
-                              MPJPEVelocityJointLoss, MSELoss, RLELoss,
-                              SemiSupervisionLoss, SmoothL1Loss,
+                              MPJPEVelocityJointLoss, MSELoss, OKSLoss,
+                              RLELoss, SemiSupervisionLoss, SmoothL1Loss,
                               SoftWeightSmoothL1Loss, SoftWingLoss, WingLoss)

 __all__ = [
@@ -17,5 +18,5 @@
     'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss',
     'KLDiscretLoss', 'MultipleLossWrapper', 'JSDiscretLoss', 'CombinedLoss',
     'AssociativeEmbeddingLoss', 'SoftWeightSmoothL1Loss',
-    'MPJPEVelocityJointLoss', 'FeaLoss', 'KDLoss'
+    'MPJPEVelocityJointLoss', 'FeaLoss', 'KDLoss', 'OKSLoss', 'IoULoss'
 ]
diff --git a/mmpose/models/losses/bbox_loss.py b/mmpose/models/losses/bbox_loss.py
new file mode 100644
index 0000000000..b216dcdb4a
--- /dev/null
+++ b/mmpose/models/losses/bbox_loss.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+
+from mmpose.registry import MODELS
+from mmpose.structures.bbox import bbox_overlaps
+
+
+@MODELS.register_module()
+class IoULoss(nn.Module):
+    """IoU loss for bounding box regression.
+
+    Args:
+        reduction (str): Options are "none", "mean" and "sum".
+        eps (float): Epsilon to avoid log(0).
+        loss_weight (float): Weight of the loss. Default: 1.0.
+        mode (str): Loss scaling mode, including "linear", "square", and "log".
+            Default: 'log'
+    """
+
+    def __init__(self,
+                 reduction='mean',
+                 mode='log',
+                 eps: float = 1e-16,
+                 loss_weight=1.):
+        super().__init__()
+
+        assert reduction in ('mean', 'sum', 'none'), f'the argument ' \
+            f'`reduction` should be either \'mean\', \'sum\' or \'none\', ' \
+            f'but got {reduction}'
+
+        assert mode in ('linear', 'square', 'log'), f'the argument ' \
+            f'`mode` should be either \'linear\', \'square\' or ' \
+            f'\'log\', but got {mode}'
+
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.mode = mode
+        self.eps = eps
+
+    def forward(self, output, target):
+        """Forward function.
+
+        Note:
+            - batch_size: N
+
+        Args:
+            output (torch.Tensor[N, 4]): Predicted bboxes in (x1, y1, x2, y2)
+                format.
+            target (torch.Tensor[N, 4]): Target bboxes in the same format.
+        """
+        ious = bbox_overlaps(
+            output, target, is_aligned=True).clamp(min=self.eps)
+
+        if self.mode == 'linear':
+            loss = 1 - ious
+        elif self.mode == 'square':
+            loss = 1 - ious.pow(2)
+        elif self.mode == 'log':
+            loss = -ious.log()
+        else:
+            raise NotImplementedError
+
+        if self.reduction == 'sum':
+            loss = loss.sum()
+        elif self.reduction == 'mean':
+            loss = loss.mean()
+
+        return loss * self.loss_weight
diff --git a/mmpose/models/losses/classification_loss.py b/mmpose/models/losses/classification_loss.py
index 5d2a2c7a58..2421e74819 100644
--- a/mmpose/models/losses/classification_loss.py
+++ b/mmpose/models/losses/classification_loss.py
@@ -1,4 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from functools import partial
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -13,6 +15,7 @@ class BCELoss(nn.Module):

     Args:
         use_target_weight (bool): Option to use weighted loss.
Different joint types may have different target weights. + reduction (str): Options are "none", "mean" and "sum". loss_weight (float): Weight of the loss. Default: 1.0. use_sigmoid (bool, optional): Whether the prediction uses sigmoid before output. Defaults to False. @@ -21,11 +24,19 @@ class BCELoss(nn.Module): def __init__(self, use_target_weight=False, loss_weight=1., + reduction='mean', use_sigmoid=False): super().__init__() + + assert reduction in ('mean', 'sum', 'none'), f'the argument ' \ + f'`reduction` should be either \'mean\', \'sum\' or \'none\', ' \ + f'but got {reduction}' + + self.reduction = reduction self.use_sigmoid = use_sigmoid - self.criterion = F.binary_cross_entropy if use_sigmoid \ + criterion = F.binary_cross_entropy if use_sigmoid \ else F.binary_cross_entropy_with_logits + self.criterion = partial(criterion, reduction='none') self.use_target_weight = use_target_weight self.loss_weight = loss_weight @@ -45,13 +56,18 @@ def forward(self, output, target, target_weight=None): if self.use_target_weight: assert target_weight is not None - loss = self.criterion(output, target, reduction='none') + loss = self.criterion(output, target) if target_weight.dim() == 1: target_weight = target_weight[:, None] - loss = (loss * target_weight).mean() + loss = (loss * target_weight) else: loss = self.criterion(output, target) + if self.reduction == 'sum': + loss = loss.sum() + elif self.reduction == 'mean': + loss = loss.mean() + return loss * self.loss_weight diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py index b50ad99f04..948d65bae7 100644 --- a/mmpose/models/losses/regression_loss.py +++ b/mmpose/models/losses/regression_loss.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. import math from functools import partial +from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F +from mmpose.datasets.datasets.utils import parse_pose_metainfo from mmpose.registry import MODELS from ..utils.realnvp import RealNVP @@ -485,11 +487,19 @@ def forward(self, output, target, target_weight=None): @MODELS.register_module() class L1Loss(nn.Module): - """L1Loss loss .""" + """L1Loss loss.""" - def __init__(self, use_target_weight=False, loss_weight=1.): + def __init__(self, + reduction='mean', + use_target_weight=False, + loss_weight=1.): super().__init__() - self.criterion = F.l1_loss + + assert reduction in ('mean', 'sum', 'none'), f'the argument ' \ + f'`reduction` should be either \'mean\', \'sum\' or \'none\', ' \ + f'but got {reduction}' + + self.criterion = partial(F.l1_loss, reduction=reduction) self.use_target_weight = use_target_weight self.loss_weight = loss_weight @@ -508,6 +518,8 @@ def forward(self, output, target, target_weight=None): """ if self.use_target_weight: assert target_weight is not None + for _ in range(target.ndim - target_weight.ndim): + target_weight = target_weight.unsqueeze(-1) loss = self.criterion(output * target_weight, target * target_weight) else: @@ -694,3 +706,108 @@ def forward(self, output, target): losses['bone_loss'] = loss_bone return losses + + +@MODELS.register_module() +class OKSLoss(nn.Module): + """A PyTorch implementation of the Object Keypoint Similarity (OKS) loss as + described in the paper "YOLO-Pose: Enhancing YOLO for Multi Person Pose + Estimation Using Object Keypoint Similarity Loss" by Debapriya et al. + (2022). 
+
+    The OKS loss is used for keypoint-based object recognition and consists
+    of a measure of the similarity between predicted and ground truth
+    keypoint locations, adjusted by the size of the object in the image.
+
+    The loss function takes as input the predicted keypoint locations, the
+    ground truth keypoint locations, a mask indicating which keypoints are
+    valid, and bounding boxes for the objects.
+
+    Args:
+        metainfo (Optional[str]): Path to a dataset meta information file,
+            e.g. a config under ``configs/_base_/datasets``.
+        reduction (str): Options are "none", "mean" and "sum".
+        eps (float): Epsilon to avoid log(0).
+        loss_weight (float): Weight of the loss. Default: 1.0.
+        mode (str): Loss scaling mode, including "linear", "square", and "log".
+            Default: 'linear'
+        norm_target_weight (bool): whether to normalize the target weight
+            with number of visible keypoints. Defaults to False.
+    """
+
+    def __init__(self,
+                 metainfo: Optional[str] = None,
+                 reduction='mean',
+                 mode='linear',
+                 eps=1e-8,
+                 norm_target_weight=False,
+                 loss_weight=1.):
+        super().__init__()
+
+        assert reduction in ('mean', 'sum', 'none'), f'the argument ' \
+            f'`reduction` should be either \'mean\', \'sum\' or \'none\', ' \
+            f'but got {reduction}'
+
+        assert mode in ('linear', 'square', 'log'), f'the argument ' \
+            f'`mode` should be either \'linear\', \'square\' or ' \
+            f'\'log\', but got {mode}'
+
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.mode = mode
+        self.norm_target_weight = norm_target_weight
+        self.eps = eps
+
+        if metainfo is not None:
+            metainfo = parse_pose_metainfo(dict(from_file=metainfo))
+            sigmas = metainfo.get('sigmas', None)
+            if sigmas is not None:
+                self.register_buffer('sigmas', torch.as_tensor(sigmas))
+
+    def forward(self, output, target, target_weight=None, areas=None):
+        """Forward function.
+
+        Note:
+            - batch_size: N
+            - num_labels: K
+
+        Args:
+            output (torch.Tensor[N, K, 2]): Output keypoints coordinates.
+            target (torch.Tensor[N, K, 2]): Target keypoints coordinates.
+            target_weight (torch.Tensor[N, K]): Loss weight for each keypoint.
+            areas (torch.Tensor[N]): Instance size which is adopted as
+                normalization factor.
+        """
+        dist = torch.norm(output - target, dim=-1)
+        if areas is not None:
+            dist = dist / areas.pow(0.5).clip(min=self.eps).unsqueeze(-1)
+        if hasattr(self, 'sigmas'):
+            sigmas = self.sigmas.reshape(*((1, ) * (dist.ndim - 1)), -1)
+            dist = dist / (sigmas * 2)
+
+        oks = torch.exp(-dist.pow(2) / 2)
+
+        if target_weight is not None:
+            if self.norm_target_weight:
+                target_weight = target_weight / target_weight.sum(
+                    dim=-1, keepdims=True).clip(min=self.eps)
+            else:
+                target_weight = target_weight / target_weight.size(-1)
+            oks = oks * target_weight
+        oks = oks.sum(dim=-1)
+
+        if self.mode == 'linear':
+            loss = 1 - oks
+        elif self.mode == 'square':
+            loss = 1 - oks.pow(2)
+        elif self.mode == 'log':
+            loss = -oks.log()
+        else:
+            raise NotImplementedError()
+
+        if self.reduction == 'sum':
+            loss = loss.sum()
+        elif self.reduction == 'mean':
+            loss = loss.mean()
+
+        return loss * self.loss_weight
diff --git a/mmpose/models/necks/__init__.py b/mmpose/models/necks/__init__.py
index b4f9105cb3..c9d14cefc8 100644
--- a/mmpose/models/necks/__init__.py
+++ b/mmpose/models/necks/__init__.py
@@ -1,9 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
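# Illustrative sketch (not part of the patch): the OKS computation performed
# by `OKSLoss.forward` above, for the case where no dataset `sigmas` buffer
# is registered (distances are then normalised by sqrt(area) only). The
# values below are hypothetical.
import torch

output = torch.tensor([[[1.0, 1.0], [4.0, 4.0]]])  # (N=1, K=2, 2) predictions
target = torch.tensor([[[1.0, 1.0], [1.0, 4.0]]])  # ground-truth keypoints
weight = torch.tensor([[1.0, 1.0]])                # both keypoints annotated
areas = torch.tensor([9.0])                        # instance area

dist = torch.norm(output - target, dim=-1)         # tensor([[0., 3.]])
dist = dist / areas.pow(0.5).clip(min=1e-8).unsqueeze(-1)
oks = torch.exp(-dist.pow(2) / 2)                  # per-keypoint similarity
oks = (oks * (weight / weight.size(-1))).sum(dim=-1)
print(1 - oks)  # linear-mode loss, tensor([0.1967])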
+from .channel_mapper import ChannelMapper from .fmap_proc_neck import FeatureMapProcessor from .fpn import FPN from .gap_neck import GlobalAveragePooling from .posewarper_neck import PoseWarperNeck +from .yolox_pafpn import YOLOXPAFPN __all__ = [ - 'GlobalAveragePooling', 'PoseWarperNeck', 'FPN', 'FeatureMapProcessor' + 'GlobalAveragePooling', 'PoseWarperNeck', 'FPN', 'FeatureMapProcessor', + 'ChannelMapper', 'YOLOXPAFPN' ] diff --git a/mmpose/models/necks/channel_mapper.py b/mmpose/models/necks/channel_mapper.py new file mode 100644 index 0000000000..246ed363d8 --- /dev/null +++ b/mmpose/models/necks/channel_mapper.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmpose.registry import MODELS +from mmpose.utils.typing import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class ChannelMapper(BaseModule): + """Channel Mapper to reduce/increase channels of backbone features. + + This is used to reduce/increase channels of backbone features. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + kernel_size (int, optional): kernel_size for reducing channels (used + at each scale). Default: 3. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Default: None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Default: None. + act_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + activation layer in ConvModule. Default: dict(type='ReLU'). + num_outs (int, optional): Number of output feature maps. There would + be extra_convs when num_outs larger than the length of in_channels. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or dict], + optional): Initialization config dict. + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = ChannelMapper(in_channels, 11, 3).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + kernel_size: int = 3, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + act_cfg: OptConfigType = dict(type='ReLU'), + num_outs: int = None, + init_cfg: OptMultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, list) + self.extra_convs = None + if num_outs is None: + num_outs = len(in_channels) + self.convs = nn.ModuleList() + for in_channel in in_channels: + self.convs.append( + ConvModule( + in_channel, + out_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + if num_outs > len(in_channels): + self.extra_convs = nn.ModuleList() + for i in range(len(in_channels), num_outs): + if i == len(in_channels): + in_channel = in_channels[-1] + else: + in_channel = out_channels + self.extra_convs.append( + ConvModule( + in_channel, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function.""" + assert len(inputs) == len(self.convs) + outs = [self.convs[i](inputs[i]) for i in range(len(inputs))] + if self.extra_convs: + for i in range(len(self.extra_convs)): + if i == 0: + outs.append(self.extra_convs[0](inputs[-1])) + else: + outs.append(self.extra_convs[i](outs[-1])) + return tuple(outs) diff --git a/mmpose/models/necks/yolox_pafpn.py b/mmpose/models/necks/yolox_pafpn.py new file mode 100644 index 0000000000..adc4cfffa3 --- /dev/null +++ b/mmpose/models/necks/yolox_pafpn.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule + +from mmpose.registry import MODELS +from ..utils import CSPLayer + + +@MODELS.register_module() +class YOLOXPAFPN(BaseModule): + """Path Aggregation Network used in YOLOX. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3 + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. Default: False + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(scale_factor=2, mode='nearest')` + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish') + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ """ + + def __init__(self, + in_channels, + out_channels, + num_csp_blocks=3, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu')): + super(YOLOXPAFPN, self).__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + # build top-down blocks + self.upsample = nn.Upsample(**upsample_cfg) + self.reduce_layers = nn.ModuleList() + self.top_down_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.reduce_layers.append( + ConvModule( + in_channels[idx], + in_channels[idx - 1], + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.top_down_blocks.append( + CSPLayer( + in_channels[idx - 1] * 2, + in_channels[idx - 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # build bottom-up blocks + self.downsamples = nn.ModuleList() + self.bottom_up_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv( + in_channels[idx], + in_channels[idx], + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottom_up_blocks.append( + CSPLayer( + in_channels[idx] * 2, + in_channels[idx + 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.out_convs = nn.ModuleList() + for i in range(len(in_channels)): + self.out_convs.append( + ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + + Returns: + tuple[Tensor]: YOLOXPAFPN features. 
+ """ + assert len(inputs) == len(self.in_channels) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + feat_heigh = self.reduce_layers[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = self.upsample(feat_heigh) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + torch.cat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx]( + torch.cat([downsample_feat, feat_height], 1)) + outs.append(out) + + # out convs + for idx, conv in enumerate(self.out_convs): + outs[idx] = conv(outs[idx]) + + return tuple(outs) diff --git a/mmpose/models/pose_estimators/base.py b/mmpose/models/pose_estimators/base.py index 0ae921d0ec..e98b2caeb8 100644 --- a/mmpose/models/pose_estimators/base.py +++ b/mmpose/models/pose_estimators/base.py @@ -3,6 +3,8 @@ from typing import Tuple, Union import torch +from mmengine.dist import get_world_size +from mmengine.logging import print_log from mmengine.model import BaseModel from torch import Tensor @@ -22,6 +24,7 @@ class BasePoseEstimator(BaseModel, metaclass=ABCMeta): config of :class:`BaseDataPreprocessor`. Defaults to ``None`` init_cfg (dict | ConfigDict): The model initialization config. Defaults to ``None`` + use_syncbn (bool): whether to use SyncBatchNorm. Defaults to False. metainfo (dict): Meta information for dataset, such as keypoints definition and properties. If set, the metainfo of the input data batch will be overridden. For more details, please refer to @@ -38,11 +41,14 @@ def __init__(self, train_cfg: OptConfigType = None, test_cfg: OptConfigType = None, data_preprocessor: OptConfigType = None, + use_syncbn: bool = False, init_cfg: OptMultiConfig = None, metainfo: Optional[dict] = None): super().__init__( data_preprocessor=data_preprocessor, init_cfg=init_cfg) self.metainfo = self._load_metainfo(metainfo) + self.train_cfg = train_cfg if train_cfg else {} + self.test_cfg = test_cfg if test_cfg else {} self.backbone = MODELS.build(backbone) @@ -57,13 +63,16 @@ def __init__(self, if head is not None: self.head = MODELS.build(head) - - self.train_cfg = train_cfg if train_cfg else {} - self.test_cfg = test_cfg if test_cfg else {} + self.head.test_cfg = self.test_cfg.copy() # Register the hook to automatically convert old version state dicts self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook) + # TODO: Waiting for mmengine support + if use_syncbn and get_world_size() > 1: + torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) + print_log('Using SyncBatchNorm()', 'current') + @property def with_neck(self) -> bool: """bool: whether the pose estimator has a neck.""" diff --git a/mmpose/models/pose_estimators/bottomup.py b/mmpose/models/pose_estimators/bottomup.py index e7d2aaef88..7b82980a13 100644 --- a/mmpose/models/pose_estimators/bottomup.py +++ b/mmpose/models/pose_estimators/bottomup.py @@ -23,6 +23,7 @@ class BottomupPoseEstimator(BasePoseEstimator): Defaults to ``None`` test_cfg (dict, optional): The runtime config for testing process. Defaults to ``None`` + use_syncbn (bool): whether to use SyncBatchNorm. Defaults to False. 
data_preprocessor (dict, optional): The data preprocessing config to build the instance of :class:`BaseDataPreprocessor`. Defaults to ``None``. @@ -36,6 +37,7 @@ def __init__(self, head: OptConfigType = None, train_cfg: OptConfigType = None, test_cfg: OptConfigType = None, + use_syncbn: bool = False, data_preprocessor: OptConfigType = None, init_cfg: OptMultiConfig = None): super().__init__( @@ -44,6 +46,7 @@ def __init__(self, head=head, train_cfg=train_cfg, test_cfg=test_cfg, + use_syncbn=use_syncbn, data_preprocessor=data_preprocessor, init_cfg=init_cfg) @@ -162,17 +165,25 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, for pred_instances, pred_fields, data_sample in zip_longest( batch_pred_instances, batch_pred_fields, batch_data_samples): - # convert keypoint coordinates from input space to image space input_size = data_sample.metainfo['input_size'] input_center = data_sample.metainfo['input_center'] input_scale = data_sample.metainfo['input_scale'] + # convert keypoint coordinates from input space to image space pred_instances.keypoints = pred_instances.keypoints / input_size \ * input_scale + input_center - 0.5 * input_scale if 'keypoints_visible' not in pred_instances: pred_instances.keypoints_visible = \ pred_instances.keypoint_scores + # convert bbox coordinates from input space to image space + if 'bboxes' in pred_instances: + bboxes = pred_instances.bboxes.reshape( + pred_instances.bboxes.shape[0], 2, 2) + bboxes = bboxes / input_size * input_scale + input_center \ + - 0.5 * input_scale + pred_instances.bboxes = bboxes.reshape(bboxes.shape[0], 4) + data_sample.pred_instances = pred_instances if pred_fields is not None: diff --git a/mmpose/models/task_modules/__init__.py b/mmpose/models/task_modules/__init__.py new file mode 100644 index 0000000000..caecfb9d33 --- /dev/null +++ b/mmpose/models/task_modules/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigners import * # noqa +from .prior_generators import * # noqa diff --git a/mmpose/models/task_modules/assigners/__init__.py b/mmpose/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000000..7b6b006e38 --- /dev/null +++ b/mmpose/models/task_modules/assigners/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .metric_calculators import BBoxOverlaps2D, PoseOKS +from .sim_ota_assigner import SimOTAAssigner + +__all__ = ['SimOTAAssigner', 'PoseOKS', 'BBoxOverlaps2D'] diff --git a/mmpose/models/task_modules/assigners/metric_calculators.py b/mmpose/models/task_modules/assigners/metric_calculators.py new file mode 100644 index 0000000000..ebf4333b66 --- /dev/null +++ b/mmpose/models/task_modules/assigners/metric_calculators.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from torch import Tensor + +from mmpose.datasets.datasets.utils import parse_pose_metainfo +from mmpose.registry import TASK_UTILS +from mmpose.structures.bbox import bbox_overlaps + + +def cast_tensor_type(x, scale=1., dtype=None): + if dtype == 'fp16': + # scale is for preventing overflows + x = (x / scale).half() + return x + + +@TASK_UTILS.register_module() +class BBoxOverlaps2D: + """2D Overlaps (e.g. IoUs, GIoUs) Calculator.""" + + def __init__(self, scale=1., dtype=None): + self.scale = scale + self.dtype = dtype + + @torch.no_grad() + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. 
+
+        Args:
+            bboxes1 (Tensor or :obj:`BaseBoxes`): bboxes have shape (m, 4)
+                in <x1, y1, x2, y2> format, or shape (m, 5) in <x1, y1, x2, y2, score> format.
+            bboxes2 (Tensor or :obj:`BaseBoxes`): bboxes have shape (n, 4)
+                in <x1, y1, x2, y2> format, shape (n, 5) in <x1, y1, x2, y2, score> format, or be empty. If ``is_aligned`` is ``True``,
+                then m and n must be equal.
+            mode (str): "iou" (intersection over union), "iof" (intersection
+                over foreground), or "giou" (generalized intersection over
+                union).
+            is_aligned (bool, optional): If True, then m and n must be equal.
+                Default False.
+
+        Returns:
+            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+        """
+        assert bboxes1.size(-1) in [0, 4, 5]
+        assert bboxes2.size(-1) in [0, 4, 5]
+        if bboxes2.size(-1) == 5:
+            bboxes2 = bboxes2[..., :4]
+        if bboxes1.size(-1) == 5:
+            bboxes1 = bboxes1[..., :4]
+
+        if self.dtype == 'fp16':
+            # change tensor type to save cpu and cuda memory and keep speed
+            bboxes1 = cast_tensor_type(bboxes1, self.scale, self.dtype)
+            bboxes2 = cast_tensor_type(bboxes2, self.scale, self.dtype)
+            overlaps = bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+            if not overlaps.is_cuda and overlaps.dtype == torch.float16:
+                # resume cpu float32
+                overlaps = overlaps.float()
+            return overlaps
+
+        return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+
+    def __repr__(self):
+        """str: a string describing the module"""
+        repr_str = self.__class__.__name__ + f'(' \
+            f'scale={self.scale}, dtype={self.dtype})'
+        return repr_str
+
+
+@TASK_UTILS.register_module()
+class PoseOKS:
+    """OKS score Calculator."""
+
+    def __init__(self,
+                 metainfo: Optional[str] = 'configs/_base_/datasets/coco.py'):
+
+        if metainfo is not None:
+            metainfo = parse_pose_metainfo(dict(from_file=metainfo))
+            sigmas = metainfo.get('sigmas', None)
+            if sigmas is not None:
+                self.sigmas = torch.as_tensor(sigmas)
+
+    @torch.no_grad()
+    def __call__(self,
+                 output: Tensor,
+                 target: Tensor,
+                 target_weights: Tensor,
+                 areas: Tensor,
+                 eps: float = 1e-8) -> Tensor:
+
+        dist = torch.norm(output - target, dim=-1)
+        areas = areas.reshape(*((1, ) * (dist.ndim - 2)), -1, 1)
+        dist = dist / areas.pow(0.5).clip(min=eps)
+
+        if hasattr(self, 'sigmas'):
+            if self.sigmas.device != dist.device:
+                self.sigmas = self.sigmas.to(dist.device)
+            sigmas = self.sigmas.reshape(*((1, ) * (dist.ndim - 1)), -1)
+            dist = dist / (sigmas * 2)
+
+        target_weights = target_weights / target_weights.sum(
+            dim=-1, keepdims=True).clip(min=eps)
+        oks = (torch.exp(-dist.pow(2) / 2) * target_weights).sum(dim=-1)
+        return oks
diff --git a/mmpose/models/task_modules/assigners/sim_ota_assigner.py b/mmpose/models/task_modules/assigners/sim_ota_assigner.py
new file mode 100644
index 0000000000..69c7ed677e
--- /dev/null
+++ b/mmpose/models/task_modules/assigners/sim_ota_assigner.py
@@ -0,0 +1,284 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmpose.registry import TASK_UTILS
+from mmpose.utils.typing import ConfigType
+
+INF = 100000.0
+EPS = 1.0e-7
+
+
+@TASK_UTILS.register_module()
+class SimOTAAssigner:
+    """Computes matching between predictions and ground truth.
+
+    Args:
+        center_radius (float): Radius of center area to determine
+            if a prior is in the center of a gt. Defaults to 2.5.
+        candidate_topk (int): Top-k ious candidates to calculate dynamic-k.
+            Defaults to 10.
+        iou_weight (float): Weight of bbox iou cost. Defaults to 3.0.
+        cls_weight (float): Weight of classification cost. Defaults to 1.0.
+ oks_weight (float): Weight of keypoint OKS cost. Defaults to 3.0. + vis_weight (float): Weight of keypoint visibility cost. Defaults to 0.0 + dynamic_k_indicator (str): Cost type for calculating dynamic-k, + either 'iou' or 'oks'. Defaults to 'iou'. + iou_calculator (dict): Config of IoU calculation method. + Defaults to dict(type='BBoxOverlaps2D'). + oks_calculator (dict): Config of OKS calculation method. + Defaults to dict(type='PoseOKS'). + """ + + def __init__(self, + center_radius: float = 2.5, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0, + oks_weight: float = 3.0, + vis_weight: float = 0.0, + dynamic_k_indicator: str = 'iou', + iou_calculator: ConfigType = dict(type='BBoxOverlaps2D'), + oks_calculator: ConfigType = dict(type='PoseOKS')): + self.center_radius = center_radius + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + self.oks_weight = oks_weight + self.vis_weight = vis_weight + assert dynamic_k_indicator in ('iou', 'oks'), f'the argument ' \ + f'`dynamic_k_indicator` should be either \'iou\' or \'oks\', ' \ + f'but got {dynamic_k_indicator}' + self.dynamic_k_indicator = dynamic_k_indicator + + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.oks_calculator = TASK_UTILS.build(oks_calculator) + + def assign(self, pred_instances: InstanceData, gt_instances: InstanceData, + **kwargs) -> dict: + """Assign gt to priors using SimOTA. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + Returns: + dict: Assignment result containing assigned gt indices, + max iou overlaps, assigned labels, etc. 
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + gt_keypoints = gt_instances.keypoints + gt_keypoints_visible = gt_instances.keypoints_visible + gt_areas = gt_instances.areas + num_gt = gt_bboxes.size(0) + + decoded_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + priors = pred_instances.priors + keypoints = pred_instances.keypoints + keypoints_visible = pred_instances.keypoints_visible + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return dict( + num_gts=num_gt, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( + priors, gt_bboxes) + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + valid_pred_kpts = keypoints[valid_mask] + valid_pred_kpts_vis = keypoints_visible[valid_mask] + + num_valid = valid_decoded_bbox.size(0) + if num_valid == 0: + # No valid bboxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return dict( + num_gts=num_gt, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + cost_matrix = (~is_in_boxes_and_center) * INF + + # calculate iou + pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) + if self.iou_weight > 0: + iou_cost = -torch.log(pairwise_ious + EPS) + cost_matrix = cost_matrix + iou_cost * self.iou_weight + + # calculate oks + if self.oks_weight > 0 or self.dynamic_k_indicator == 'oks': + pairwise_oks = self.oks_calculator( + valid_pred_kpts.unsqueeze(1), # [num_valid, 1, k, 2] + target=gt_keypoints.unsqueeze(0), # [1, num_gt, k, 2] + target_weights=gt_keypoints_visible.unsqueeze( + 0), # [1, num_gt, k] + areas=gt_areas.unsqueeze(0), # [1, num_gt] + ) # -> [num_valid, num_gt] + + oks_cost = -torch.log(pairwise_oks + EPS) + cost_matrix = cost_matrix + oks_cost * self.oks_weight + + # calculate cls + if self.cls_weight > 0: + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat( + 1, num_gt, 1) + # disable AMP autocast to avoid overflow + with torch.cuda.amp.autocast(enabled=False): + cls_cost = ( + F.binary_cross_entropy( + valid_pred_scores.to(dtype=torch.float32), + gt_onehot_label, + reduction='none', + ).sum(-1).to(dtype=valid_pred_scores.dtype)) + cost_matrix = cost_matrix + cls_cost * self.cls_weight + # calculate vis + if self.vis_weight > 0: + valid_pred_kpts_vis = valid_pred_kpts_vis.unsqueeze(1).repeat( + 1, num_gt, 1) # [num_valid, 1, k] + gt_kpt_vis = gt_keypoints_visible.unsqueeze( + 0).float() # [1, num_gt, k] + with torch.cuda.amp.autocast(enabled=False): + vis_cost = ( + F.binary_cross_entropy( + valid_pred_kpts_vis.to(dtype=torch.float32), + gt_kpt_vis.repeat(num_valid, 1, 1), + reduction='none', + ).sum(-1).to(dtype=valid_pred_kpts_vis.dtype)) + cost_matrix = cost_matrix + vis_cost * self.vis_weight + + if self.dynamic_k_indicator == 'iou': + matched_pred_ious, matched_gt_inds = \ + self.dynamic_k_matching( 
+ cost_matrix, pairwise_ious, num_gt, valid_mask) + elif self.dynamic_k_indicator == 'oks': + matched_pred_ious, matched_gt_inds = \ + self.dynamic_k_matching( + cost_matrix, pairwise_oks, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_ious.to(max_overlaps) + return dict( + num_gts=num_gt, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + def get_in_gt_and_in_center_info(self, priors: Tensor, gt_bboxes: Tensor + ) -> Tuple[Tensor, Tensor]: + """Get the information of which prior is in gt bboxes and gt center + priors.""" + num_gt = gt_bboxes.size(0) + + repeated_x = priors[:, 0].unsqueeze(1).repeat(1, num_gt) + repeated_y = priors[:, 1].unsqueeze(1).repeat(1, num_gt) + repeated_stride_x = priors[:, 2].unsqueeze(1).repeat(1, num_gt) + repeated_stride_y = priors[:, 3].unsqueeze(1).repeat(1, num_gt) + + # is prior centers in gt bboxes, shape: [n_prior, n_gt] + l_ = repeated_x - gt_bboxes[:, 0] + t_ = repeated_y - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - repeated_x + b_ = gt_bboxes[:, 3] - repeated_y + + deltas = torch.stack([l_, t_, r_, b_], dim=1) + is_in_gts = deltas.min(dim=1).values > 0 + is_in_gts_all = is_in_gts.sum(dim=1) > 0 + + # is prior centers in gt centers + gt_cxs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 + gt_cys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 + ct_box_l = gt_cxs - self.center_radius * repeated_stride_x + ct_box_t = gt_cys - self.center_radius * repeated_stride_y + ct_box_r = gt_cxs + self.center_radius * repeated_stride_x + ct_box_b = gt_cys + self.center_radius * repeated_stride_y + + cl_ = repeated_x - ct_box_l + ct_ = repeated_y - ct_box_t + cr_ = ct_box_r - repeated_x + cb_ = ct_box_b - repeated_y + + ct_deltas = torch.stack([cl_, ct_, cr_, cb_], dim=1) + is_in_cts = ct_deltas.min(dim=1).values > 0 + is_in_cts_all = is_in_cts.sum(dim=1) > 0 + + # in boxes or in centers, shape: [num_priors] + is_in_gts_or_centers = is_in_gts_all | is_in_cts_all + + # both in boxes and centers, shape: [num_fg, num_gt] + is_in_boxes_and_centers = ( + is_in_gts[is_in_gts_or_centers, :] + & is_in_cts[is_in_gts_or_centers, :]) + return is_in_gts_or_centers, is_in_boxes_and_centers + + def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + num_gt: int, + valid_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets.""" + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + # get foreground mask inside box 
and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(1)[fg_mask_inboxes] + return matched_pred_ious, matched_gt_inds diff --git a/mmpose/models/task_modules/prior_generators/__init__.py b/mmpose/models/task_modules/prior_generators/__init__.py new file mode 100644 index 0000000000..e153da8447 --- /dev/null +++ b/mmpose/models/task_modules/prior_generators/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mlvl_point_generator import MlvlPointGenerator # noqa diff --git a/mmpose/models/task_modules/prior_generators/mlvl_point_generator.py b/mmpose/models/task_modules/prior_generators/mlvl_point_generator.py new file mode 100644 index 0000000000..7dc6a6199b --- /dev/null +++ b/mmpose/models/task_modules/prior_generators/mlvl_point_generator.py @@ -0,0 +1,245 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import numpy as np +import torch +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmpose.registry import TASK_UTILS + +DeviceType = Union[str, torch.device] + + +@TASK_UTILS.register_module() +class MlvlPointGenerator: + """Standard points generator for multi-level (Mlvl) feature maps in 2D + points-based detectors. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + offset (float): The offset of points, the value is normalized with + corresponding stride. Defaults to 0.5. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + offset: float = 0.5) -> None: + self.strides = [_pair(stride) for stride in strides] + self.offset = offset + + @property + def num_levels(self) -> int: + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + @property + def num_base_priors(self) -> List[int]: + """list[int]: The number of priors (points) at a point + on the feature grid""" + return [1 for _ in range(len(self.strides))] + + def _meshgrid(self, + x: Tensor, + y: Tensor, + row_major: bool = True) -> Tuple[Tensor, Tensor]: + yy, xx = torch.meshgrid(y, x) + if row_major: + # warning .flatten() would cause error in ONNX exporting + # have to use reshape here + return xx.reshape(-1), yy.reshape(-1) + + else: + return yy.reshape(-1), xx.reshape(-1) + + def grid_priors(self, + featmap_sizes: List[Tuple], + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda', + with_stride: bool = False) -> List[Tensor]: + """Generate grid points of multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32. + device (str | torch.device): The device where the anchors will be + put on. + with_stride (bool): Whether to concatenate the stride to + the last dimension of points. + + Return: + list[torch.Tensor]: Points of multiple feature levels. + The sizes of each tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). 
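+
+        Example (an illustrative sketch; assumes two levels with strides
+            8 and 16 and CPU tensors):
+
+            >>> generator = MlvlPointGenerator(strides=[8, 16])
+            >>> priors = generator.grid_priors([(2, 2), (1, 1)],
+            ...                                device='cpu')
+            >>> priors[0].shape, priors[1].shape
+            (torch.Size([4, 2]), torch.Size([1, 2]))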
+ """ + + assert self.num_levels == len(featmap_sizes) + multi_level_priors = [] + for i in range(self.num_levels): + priors = self.single_level_grid_priors( + featmap_sizes[i], + level_idx=i, + dtype=dtype, + device=device, + with_stride=with_stride) + multi_level_priors.append(priors) + return multi_level_priors + + def single_level_grid_priors(self, + featmap_size: Tuple[int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda', + with_stride: bool = False) -> Tensor: + """Generate grid Points of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps, arrange as + (h, w). + level_idx (int): The index of corresponding feature map level. + dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. + with_stride (bool): Concatenate the stride to the last dimension + of points. + + Return: + Tensor: Points of single feature levels. + The shape of tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = (torch.arange(0, feat_w, device=device) + + self.offset) * stride_w + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_x = shift_x.to(dtype) + + shift_y = (torch.arange(0, feat_h, device=device) + + self.offset) * stride_h + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_y = shift_y.to(dtype) + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + if not with_stride: + shifts = torch.stack([shift_xx, shift_yy], dim=-1) + else: + # use `shape[0]` instead of `len(shift_xx)` for ONNX export + stride_w = shift_xx.new_full((shift_xx.shape[0], ), + stride_w).to(dtype) + stride_h = shift_xx.new_full((shift_yy.shape[0], ), + stride_h).to(dtype) + shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h], + dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, + featmap_sizes: List[Tuple[int, int]], + pad_shape: Tuple[int], + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate valid flags of points of multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + pad_shape (tuple(int)): The padded shape of the image, + arrange as (h, w). + device (str | torch.device): The device where the anchors will be + put on. + + Return: + list(torch.Tensor): Valid flags of points of multiple levels. 
+ """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + point_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size: Tuple[int, int], + valid_size: Tuple[int, int], + device: DeviceType = 'cuda') -> Tensor: + """Generate the valid flags of points of a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange as + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + The size arrange as as (h, w). + device (str | torch.device): The device where the flags will be + put on. Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each points in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + def sparse_priors(self, + prior_idxs: Tensor, + featmap_size: Tuple[int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate sparse points according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int]): feature map size arrange as (w, h). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points. Defaults to + ``torch.float32``. + device (str | torch.device): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 2), N should be equal to + the length of ``prior_idxs``. And last dimension + 2 represent (coord_x, coord_y). + """ + height, width = featmap_size + x = (prior_idxs % width + self.offset) * self.strides[level_idx][0] + y = ((prior_idxs // width) % height + + self.offset) * self.strides[level_idx][1] + prioris = torch.stack([x, y], 1).to(dtype) + prioris = prioris.to(device) + return prioris diff --git a/mmpose/models/utils/__init__.py b/mmpose/models/utils/__init__.py index 22d8a89b41..545fc4c64d 100644 --- a/mmpose/models/utils/__init__.py +++ b/mmpose/models/utils/__init__.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. from .check_and_update_config import check_and_update_config from .ckpt_convert import pvt_convert +from .csp_layer import CSPLayer +from .misc import filter_scores_and_topk from .rtmcc_block import RTMCCBlock, rope from .transformer import PatchEmbed, nchw_to_nlc, nlc_to_nchw __all__ = [ 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'pvt_convert', 'RTMCCBlock', - 'rope', 'check_and_update_config' + 'rope', 'check_and_update_config', 'filter_scores_and_topk', 'CSPLayer' ] diff --git a/mmpose/models/utils/csp_layer.py b/mmpose/models/utils/csp_layer.py new file mode 100644 index 0000000000..071e1209a2 --- /dev/null +++ b/mmpose/models/utils/csp_layer.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+from mmengine.utils import digit_version
+from torch import Tensor
+
+from mmpose.utils.typing import ConfigType, OptConfigType, OptMultiConfig
+
+
+class ChannelAttention(BaseModule):
+    """Channel attention Module.
+
+    Args:
+        channels (int): The input (and output) channels of the attention
+            layer.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self, channels: int, init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
+        if digit_version(torch.__version__) < (1, 7, 0):
+            self.act = nn.Hardsigmoid()
+        else:
+            self.act = nn.Hardsigmoid(inplace=True)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function for ChannelAttention."""
+        # disable AMP autocast to avoid overflow
+        with torch.cuda.amp.autocast(enabled=False):
+            out = self.global_avgpool(x)
+        out = self.fc(out)
+        out = self.act(out)
+        return x * out
+
+
+class DarknetBottleneck(BaseModule):
+    """The basic bottleneck block used in Darknet.
+
+    Each ResBlock consists of two ConvModules and the input is added to the
+    final output. Each ConvModule is composed of a conv layer, a norm layer
+    and an activation layer. The first conv layer has a 1x1 kernel and the
+    second one a 3x3 kernel.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        expansion (float): The expand ratio of the hidden channels.
+            Defaults to 0.5.
+        add_identity (bool): Whether to add identity to the out.
+            Defaults to True.
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Defaults to False.
+        conv_cfg (dict): Config dict for convolution layer. Defaults to None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='Swish').
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 expansion: float = 0.5,
+                 add_identity: bool = True,
+                 use_depthwise: bool = False,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='BN', momentum=0.03, eps=0.001),
+                 act_cfg: ConfigType = dict(type='Swish'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        hidden_channels = int(out_channels * expansion)
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+        self.conv1 = ConvModule(
+            in_channels,
+            hidden_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.conv2 = conv(
+            hidden_channels,
+            out_channels,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.add_identity = \
+            add_identity and in_channels == out_channels
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function."""
+        identity = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+
+        if self.add_identity:
+            return out + identity
+        else:
+            return out
+
+
+class CSPNeXtBlock(BaseModule):
+    """The basic bottleneck block used in CSPNeXt.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        expansion (float): Expand ratio of the hidden channel. Defaults
+            to 0.5.
+        add_identity (bool): Whether to add identity to the out. Only works
+            when in_channels == out_channels.
Defaults to True. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + kernel_size (int): The kernel size of the second convolution layer. + Defaults to 5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU'). + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + add_identity: bool = True, + use_depthwise: bool = False, + kernel_size: int = 5, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.conv1 = conv( + in_channels, + hidden_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = DepthwiseSeparableConvModule( + hidden_channels, + out_channels, + kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + identity = x + out = self.conv1(x) + out = self.conv2(out) + + if self.add_identity: + return out + identity + else: + return out + + +class CSPLayer(BaseModule): + """Cross Stage Partial Layer. + + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + num_blocks (int): Number of blocks. Defaults to 1. + add_identity (bool): Whether to add identity in blocks. + Defaults to True. + use_cspnext_block (bool): Whether to use CSPNeXt block. + Defaults to False. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish') + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. 
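+
+    Example (an illustrative shape check; assumes the default
+        Darknet-style blocks):
+
+        >>> import torch
+        >>> layer = CSPLayer(32, 64, num_blocks=2)
+        >>> layer(torch.randn(1, 32, 16, 16)).shape
+        torch.Size([1, 64, 16, 16])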
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + num_blocks: int = 1, + add_identity: bool = True, + use_depthwise: bool = False, + use_cspnext_block: bool = False, + channel_attention: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck + mid_channels = int(out_channels * expand_ratio) + self.channel_attention = channel_attention + self.main_conv = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.short_conv = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.final_conv = ConvModule( + 2 * mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.Sequential(*[ + block( + mid_channels, + mid_channels, + 1.0, + add_identity, + use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) for _ in range(num_blocks) + ]) + if channel_attention: + self.attention = ChannelAttention(2 * mid_channels) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + x_short = self.short_conv(x) + + x_main = self.main_conv(x) + x_main = self.blocks(x_main) + + x_final = torch.cat((x_main, x_short), dim=1) + + if self.channel_attention: + x_final = self.attention(x_final) + return self.final_conv(x_final) diff --git a/mmpose/models/utils/misc.py b/mmpose/models/utils/misc.py new file mode 100644 index 0000000000..347c521709 --- /dev/null +++ b/mmpose/models/utils/misc.py @@ -0,0 +1,76 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial + +import torch +from six.moves import map, zip + + +def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. + + Args: + func (Function): A function that will be applied to a list of + arguments + + Returns: + tuple(list): A tuple containing multiple list, each list contains + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def filter_scores_and_topk(scores, score_thr, topk, results=None): + """Filter results using score threshold and topk candidates. + + Args: + scores (Tensor): The scores, shape (num_bboxes, K). + score_thr (float): The score filter threshold. + topk (int): The number of topk candidates. + results (dict or list or Tensor, Optional): The results to + which the filtering rule is to be applied. The shape + of each item is (num_bboxes, N). + + Returns: + tuple: Filtered results + + - scores (Tensor): The scores after being filtered, \ + shape (num_bboxes_filtered, ). + - labels (Tensor): The class labels, shape \ + (num_bboxes_filtered, ). + - anchor_idxs (Tensor): The anchor indexes, shape \ + (num_bboxes_filtered, ). + - filtered_results (dict or list or Tensor, Optional): \ + The filtered results. The shape of each item is \ + (num_bboxes_filtered, N). 
+ """ + valid_mask = scores > score_thr + scores = scores[valid_mask] + valid_idxs = torch.nonzero(valid_mask) + + num_topk = min(topk, valid_idxs.size(0)) + # torch.sort is actually faster than .topk (at least on GPUs) + scores, idxs = scores.sort(descending=True) + scores = scores[:num_topk] + topk_idxs = valid_idxs[idxs[:num_topk]] + keep_idxs, labels = topk_idxs.unbind(dim=1) + + filtered_results = None + if results is not None: + if isinstance(results, dict): + filtered_results = {k: v[keep_idxs] for k, v in results.items()} + elif isinstance(results, list): + filtered_results = [result[keep_idxs] for result in results] + elif isinstance(results, torch.Tensor): + filtered_results = results[keep_idxs] + else: + raise NotImplementedError(f'Only supports dict or list or Tensor, ' + f'but get {type(results)}.') + return scores, labels, keep_idxs, filtered_results diff --git a/mmpose/registry.py b/mmpose/registry.py index 3e8ab4f544..84903eaf2d 100644 --- a/mmpose/registry.py +++ b/mmpose/registry.py @@ -91,7 +91,7 @@ PARAM_SCHEDULERS = Registry( 'parameter scheduler', parent=MMENGINE_PARAM_SCHEDULERS, - locations=['mmpose.engine']) + locations=['mmpose.engine.schedulers']) # manage all kinds of metrics METRICS = Registry( @@ -104,7 +104,9 @@ # manage task-specific modules like anchor generators and box coders TASK_UTILS = Registry( - 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmpose.models']) + 'task util', + parent=MMENGINE_TASK_UTILS, + locations=['mmpose.models.task_modules']) # Registries For Visualizer and the related # manage visualizer diff --git a/mmpose/structures/__init__.py b/mmpose/structures/__init__.py index e4384af1cd..15c3e2d278 100644 --- a/mmpose/structures/__init__.py +++ b/mmpose/structures/__init__.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .bbox import (bbox_cs2xywh, bbox_cs2xyxy, bbox_xywh2cs, bbox_xywh2xyxy, - bbox_xyxy2cs, bbox_xyxy2xywh, flip_bbox, - get_udp_warp_matrix, get_warp_matrix) -from .keypoint import flip_keypoints +from .bbox import (bbox_clip_border, bbox_corner2xyxy, bbox_cs2xywh, + bbox_cs2xyxy, bbox_xywh2cs, bbox_xywh2xyxy, + bbox_xyxy2corner, bbox_xyxy2cs, bbox_xyxy2xywh, flip_bbox, + get_pers_warp_matrix, get_udp_warp_matrix, get_warp_matrix) +from .keypoint import flip_keypoints, keypoint_clip_border from .multilevel_pixel_data import MultilevelPixelData from .pose_data_sample import PoseDataSample from .utils import merge_data_samples, revert_heatmap, split_instances @@ -11,5 +12,7 @@ 'PoseDataSample', 'MultilevelPixelData', 'bbox_cs2xywh', 'bbox_cs2xyxy', 'bbox_xywh2cs', 'bbox_xywh2xyxy', 'bbox_xyxy2cs', 'bbox_xyxy2xywh', 'flip_bbox', 'get_udp_warp_matrix', 'get_warp_matrix', 'flip_keypoints', - 'merge_data_samples', 'revert_heatmap', 'split_instances' + 'merge_data_samples', 'revert_heatmap', 'split_instances', + 'keypoint_clip_border', 'bbox_clip_border', 'bbox_xyxy2corner', + 'bbox_corner2xyxy', 'get_pers_warp_matrix' ] diff --git a/mmpose/structures/bbox/__init__.py b/mmpose/structures/bbox/__init__.py index a3e723918c..abd3d5f2d9 100644 --- a/mmpose/structures/bbox/__init__.py +++ b/mmpose/structures/bbox/__init__.py @@ -1,10 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .transforms import (bbox_cs2xywh, bbox_cs2xyxy, bbox_xywh2cs, - bbox_xywh2xyxy, bbox_xyxy2cs, bbox_xyxy2xywh, - flip_bbox, get_udp_warp_matrix, get_warp_matrix) +from .bbox_overlaps import bbox_overlaps +from .transforms import (bbox_clip_border, bbox_corner2xyxy, bbox_cs2xywh, + bbox_cs2xyxy, bbox_xywh2cs, bbox_xywh2xyxy, + bbox_xyxy2corner, bbox_xyxy2cs, bbox_xyxy2xywh, + flip_bbox, get_pers_warp_matrix, get_udp_warp_matrix, + get_warp_matrix) __all__ = [ 'bbox_cs2xywh', 'bbox_cs2xyxy', 'bbox_xywh2cs', 'bbox_xywh2xyxy', 'bbox_xyxy2cs', 'bbox_xyxy2xywh', 'flip_bbox', 'get_udp_warp_matrix', - 'get_warp_matrix' + 'get_warp_matrix', 'bbox_overlaps', 'bbox_clip_border', 'bbox_xyxy2corner', + 'bbox_corner2xyxy', 'get_pers_warp_matrix' ] diff --git a/mmpose/structures/bbox/bbox_overlaps.py b/mmpose/structures/bbox/bbox_overlaps.py new file mode 100644 index 0000000000..682008c337 --- /dev/null +++ b/mmpose/structures/bbox/bbox_overlaps.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def fp16_clamp(x, min_val=None, max_val=None): + if not x.is_cuda and x.dtype == torch.float16: + return x.float().clamp(min_val, max_val).half() + return x.clamp(min_val, max_val) + + +def bbox_overlaps(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + eps=1e-6) -> torch.Tensor: + """Calculate overlap between two sets of bounding boxes. + + Args: + bboxes1 (torch.Tensor): Bounding boxes of shape (..., m, 4) or empty. + bboxes2 (torch.Tensor): Bounding boxes of shape (..., n, 4) or empty. + mode (str): "iou" (intersection over union), + "iof" (intersection over foreground), + or "giou" (generalized intersection over union). + Defaults to "iou". + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. + eps (float, optional): A small constant added to the denominator for + numerical stability. Default 1e-6. + + Returns: + torch.Tensor: Overlap values of shape (..., m, n) if is_aligned is + False, else shape (..., m). 
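+
+    Note:
+        ``'iou'`` and ``'iof'`` values lie in ``[0, 1]``, while ``'giou'``
+        values lie in ``[-1, 1]``.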
+ + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + """ + assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}' + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + + if bboxes1.ndim == 1: + bboxes1 = bboxes1.unsqueeze(0) + if bboxes2.ndim == 1: + bboxes2 = bboxes2.unsqueeze(0) + + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( + bboxes1[..., 3] - bboxes1[..., 1]) + area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( + bboxes2[..., 3] - bboxes2[..., 1]) + + if is_aligned: + lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) + rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) + wh = fp16_clamp(rb - lt, min_val=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2]) + enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:]) + else: + lt = torch.max(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) + rb = torch.min(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) + wh = fp16_clamp(rb - lt, min_val=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + else: + union = area1[..., None] + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) + enclosed_rb = torch.max(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) + + eps_tensor = union.new_tensor([eps]) + union = torch.max(union, eps_tensor) + ious = overlap / union + if mode in ['iou', 'iof']: + return ious + elif mode == 'giou': + enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min_val=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] + enclose_area = torch.max(enclose_area, eps_tensor) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git a/mmpose/structures/bbox/transforms.py b/mmpose/structures/bbox/transforms.py index 11524abc1e..7ddd821ace 100644 --- a/mmpose/structures/bbox/transforms.py +++ b/mmpose/structures/bbox/transforms.py @@ -63,9 +63,8 @@ def bbox_xyxy2cs(bbox: np.ndarray, if dim == 1: bbox = bbox[None, :] - x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3]) - center = np.hstack([x1 + x2, y1 + y2]) * 0.5 - scale = np.hstack([x2 - x1, y2 - y1]) * padding + scale = (bbox[..., 2:] - bbox[..., :2]) * padding + center = (bbox[..., 2:] + bbox[..., :2]) * 0.5 if dim == 1: center = center[0] @@ -172,6 +171,103 @@ def bbox_cs2xywh(center: np.ndarray, return bbox +def bbox_xyxy2corner(bbox: np.ndarray): + """Convert bounding boxes from xyxy format to corner format. 
+ + Given a numpy array containing bounding boxes in the format + (xmin, ymin, xmax, ymax), this function converts the bounding + boxes to the corner format, where each box is represented by four + corner points (top-left, top-right, bottom-right, bottom-left). + + Args: + bbox (numpy.ndarray): Input array of shape (N, 4) representing + N bounding boxes. + + Returns: + numpy.ndarray: An array of shape (N, 4, 2) containing the corner + points of the bounding boxes. + + Example: + bbox = np.array([[0, 0, 100, 50], [10, 20, 200, 150]]) + corners = bbox_xyxy2corner(bbox) + """ + dim = bbox.ndim + if dim == 1: + bbox = bbox[None] + + bbox = np.tile(bbox, 2).reshape(-1, 4, 2) + bbox[:, 1:3, 0] = bbox[:, 0:2, 0] + + if dim == 1: + bbox = bbox[0] + + return bbox + + +def bbox_corner2xyxy(bbox: np.ndarray): + """Convert bounding boxes from corner format to xyxy format. + + Given a numpy array containing bounding boxes in the corner + format (four corner points for each box), this function converts + the bounding boxes to the (xmin, ymin, xmax, ymax) format. + + Args: + bbox (numpy.ndarray): Input array of shape (N, 4, 2) representing + N bounding boxes. + + Returns: + numpy.ndarray: An array of shape (N, 4) containing the bounding + boxes in xyxy format. + + Example: + corners = np.array([[[0, 0], [100, 0], [100, 50], [0, 50]], + [[10, 20], [200, 20], [200, 150], [10, 150]]]) + bbox = bbox_corner2xyxy(corners) + """ + if bbox.shape[-1] == 8: + bbox = bbox.reshape(*bbox.shape[:-1], 4, 2) + + dim = bbox.ndim + if dim == 2: + bbox = bbox[None] + + bbox = np.concatenate((bbox.min(axis=1), bbox.max(axis=1)), axis=1) + + if dim == 2: + bbox = bbox[0] + + return bbox + + +def bbox_clip_border(bbox: np.ndarray, shape: Tuple[int, int]) -> np.ndarray: + """Clip bounding box coordinates to fit within a specified shape. + + Args: + bbox (np.ndarray): Bounding box coordinates of shape (..., 4) + or (..., 2). + shape (Tuple[int, int]): Shape of the image to which bounding + boxes are being clipped in the format of (w, h) + + Returns: + np.ndarray: Clipped bounding box coordinates. + + Example: + >>> bbox = np.array([[10, 20, 30, 40], [40, 50, 80, 90]]) + >>> shape = (50, 50) # Example image shape + >>> clipped_bbox = bbox_clip_border(bbox, shape) + """ + width, height = shape[:2] + + if bbox.shape[-1] == 2: + bbox[..., 0] = np.clip(bbox[..., 0], a_min=0, a_max=width) + bbox[..., 1] = np.clip(bbox[..., 1], a_min=0, a_max=height) + else: + bbox[..., ::2] = np.clip(bbox[..., ::2], a_min=0, a_max=width) + bbox[..., 1::2] = np.clip(bbox[..., 1::2], a_min=0, a_max=height) + + return bbox + + def flip_bbox(bbox: np.ndarray, image_size: Tuple[int, int], bbox_format: str = 'xywh', @@ -328,6 +424,61 @@ def get_warp_matrix(center: np.ndarray, return warp_mat +def get_pers_warp_matrix(center: np.ndarray, translate: np.ndarray, + scale: float, rot: float, + shear: np.ndarray) -> np.ndarray: + """Compute a perspective warp matrix based on specified transformations. + + Args: + center (np.ndarray): Center of the transformation. + translate (np.ndarray): Translation vector. + scale (float): Scaling factor. + rot (float): Rotation angle in degrees. + shear (np.ndarray): Shearing angles in degrees along x and y axes. + + Returns: + np.ndarray: Perspective warp matrix. 
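+
+    Note:
+        The returned matrix composes the individual transforms as
+        ``translate_mat @ shear_mat @ rotate_mat @ scale_mat @
+        recover_center_mat``, i.e. points are first shifted by ``-center``,
+        then scaled, rotated and sheared, and finally translated by
+        ``translate + center``.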
+ + Example: + >>> center = np.array([0, 0]) + >>> translate = np.array([10, 20]) + >>> scale = 1.2 + >>> rot = 30.0 + >>> shear = np.array([15.0, 0.0]) + >>> warp_matrix = get_pers_warp_matrix(center, translate, + scale, rot, shear) + """ + translate_mat = np.array([[1, 0, translate[0] + center[0]], + [0, 1, translate[1] + center[1]], [0, 0, 1]], + dtype=np.float32) + + shear_x = math.radians(shear[0]) + shear_y = math.radians(shear[1]) + shear_mat = np.array([[1, np.tan(shear_x), 0], [np.tan(shear_y), 1, 0], + [0, 0, 1]], + dtype=np.float32) + + rotate_angle = math.radians(rot) + rotate_mat = np.array([[np.cos(rotate_angle), -np.sin(rotate_angle), 0], + [np.sin(rotate_angle), + np.cos(rotate_angle), 0], [0, 0, 1]], + dtype=np.float32) + + scale_mat = np.array([[scale, 0, 0], [0, scale, 0], [0, 0, 1]], + dtype=np.float32) + + recover_center_mat = np.array([[1, 0, -center[0]], [0, 1, -center[1]], + [0, 0, 1]], + dtype=np.float32) + + warp_matrix = np.dot( + np.dot( + np.dot(np.dot(translate_mat, shear_mat), rotate_mat), scale_mat), + recover_center_mat) + + return warp_matrix + + def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray: """Rotate a point by an angle. diff --git a/mmpose/structures/keypoint/__init__.py b/mmpose/structures/keypoint/__init__.py index 12ee96cf7c..f4969d3283 100644 --- a/mmpose/structures/keypoint/__init__.py +++ b/mmpose/structures/keypoint/__init__.py @@ -1,5 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .transforms import flip_keypoints, flip_keypoints_custom_center +from .transforms import (flip_keypoints, flip_keypoints_custom_center, + keypoint_clip_border) -__all__ = ['flip_keypoints', 'flip_keypoints_custom_center'] +__all__ = [ + 'flip_keypoints', 'flip_keypoints_custom_center', 'keypoint_clip_border' +] diff --git a/mmpose/structures/keypoint/transforms.py b/mmpose/structures/keypoint/transforms.py index bd7274dadf..b4a2aff925 100644 --- a/mmpose/structures/keypoint/transforms.py +++ b/mmpose/structures/keypoint/transforms.py @@ -121,3 +121,33 @@ def flip_keypoints_custom_center(keypoints: np.ndarray, # Flip horizontally keypoints_flipped[..., 0] = x_c * 2 - keypoints_flipped[..., 0] return keypoints_flipped, keypoints_visible_flipped + + +def keypoint_clip_border(keypoints: np.ndarray, keypoints_visible: np.ndarray, + shape: Tuple[int, + int]) -> Tuple[np.ndarray, np.ndarray]: + """Set the visibility values for keypoints outside the image border. + + Args: + keypoints (np.ndarray): Input keypoints coordinates. + keypoints_visible (np.ndarray): Visibility values of keypoints. + shape (Tuple[int, int]): Shape of the image to which keypoints are + being clipped in the format of (w, h). + + Note: + This function sets the visibility values of keypoints that fall outside + the specified frame border to zero (0.0). 
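+
+    Example (an illustrative sketch with one instance and two keypoints):
+
+        >>> import numpy as np
+        >>> kpts = np.array([[[10., 10.], [60., 10.]]])
+        >>> vis = np.array([[1., 1.]])
+        >>> _, vis = keypoint_clip_border(kpts, vis, (50, 50))
+        >>> vis.tolist()
+        [[1.0, 0.0]]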
+ """ + width, height = shape[:2] + + # Create a mask for keypoints outside the frame + outside_mask = ((keypoints[..., 0] > width) | (keypoints[..., 0] < 0) | + (keypoints[..., 1] > height) | (keypoints[..., 1] < 0)) + + # Update visibility values for keypoints outside the frame + if keypoints_visible.ndim == 2: + keypoints_visible[outside_mask] = 0.0 + elif keypoints_visible.ndim == 3: + keypoints_visible[outside_mask, 0] = 0.0 + + return keypoints, keypoints_visible diff --git a/mmpose/utils/__init__.py b/mmpose/utils/__init__.py index c48ca01cea..fb9c018ed0 100644 --- a/mmpose/utils/__init__.py +++ b/mmpose/utils/__init__.py @@ -2,6 +2,7 @@ from .camera import SimpleCamera, SimpleCameraTorch from .collect_env import collect_env from .config_utils import adapt_mmdet_pipeline +from .dist_utils import reduce_mean from .logger import get_root_logger from .setup_env import register_all_modules, setup_multi_processes from .timer import StopWatch @@ -9,5 +10,5 @@ __all__ = [ 'get_root_logger', 'collect_env', 'StopWatch', 'setup_multi_processes', 'register_all_modules', 'SimpleCamera', 'SimpleCameraTorch', - 'adapt_mmdet_pipeline' + 'adapt_mmdet_pipeline', 'reduce_mean' ] diff --git a/mmpose/utils/dist_utils.py b/mmpose/utils/dist_utils.py new file mode 100644 index 0000000000..915f92585a --- /dev/null +++ b/mmpose/utils/dist_utils.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.distributed as dist + + +def reduce_mean(tensor): + """"Obtain the mean of tensor on different GPUs.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor diff --git a/mmpose/utils/tensor_utils.py b/mmpose/utils/tensor_utils.py index 1be73f8991..755e26854c 100644 --- a/mmpose/utils/tensor_utils.py +++ b/mmpose/utils/tensor_utils.py @@ -29,6 +29,9 @@ def to_numpy(x: Union[Tensor, Sequence[Tensor]], if isinstance(x, Tensor): arrays = x.detach().cpu().numpy() device = x.device + elif isinstance(x, np.ndarray) or is_seq_of(x, np.ndarray): + arrays = x + device = 'cpu' elif is_seq_of(x, Tensor): if unzip: # convert (A, B) -> [(A[0], B[0]), (A[1], B[1]), ...] diff --git a/model-index.yml b/model-index.yml index 52c4a1adb8..8dc3f25054 100644 --- a/model-index.yml +++ b/model-index.yml @@ -77,6 +77,7 @@ Import: - configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.yml - configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.yml - configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.yml +- configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml - configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.yml - configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml - configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.yml diff --git a/projects/yolox_pose/README.md b/projects/yolox_pose/README.md index 264b65fe9f..35a830487c 100644 --- a/projects/yolox_pose/README.md +++ b/projects/yolox_pose/README.md @@ -4,16 +4,24 @@ This project implements a YOLOX-based human pose estimator, utilizing the approa
+📌 For improved performance and compatibility, **consider using the YOLOX-Pose implementation built into MMPose**, which integrates seamlessly with MMPose's tools. To learn more about adopting YOLOX-Pose in your workflow, see the documentation: [YOLOX-Pose](/configs/body_2d_keypoint/yoloxpose/README.md).
+
 ## Usage

 ### Prerequisites

 - Python 3.7 or higher
+
 - PyTorch 1.6 or higher
+
 - [MMEngine](https://github.com/open-mmlab/mmengine) v0.6.0 or higher
+
 - [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 or higher
+
 - [MMDetection](https://github.com/open-mmlab/mmdetection) v3.0.0rc6 or higher
-- [MMYOLO](https://github.com/open-mmlab/mmyolo) v0.5.0 or higher
+
+- [MMYOLO](https://github.com/open-mmlab/mmyolo) **v0.5.0**
+
 - [MMPose](https://github.com/open-mmlab/mmpose) v1.0.0rc1 or higher

 All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. **In `yolox-pose/` root directory**, run the following line to add the current directory to `PYTHONPATH`:
diff --git a/projects/yolox_pose/datasets/__init__.py b/projects/yolox_pose/datasets/__init__.py
index 69bae9de53..abf0d11d23 100644
--- a/projects/yolox_pose/datasets/__init__.py
+++ b/projects/yolox_pose/datasets/__init__.py
@@ -1,3 +1,13 @@
+import mmengine
+import mmyolo
+
+compatible_version = '0.5.0'
+if mmengine.digit_version(mmyolo.__version__) > \
+        mmengine.digit_version(compatible_version):
+    print(f'This project is only compatible with mmyolo {compatible_version} '
+          f'or lower. Please install the required version via: '
+          f'pip install mmyolo=={compatible_version}')
+
 from .bbox_keypoint_structure import *  # noqa
 from .coco_dataset import *  # noqa
 from .transforms import *  # noqa
diff --git a/projects/yolox_pose/models/__init__.py b/projects/yolox_pose/models/__init__.py
index 0d4804e70a..c81450826d 100644
--- a/projects/yolox_pose/models/__init__.py
+++ b/projects/yolox_pose/models/__init__.py
@@ -1,3 +1,13 @@
+import mmengine
+import mmyolo
+
+compatible_version = '0.5.0'
+if mmengine.digit_version(mmyolo.__version__) > \
+        mmengine.digit_version(compatible_version):
+    print(f'This project is only compatible with mmyolo {compatible_version} '
+          f'or lower. Please install the required version via: '
+          f'pip install mmyolo=={compatible_version}')
+
 from .assigner import *  # noqa
 from .data_preprocessor import *  # noqa
 from .oks_loss import *  # noqa
diff --git a/tests/test_codecs/test_annotation_processors.py b/tests/test_codecs/test_annotation_processors.py
new file mode 100644
index 0000000000..4b67cf4f1a
--- /dev/null
+++ b/tests/test_codecs/test_annotation_processors.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
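+#
+# These tests cover the YOLOXPoseAnnotationProcessor codec: with
+# ``expand_bbox=True``, ``encode`` is expected to enlarge each bbox so it
+# also encloses the instance's visible keypoints, and to map the 1-based
+# COCO ``category_id`` values to 0-based ``bbox_labels``.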
+ +from unittest import TestCase + +import numpy as np + +from mmpose.codecs import YOLOXPoseAnnotationProcessor + + +class TestYOLOXPoseAnnotationProcessor(TestCase): + + def test_encode(self): + processor = YOLOXPoseAnnotationProcessor(expand_bbox=True) + + keypoints = np.array([[[0, 1], [2, 6], [4, 5]], [[5, 6], [7, 8], + [8, 9]]]) + keypoints_visible = np.array([[1, 1, 0], [1, 0, 1]]) + bbox = np.array([[0, 1, 3, 4], [1, 2, 5, 6]]) + category_id = [1, 2] + + encoded = processor.encode(keypoints, keypoints_visible, bbox, + category_id) + + self.assertTrue('bbox' in encoded) + self.assertTrue('bbox_labels' in encoded) + self.assertTrue( + np.array_equal(encoded['bbox'], + np.array([[0., 1., 3., 6.], [1., 2., 8., 9.]]))) + self.assertTrue( + np.array_equal(encoded['bbox_labels'], np.array([0, 1]))) + + def test_decode(self): + # make sure the `decode` method has been defined + processor = YOLOXPoseAnnotationProcessor() + _ = processor.decode(dict()) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py index ae00a64393..57031cdacd 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py index de78264dae..1706fba739 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py index 8d63925257..0525e35d02 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py index d7aa46b067..2f27e06698 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py index e93a524611..bdf5f3b807 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py @@ -45,6 +45,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py index f6431af429..2c35c4490a 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py @@ -44,6 +44,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py index ef3cd82dfb..8dabbaa0d5 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py @@ -42,6 +42,7 @@ def check_data_info_keys(self, keypoints=np.ndarray, keypoints_visible=np.ndarray, invalid_segs=list, + area=(list, np.ndarray), id=list) else: raise ValueError(f'Invalid data_mode {data_mode}') diff --git a/tests/test_datasets/test_transforms/test_common_transforms.py b/tests/test_datasets/test_transforms/test_common_transforms.py index ac221c2af5..fe81b9a94c 100644 --- a/tests/test_datasets/test_transforms/test_common_transforms.py +++ b/tests/test_datasets/test_transforms/test_common_transforms.py @@ -4,15 +4,17 @@ from copy import deepcopy from unittest import TestCase +import mmcv import numpy as np from mmcv.transforms import Compose, LoadImageFromFile from mmengine.utils import is_list_of -from mmpose.datasets.transforms import (Albumentation, GenerateTarget, - GetBBoxCenterScale, +from mmpose.datasets.transforms import (Albumentation, FilterAnnotations, + GenerateTarget, GetBBoxCenterScale, PhotometricDistortion, RandomBBoxTransform, RandomFlip, - RandomHalfBody, TopdownAffine) + RandomHalfBody, TopdownAffine, + YOLOXHSVRandomAug) from mmpose.testing import get_coco_sample @@ -600,3 +602,134 @@ def test_errors(self): with self.assertWarnsRegex(DeprecationWarning, '`target_type` is deprecated'): _ = GenerateTarget(encoder=encoder, target_type='heatmap') + + +class TestFilterAnnotations(TestCase): + + def setUp(self): + """Setup the model and optimizer which are used in every test + method.""" + self.results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'bbox': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]]), + 'bbox_score': + np.array([0.9, 0.8, 0.7]), + 'category_id': + np.array([1, 2, 3]), + 'keypoints': + np.array([[15, 15, 1], [25, 25, 1], [45, 45, 1]]), + 'keypoints_visible': + np.array([[1, 1, 0], [1, 1, 1], [1, 1, 1]]), + 'area': + np.array([300, 600, 1200]), + } + + def test_transform(self): + # Test keep_empty = True + transform = FilterAnnotations( + min_gt_bbox_wh=(50, 50), + keep_empty=True, + by_box=True, + ) + results = transform(copy.deepcopy(self.results)) + self.assertIsNone(results) + + # Test keep_empty = False + transform = FilterAnnotations( + min_gt_bbox_wh=(50, 50), + keep_empty=False, + ) + results = transform(copy.deepcopy(self.results)) + 
self.assertTrue(isinstance(results, dict)) + + # Test filter annotations by bbox + transform = FilterAnnotations(min_gt_bbox_wh=(15, 15), by_box=True) + results = transform(copy.deepcopy(self.results)) + print((results['bbox'] == np.array([[20, 20, 40, 40], [40, 40, 80, + 80]])).all()) + self.assertTrue((results['bbox'] == np.array([[20, 20, 40, 40], + [40, 40, 80, + 80]])).all()) + self.assertTrue((results['bbox_score'] == np.array([0.8, 0.7])).all()) + self.assertTrue((results['category_id'] == np.array([2, 3])).all()) + self.assertTrue((results['keypoints'] == np.array([[25, 25, 1], + [45, 45, + 1]])).all()) + self.assertTrue( + (results['keypoints_visible'] == np.array([[1, 1, 1], [1, 1, + 1]])).all()) + self.assertTrue((results['area'] == np.array([600, 1200])).all()) + + # Test filter annotations by area + transform = FilterAnnotations(min_gt_area=1000, by_area=True) + results = transform(copy.deepcopy(self.results)) + self.assertIsInstance(results, dict) + self.assertTrue((results['bbox'] == np.array([[40, 40, 80, + 80]])).all()) + self.assertTrue((results['bbox_score'] == np.array([0.7])).all()) + self.assertTrue((results['category_id'] == np.array([3])).all()) + self.assertTrue((results['keypoints'] == np.array([[45, 45, + 1]])).all()) + self.assertTrue( + (results['keypoints_visible'] == np.array([[1, 1, 1]])).all()) + self.assertTrue((results['area'] == np.array([1200])).all()) + + # Test filter annotations by keypoints visibility + transform = FilterAnnotations(min_kpt_vis=3, by_kpt=True) + results = transform(copy.deepcopy(self.results)) + self.assertIsInstance(results, dict) + self.assertTrue((results['bbox'] == np.array([[20, 20, 40, 40], + [40, 40, 80, + 80]])).all()) + self.assertTrue((results['bbox_score'] == np.array([0.8, 0.7])).all()) + self.assertTrue((results['category_id'] == np.array([2, 3])).all()) + self.assertTrue((results['keypoints'] == np.array([[25, 25, 1], + [45, 45, + 1]])).all()) + self.assertTrue( + (results['keypoints_visible'] == np.array([[1, 1, 1], [1, 1, + 1]])).all()) + self.assertTrue((results['area'] == np.array([600, 1200])).all()) + + +class TestYOLOXHSVRandomAug(TestCase): + + def setUp(self): + """Setup the model and optimizer which are used in every test method. + + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + img = mmcv.imread( + osp.join( + osp.dirname(__file__), '../../data/coco/000000000785.jpg'), + 'color') + self.results = { + 'img': + img, + 'img_shape': (640, 425), + 'category_id': + np.array([1, 2, 3], dtype=np.int64), + 'bbox': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + } + + def test_transform(self): + transform = YOLOXHSVRandomAug() + results = transform(copy.deepcopy(self.results)) + self.assertTrue( + results['img'].shape[:2] == self.results['img'].shape[:2]) + self.assertTrue( + results['category_id'].shape[0] == results['bbox'].shape[0]) + self.assertTrue(results['bbox'].dtype == np.float32) + + def test_repr(self): + transform = YOLOXHSVRandomAug() + self.assertEqual( + repr(transform), ('YOLOXHSVRandomAug(hue_delta=5, ' + 'saturation_delta=30, ' + 'value_delta=30)')) diff --git a/tests/test_datasets/test_transforms/test_mix_img_transform.py b/tests/test_datasets/test_transforms/test_mix_img_transform.py new file mode 100644 index 0000000000..bae26da83a --- /dev/null +++ b/tests/test_datasets/test_transforms/test_mix_img_transform.py @@ -0,0 +1,115 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
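+#
+# These tests exercise the mixed-image transforms used for YOLOX-Pose
+# training: ``Mosaic`` stitches the base sample and three mixed samples
+# into one double-sized canvas (1280x1280 for the default 640x640 scale),
+# while ``YOLOXMixUp`` blends a pair of samples and keeps the original
+# 480x640 resolution.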
+ +from unittest import TestCase + +import numpy as np + +from mmpose.datasets.transforms import Mosaic, YOLOXMixUp + + +class TestMosaic(TestCase): + + def setUp(self): + # Create a sample data dictionary for testing + sample_data = { + 'img': + np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8), + 'bbox': np.random.rand(2, 4), + 'bbox_score': np.random.rand(2, ), + 'category_id': [1, 2], + 'keypoints': np.random.rand(2, 3, 2), + 'keypoints_visible': np.random.rand(2, 3), + 'area': np.random.rand(2, ) + } + mixed_data_list = [sample_data.copy() for _ in range(3)] + sample_data.update({'mixed_data_list': mixed_data_list}) + + self.sample_data = sample_data + + def test_apply_mix(self): + mosaic = Mosaic() + transformed_data = mosaic.apply_mix(self.sample_data) + + # Check if the transformed data has the expected keys + self.assertTrue('img' in transformed_data) + self.assertTrue('img_shape' in transformed_data) + self.assertTrue('bbox' in transformed_data) + self.assertTrue('category_id' in transformed_data) + self.assertTrue('bbox_score' in transformed_data) + self.assertTrue('keypoints' in transformed_data) + self.assertTrue('keypoints_visible' in transformed_data) + self.assertTrue('area' in transformed_data) + + def test_create_mosaic_image(self): + mosaic = Mosaic() + mosaic_img, annos = mosaic._create_mosaic_image( + self.sample_data, self.sample_data['mixed_data_list']) + + # Check if the mosaic image and annotations are generated correctly + self.assertEqual(mosaic_img.shape, (1280, 1280, 3)) + self.assertTrue('bboxes' in annos) + self.assertTrue('bbox_scores' in annos) + self.assertTrue('category_id' in annos) + self.assertTrue('keypoints' in annos) + self.assertTrue('keypoints_visible' in annos) + self.assertTrue('area' in annos) + + def test_mosaic_combine(self): + mosaic = Mosaic() + center = (320, 240) + img_shape = (480, 640) + paste_coord, crop_coord = mosaic._mosaic_combine( + 'top_left', center, img_shape) + + # Check if the coordinates are calculated correctly + self.assertEqual(paste_coord, (0, 0, 320, 240)) + self.assertEqual(crop_coord, (160, 400, 480, 640)) + + +class TestYOLOXMixUp(TestCase): + + def setUp(self): + # Create a sample data dictionary for testing + sample_data = { + 'img': + np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8), + 'bbox': np.random.rand(2, 4), + 'bbox_score': np.random.rand(2, ), + 'category_id': [1, 2], + 'keypoints': np.random.rand(2, 3, 2), + 'keypoints_visible': np.random.rand(2, 3), + 'area': np.random.rand(2, ), + 'flip_indices': [0, 2, 1] + } + mixed_data_list = [sample_data.copy() for _ in range(1)] + sample_data.update({'mixed_data_list': mixed_data_list}) + + self.sample_data = sample_data + + def test_apply_mix(self): + mixup = YOLOXMixUp() + transformed_data = mixup.apply_mix(self.sample_data) + + # Check if the transformed data has the expected keys + self.assertTrue('img' in transformed_data) + self.assertTrue('img_shape' in transformed_data) + self.assertTrue('bbox' in transformed_data) + self.assertTrue('category_id' in transformed_data) + self.assertTrue('bbox_score' in transformed_data) + self.assertTrue('keypoints' in transformed_data) + self.assertTrue('keypoints_visible' in transformed_data) + self.assertTrue('area' in transformed_data) + + def test_create_mixup_image(self): + mixup = YOLOXMixUp() + mixup_img, annos = mixup._create_mixup_image( + self.sample_data, self.sample_data['mixed_data_list']) + + # Check if the mosaic image and annotations are generated correctly + 
+        self.assertEqual(mixup_img.shape, (480, 640, 3))
+        self.assertTrue('bboxes' in annos)
+        self.assertTrue('bbox_scores' in annos)
+        self.assertTrue('category_id' in annos)
+        self.assertTrue('keypoints' in annos)
+        self.assertTrue('keypoints_visible' in annos)
+        self.assertTrue('area' in annos)
diff --git a/tests/test_engine/test_hooks/test_mode_switch_hooks.py b/tests/test_engine/test_hooks/test_mode_switch_hooks.py
new file mode 100644
index 0000000000..fbf10bd3ef
--- /dev/null
+++ b/tests/test_engine/test_hooks/test_mode_switch_hooks.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+from unittest.mock import Mock
+
+import torch
+from mmengine.config import Config
+from mmengine.runner import Runner
+from torch.utils.data import Dataset
+
+from mmpose.engine.hooks import YOLOXPoseModeSwitchHook
+from mmpose.utils import register_all_modules
+
+
+class DummyDataset(Dataset):
+    METAINFO = dict()  # type: ignore
+    data = torch.randn(12, 2)
+    label = torch.ones(12)
+
+    @property
+    def metainfo(self):
+        return self.METAINFO
+
+    def __len__(self):
+        return self.data.size(0)
+
+    def __getitem__(self, index):
+        return dict(inputs=self.data[index], data_sample=self.label[index])
+
+
+pipeline1 = [
+    dict(type='RandomHalfBody'),
+]
+
+pipeline2 = [
+    dict(type='RandomFlip'),
+]
+register_all_modules()
+
+
+class TestYOLOXPoseModeSwitchHook(TestCase):
+
+    def test(self):
+        train_dataloader = dict(
+            dataset=DummyDataset(),
+            sampler=dict(type='DefaultSampler', shuffle=True),
+            batch_size=3,
+            num_workers=0)
+
+        runner = Mock()
+        runner.model = Mock()
+        runner.model.module = Mock()
+
+        runner.model.head.use_aux_loss = False
+        runner.cfg.train_dataloader = Config(train_dataloader)
+        runner.train_dataloader = Runner.build_dataloader(train_dataloader)
+        runner.train_dataloader.dataset.pipeline = pipeline1
+
+        hook = YOLOXPoseModeSwitchHook(
+            num_last_epochs=15, new_train_pipeline=pipeline2)
+
+        # Test that the switch turns on the aux loss and swaps the pipeline
+        runner.epoch = 284
+        runner.max_epochs = 300
+        hook.before_train_epoch(runner)
+        self.assertTrue(runner.model.head.use_aux_loss)
+        self.assertEqual(runner.train_loop.dataloader.dataset.pipeline,
+                         pipeline2)
diff --git a/tests/test_engine/test_hooks/test_sync_norm_hook.py b/tests/test_engine/test_hooks/test_sync_norm_hook.py
new file mode 100644
index 0000000000..f256127fa1
--- /dev/null
+++ b/tests/test_engine/test_hooks/test_sync_norm_hook.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
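+# Editorial note (assumption about intent): the cases below are smoke tests.
+# SyncNormHook synchronizes the buffers of norm layers across ranks before
+# validation, so with a patched world size of 1, of 2, or with a model that
+# contains no norm layers, `before_val_epoch` should simply run without
+# raising.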
+from unittest import TestCase
+from unittest.mock import Mock, patch
+
+import torch.nn as nn
+
+from mmpose.engine.hooks import SyncNormHook
+
+
+class TestSyncNormHook(TestCase):
+
+    @patch(
+        'mmpose.engine.hooks.sync_norm_hook.get_dist_info',
+        return_value=(0, 1))
+    def test_before_val_epoch_non_dist(self, mock):
+        model = nn.Sequential(
+            nn.Conv2d(1, 5, kernel_size=3), nn.BatchNorm2d(5, momentum=0.3),
+            nn.Linear(5, 10))
+        runner = Mock()
+        runner.model = model
+        hook = SyncNormHook()
+        hook.before_val_epoch(runner)
+
+    @patch(
+        'mmpose.engine.hooks.sync_norm_hook.get_dist_info',
+        return_value=(0, 2))
+    def test_before_val_epoch_dist(self, mock):
+        model = nn.Sequential(
+            nn.Conv2d(1, 5, kernel_size=3), nn.BatchNorm2d(5, momentum=0.3),
+            nn.Linear(5, 10))
+        runner = Mock()
+        runner.model = model
+        hook = SyncNormHook()
+        hook.before_val_epoch(runner)
+
+    @patch(
+        'mmpose.engine.hooks.sync_norm_hook.get_dist_info',
+        return_value=(0, 2))
+    def test_before_val_epoch_dist_no_norm(self, mock):
+        model = nn.Sequential(nn.Conv2d(1, 5, kernel_size=3), nn.Linear(5, 10))
+        runner = Mock()
+        runner.model = model
+        hook = SyncNormHook()
+        hook.before_val_epoch(runner)
diff --git a/tests/test_engine/test_schedulers/test_quadratic_warmup.py b/tests/test_engine/test_schedulers/test_quadratic_warmup.py
new file mode 100644
index 0000000000..9f0650b0c2
--- /dev/null
+++ b/tests/test_engine/test_schedulers/test_quadratic_warmup.py
@@ -0,0 +1,107 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from mmengine.optim.scheduler import _ParamScheduler
+from mmengine.testing import assert_allclose
+
+from mmpose.engine.schedulers import (QuadraticWarmupLR,
+                                      QuadraticWarmupMomentum,
+                                      QuadraticWarmupParamScheduler)
+
+
+class ToyModel(torch.nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.conv1 = torch.nn.Conv2d(1, 1, 1)
+        self.conv2 = torch.nn.Conv2d(1, 1, 1)
+
+    def forward(self, x):
+        return self.conv2(F.relu(self.conv1(x)))
+
+
+class TestQuadraticWarmupScheduler(TestCase):
+
+    def setUp(self):
+        """Setup the model and optimizer which are used in every test method.
+
+        TestCase calls functions in this order: setUp() -> testMethod() ->
+        tearDown() -> cleanUp()
+        """
+        self.model = ToyModel()
+        self.optimizer = optim.SGD(
+            self.model.parameters(), lr=0.05, momentum=0.01, weight_decay=5e-4)
+
+    def _test_scheduler_value(self,
+                              schedulers,
+                              targets,
+                              epochs=10,
+                              param_name='lr'):
+        if isinstance(schedulers, _ParamScheduler):
+            schedulers = [schedulers]
+        for epoch in range(epochs):
+            for param_group, target in zip(self.optimizer.param_groups,
+                                           targets):
+                assert_allclose(
+                    target[epoch],
+                    param_group[param_name],
+                    msg='{} is wrong in epoch {}: expected {}, got {}'.format(
+                        param_name, epoch, target[epoch],
+                        param_group[param_name]),
+                    atol=1e-5,
+                    rtol=0)
+            [scheduler.step() for scheduler in schedulers]
+
+    def test_quadratic_warmup_scheduler(self):
+        with self.assertRaises(ValueError):
+            QuadraticWarmupParamScheduler(self.optimizer, param_name='lr')
+        epochs = 10
+        iters = 5
+        warmup_factor = [pow((i + 1) / float(iters), 2) for i in range(iters)]
+        single_targets = [x * 0.05 for x in warmup_factor] + [0.05] * (
+            epochs - iters)
+        targets = [single_targets, [x * epochs for x in single_targets]]
+        scheduler = QuadraticWarmupParamScheduler(
+            self.optimizer, param_name='lr', end=iters)
+        self._test_scheduler_value(scheduler, targets, epochs)
+
+    def test_quadratic_warmup_scheduler_convert_iterbased(self):
+        epochs = 10
+        end = 5
+        epoch_length = 11
+
+        iters = end * epoch_length
+        warmup_factor = [pow((i + 1) / float(iters), 2) for i in range(iters)]
+        single_targets = [x * 0.05 for x in warmup_factor] + [0.05] * (
+            epochs * epoch_length - iters)
+        targets = [single_targets, [x * epochs for x in single_targets]]
+        scheduler = QuadraticWarmupParamScheduler.build_iter_from_epoch(
+            self.optimizer,
+            param_name='lr',
+            end=end,
+            epoch_length=epoch_length)
+        self._test_scheduler_value(scheduler, targets, epochs * epoch_length)
+
+    def test_quadratic_warmup_lr(self):
+        epochs = 10
+        iters = 5
+        warmup_factor = [pow((i + 1) / float(iters), 2) for i in range(iters)]
+        single_targets = [x * 0.05 for x in warmup_factor] + [0.05] * (
+            epochs - iters)
+        targets = [single_targets, [x * epochs for x in single_targets]]
+        scheduler = QuadraticWarmupLR(self.optimizer, end=iters)
+        self._test_scheduler_value(scheduler, targets, epochs)
+
+    def test_quadratic_warmup_momentum(self):
+        epochs = 10
+        iters = 5
+        warmup_factor = [pow((i + 1) / float(iters), 2) for i in range(iters)]
+        single_targets = [x * 0.01 for x in warmup_factor] + [0.01] * (
+            epochs - iters)
+        targets = [single_targets, [x * epochs for x in single_targets]]
+        scheduler = QuadraticWarmupMomentum(self.optimizer, end=iters)
+        self._test_scheduler_value(
+            scheduler, targets, epochs, param_name='momentum')
diff --git a/tests/test_evaluation/test_functional/test_nms.py b/tests/test_evaluation/test_functional/test_nms.py
index b29ed86ccb..34a2533b76 100644
--- a/tests/test_evaluation/test_functional/test_nms.py
+++ b/tests/test_evaluation/test_functional/test_nms.py
@@ -2,8 +2,9 @@
 from unittest import TestCase
 
 import numpy as np
+import torch
 
-from mmpose.evaluation.functional.nms import nearby_joints_nms
+from mmpose.evaluation.functional.nms import nearby_joints_nms, nms_torch
 
 
 class TestNearbyJointsNMS(TestCase):
@@ -38,3 +39,21 @@ def test_nearby_joints_nms(self):
 
         with self.assertRaises(AssertionError):
             _ = nearby_joints_nms(kpts_db, 0.05, num_nearby_joints_thr=3)
+
+
+class TestNMSTorch(TestCase):
+
+    def test_nms_torch(self):
+        bboxes = torch.tensor([[0, 0, 3, 3], [1, 0, 3, 3], [4, 4, 6, 6]],
+                              dtype=torch.float32)
+
+        scores = torch.tensor([0.9, 0.8, 0.7])
+
+        expected_result = torch.tensor([0, 2])
+        result = nms_torch(bboxes, scores, threshold=0.5)
+        self.assertTrue(torch.equal(result, expected_result))
+
+        expected_result = [torch.tensor([0, 1]), torch.tensor([2])]
+        result = nms_torch(bboxes, scores, threshold=0.5, return_group=True)
+        for res_out, res_expected in zip(result, expected_result):
+            self.assertTrue(torch.equal(res_out, res_expected))
diff --git a/tests/test_models/test_backbones/test_csp_darknet.py b/tests/test_models/test_backbones/test_csp_darknet.py
new file mode 100644
index 0000000000..61b200b749
--- /dev/null
+++ b/tests/test_models/test_backbones/test_csp_darknet.py
@@ -0,0 +1,125 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import torch
+from torch.nn.modules import GroupNorm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmpose.models.backbones.csp_darknet import CSPDarknet
+
+
+def is_norm(modules):
+    """Check if is one of the norms."""
+    if isinstance(modules, (GroupNorm, _BatchNorm)):
+        return True
+    return False
+
+
+def check_norm_state(modules, train_state):
+    """Check if norm layer is in correct train state."""
+    for mod in modules:
+        if isinstance(mod, _BatchNorm):
+            if mod.training != train_state:
+                return False
+    return True
+
+
+class TestCSPDarknetBackbone(unittest.TestCase):
+
+    def test_invalid_frozen_stages(self):
+        with self.assertRaises(ValueError):
+            CSPDarknet(frozen_stages=6)
+
+    def test_invalid_out_indices(self):
+        with self.assertRaises(AssertionError):
+            CSPDarknet(out_indices=[6])
+
+    def test_frozen_stages(self):
+        frozen_stages = 1
+        model = CSPDarknet(frozen_stages=frozen_stages)
+        model.train()
+
+        for mod in model.stem.modules():
+            for param in mod.parameters():
+                self.assertFalse(param.requires_grad)
+        for i in range(1, frozen_stages + 1):
+            layer = getattr(model, f'stage{i}')
+            for mod in layer.modules():
+                if isinstance(mod, _BatchNorm):
+                    self.assertFalse(mod.training)
+            for param in layer.parameters():
+                self.assertFalse(param.requires_grad)
+
+    def test_norm_eval(self):
+        model = CSPDarknet(norm_eval=True)
+        model.train()
+
+        self.assertFalse(check_norm_state(model.modules(), True))
+
+    def test_csp_darknet_p5_forward(self):
+        model = CSPDarknet(
+            arch='P5', widen_factor=0.25, out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        self.assertEqual(len(feat), 5)
+        self.assertEqual(feat[0].shape, torch.Size((1, 16, 32, 32)))
+        self.assertEqual(feat[1].shape, torch.Size((1, 32, 16, 16)))
+        self.assertEqual(feat[2].shape, torch.Size((1, 64, 8, 8)))
+        self.assertEqual(feat[3].shape, torch.Size((1, 128, 4, 4)))
+        self.assertEqual(feat[4].shape, torch.Size((1, 256, 2, 2)))
+
+    def test_csp_darknet_p6_forward(self):
+        model = CSPDarknet(
+            arch='P6',
+            widen_factor=0.25,
+            out_indices=range(0, 6),
+            spp_kernal_sizes=(3, 5, 7))
+        model.train()
+
+        imgs = torch.randn(1, 3, 128, 128)
+        feat = model(imgs)
+        self.assertEqual(feat[0].shape, torch.Size((1, 16, 64, 64)))
+        self.assertEqual(feat[1].shape, torch.Size((1, 32, 32, 32)))
+        self.assertEqual(feat[2].shape, torch.Size((1, 64, 16, 16)))
+        self.assertEqual(feat[3].shape, torch.Size((1, 128, 8, 8)))
+        self.assertEqual(feat[4].shape, torch.Size((1, 192, 4, 4)))
+        self.assertEqual(feat[5].shape, torch.Size((1, 256, 2, 2)))
+
+    def test_csp_darknet_custom_arch_forward(self):
+        arch_ovewrite = [[32, 56, 3, True, False], [56, 224, 2, True, False],
+                         [224, 512, 1, True, False]]
+        model = CSPDarknet(
+            arch_ovewrite=arch_ovewrite,
+            widen_factor=0.25,
+            out_indices=(0, 1, 2, 3))
+        model.train()
+
+        imgs = torch.randn(1, 3, 32, 32)
+        feat = model(imgs)
+        self.assertEqual(len(feat), 4)
+        self.assertEqual(feat[0].shape, torch.Size((1, 8, 16, 16)))
+        self.assertEqual(feat[1].shape, torch.Size((1, 14, 8, 8)))
+        self.assertEqual(feat[2].shape, torch.Size((1, 56, 4, 4)))
+        self.assertEqual(feat[3].shape, torch.Size((1, 128, 2, 2)))
+
+    def test_csp_darknet_custom_arch_norm(self):
+        model = CSPDarknet(widen_factor=0.125, out_indices=range(0, 5))
+        for m in model.modules():
+            if is_norm(m):
+                self.assertIsInstance(m, _BatchNorm)
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        self.assertEqual(len(feat), 5)
+        self.assertEqual(feat[0].shape, torch.Size((1, 8, 32, 32)))
+        self.assertEqual(feat[1].shape, torch.Size((1, 16, 16, 16)))
+        self.assertEqual(feat[2].shape, torch.Size((1, 32, 8, 8)))
+        self.assertEqual(feat[3].shape, torch.Size((1, 64, 4, 4)))
+        self.assertEqual(feat[4].shape, torch.Size((1, 128, 2, 2)))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_models/test_data_preprocessors/test_data_preprocessor.py b/tests/test_models/test_data_preprocessors/test_data_preprocessor.py
new file mode 100644
index 0000000000..6c669f55a2
--- /dev/null
+++ b/tests/test_models/test_data_preprocessors/test_data_preprocessor.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+from mmengine.logging import MessageHub
+
+from mmpose.models.data_preprocessors import (BatchSyncRandomResize,
+                                              PoseDataPreprocessor)
+from mmpose.structures import PoseDataSample
+
+
+class TestPoseDataPreprocessor(TestCase):
+
+    def test_init(self):
+        # test mean is None
+        processor = PoseDataPreprocessor()
+        self.assertTrue(not hasattr(processor, 'mean'))
+        self.assertTrue(processor._enable_normalize is False)
+
+        # test mean is not None
+        processor = PoseDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1])
+        self.assertTrue(hasattr(processor, 'mean'))
+        self.assertTrue(hasattr(processor, 'std'))
+        self.assertTrue(processor._enable_normalize)
+
+        # mean and std must be specified together
+        with self.assertRaises(AssertionError):
+            PoseDataPreprocessor(mean=[0, 0, 0])
+
+        # bgr2rgb and rgb2bgr cannot be set to True at the same time
+        with self.assertRaises(AssertionError):
+            PoseDataPreprocessor(bgr_to_rgb=True, rgb_to_bgr=True)
+
+    def test_forward(self):
+        processor = PoseDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1])
+
+        data = {
+            'inputs': [torch.randint(0, 256, (3, 11, 10))],
+            'data_samples': [PoseDataSample()]
+        }
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+
+        self.assertEqual(batch_inputs.shape, (1, 3, 11, 10))
+        self.assertEqual(len(batch_data_samples), 1)
+
+        # test channel_conversion
+        processor = PoseDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs.shape, (1, 3, 11, 10))
+        self.assertEqual(len(batch_data_samples), 1)
+
+        # test padding
+        data = {
+            'inputs': [
+                torch.randint(0, 256, (3, 10, 11)),
+                torch.randint(0, 256, (3, 9, 14))
+            ],
+            'data_samples': [PoseDataSample()] * 2
+        }
+        processor = PoseDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs.shape, (2, 3, 10, 14))
+        self.assertEqual(len(batch_data_samples), 2)
+
+        # test pad_size_divisor
+        data = {
+            'inputs': [
+                torch.randint(0, 256, (3, 10, 11)),
+                torch.randint(0, 256, (3, 9, 24))
+            ],
+            'data_samples': [PoseDataSample()] * 2
+        }
+        processor = PoseDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], pad_size_divisor=5)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs.shape, (2, 3, 10, 25))
+        self.assertEqual(len(batch_data_samples), 2)
+        for data_samples, expected_shape in zip(batch_data_samples,
+                                                [(10, 15), (10, 25)]):
+            self.assertEqual(data_samples.pad_shape, expected_shape)
+
+    def test_batch_sync_random_resize(self):
+        processor = PoseDataPreprocessor(batch_augments=[
+            dict(
+                type='BatchSyncRandomResize',
+                random_size_range=(320, 320),
+                size_divisor=32,
+                interval=1)
+        ])
+        self.assertTrue(
+            isinstance(processor.batch_augments[0], BatchSyncRandomResize))
+        message_hub = MessageHub.get_instance('test_batch_sync_random_resize')
+        message_hub.update_info('iter', 0)
+        packed_inputs = {
+            'inputs': [
+                torch.randint(0, 256, (3, 128, 128)),
+                torch.randint(0, 256, (3, 128, 128))
+            ],
+            'data_samples': [PoseDataSample()] * 2
+        }
+        batch_inputs = processor(packed_inputs, training=True)['inputs']
+        self.assertEqual(batch_inputs.shape, (2, 3, 128, 128))
+
+        # resize after one iter
+        message_hub.update_info('iter', 1)
+        packed_inputs = {
+            'inputs': [
+                torch.randint(0, 256, (3, 128, 128)),
+                torch.randint(0, 256, (3, 128, 128))
+            ],
+            'data_samples':
+            [PoseDataSample(metainfo=dict(img_shape=(128, 128)))] * 2
+        }
+        batch_inputs = processor(packed_inputs, training=True)['inputs']
+        self.assertEqual(batch_inputs.shape, (2, 3, 320, 320))
+
+        packed_inputs = {
+            'inputs': [
+                torch.randint(0, 256, (3, 128, 128)),
+                torch.randint(0, 256, (3, 128, 128))
+            ],
+            'data_samples': [PoseDataSample()] * 2
+        }
+        batch_inputs = processor(packed_inputs, training=False)['inputs']
+        self.assertEqual(batch_inputs.shape, (2, 3, 128, 128))
diff --git a/tests/test_models/test_necks/test_yolox_pafpn.py b/tests/test_models/test_necks/test_yolox_pafpn.py
new file mode 100644
index 0000000000..89eae39a6c
--- /dev/null
+++ b/tests/test_models/test_necks/test_yolox_pafpn.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+
+from mmpose.models.necks import YOLOXPAFPN
+
+
+class TestYOLOXPAFPN(TestCase):
+
+    def test_forward(self):
+        in_channels = [128, 256, 512]
+        out_channels = 256
+        num_csp_blocks = 3
+
+        model = YOLOXPAFPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            num_csp_blocks=num_csp_blocks)
+        model.train()
+
+        inputs = [
+            torch.randn(1, c, 64 // (2**i), 64 // (2**i))
+            for i, c in enumerate(in_channels)
+        ]
+        outputs = model(inputs)
+
+        self.assertEqual(len(outputs), len(in_channels))
+        for out in outputs:
+            self.assertEqual(out.shape[1], out_channels)
diff --git a/tests/test_structures/test_bbox/test_bbox_overlaps.py b/tests/test_structures/test_bbox/test_bbox_overlaps.py
new file mode 100644
index 0000000000..b3523c8af5
--- /dev/null
+++ b/tests/test_structures/test_bbox/test_bbox_overlaps.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
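+# The three modes exercised below, for boxes in (x1, y1, x2, y2) format:
+# 'iou' is intersection over union, 'iof' is intersection over the area of
+# the boxes in the first argument, and 'giou' is generalized IoU, which
+# penalizes the empty area of the smallest enclosing box and can be negative
+# for disjoint boxes, as the expected values illustrate.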
+from unittest import TestCase
+
+import torch
+
+from mmpose.structures.bbox import bbox_overlaps
+
+
+class TestBBoxOverlaps(TestCase):
+
+    def test_bbox_overlaps_iou(self):
+        bboxes1 = torch.FloatTensor([
+            [0, 0, 10, 10],
+            [10, 10, 20, 20],
+            [32, 32, 38, 42],
+        ])
+        bboxes2 = torch.FloatTensor([
+            [0, 0, 10, 20],
+            [0, 10, 10, 19],
+            [10, 10, 20, 20],
+        ])
+        overlaps = bbox_overlaps(bboxes1, bboxes2)
+
+        expected_overlaps = torch.FloatTensor([
+            [0.5000, 0.0000, 0.0000],
+            [0.0000, 0.0000, 1.0000],
+            [0.0000, 0.0000, 0.0000],
+        ])
+
+        self.assertTrue(
+            torch.allclose(overlaps, expected_overlaps, rtol=1e-4, atol=1e-4))
+
+    def test_bbox_overlaps_iof(self):
+        bboxes1 = torch.FloatTensor([
+            [0, 0, 10, 10],
+            [10, 10, 20, 20],
+            [32, 32, 38, 42],
+        ])
+        bboxes2 = torch.FloatTensor([
+            [0, 0, 10, 20],
+            [0, 10, 10, 19],
+            [10, 10, 20, 20],
+        ])
+        overlaps = bbox_overlaps(bboxes1, bboxes2, mode='iof')
+
+        expected_overlaps = torch.FloatTensor([
+            [1., 0., 0.],
+            [0., 0., 1.],
+            [0., 0., 0.],
+        ])
+
+        self.assertTrue(
+            torch.allclose(overlaps, expected_overlaps, rtol=1e-4, atol=1e-4))
+
+    def test_bbox_overlaps_giou(self):
+        bboxes1 = torch.FloatTensor([
+            [0, 0, 10, 10],
+            [10, 10, 20, 20],
+            [32, 32, 38, 42],
+        ])
+        bboxes2 = torch.FloatTensor([
+            [0, 0, 10, 20],
+            [0, 10, 10, 19],
+            [10, 10, 20, 20],
+        ])
+        overlaps = bbox_overlaps(bboxes1, bboxes2, mode='giou')
+
+        expected_overlaps = torch.FloatTensor([
+            [0.5000, 0.0000, -0.5000],
+            [-0.2500, -0.0500, 1.0000],
+            [-0.8371, -0.8766, -0.8214],
+        ])
+
+        self.assertTrue(
+            torch.allclose(overlaps, expected_overlaps, rtol=1e-4, atol=1e-4))
diff --git a/tests/test_structures/test_bbox/test_bbox_transforms.py b/tests/test_structures/test_bbox/test_bbox_transforms.py
new file mode 100644
index 0000000000..b2eb3da683
--- /dev/null
+++ b/tests/test_structures/test_bbox/test_bbox_transforms.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
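+# Conventions covered in this file (as implied by the expectations below):
+# bbox_xyxy2corner expands an (x1, y1, x2, y2) box into four corner points
+# ordered (x1, y1), (x1, y2), (x2, y1), (x2, y2); bbox_corner2xyxy is its
+# inverse; get_pers_warp_matrix composes translation, scale, rotation and
+# shear into a single 3x3 warp matrix.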
+from unittest import TestCase
+
+import numpy as np
+
+from mmpose.structures.bbox import (bbox_clip_border, bbox_corner2xyxy,
+                                    bbox_xyxy2corner, get_pers_warp_matrix)
+
+
+class TestBBoxClipBorder(TestCase):
+
+    def test_bbox_clip_border_2D(self):
+        bbox = np.array([[10, 20], [60, 80], [-5, 25], [100, 120]])
+        shape = (50, 50)  # Example image shape
+        clipped_bbox = bbox_clip_border(bbox, shape)
+
+        expected_bbox = np.array([[10, 20], [50, 50], [0, 25], [50, 50]])
+
+        self.assertTrue(np.array_equal(clipped_bbox, expected_bbox))
+
+    def test_bbox_clip_border_4D(self):
+        bbox = np.array([
+            [[10, 20, 30, 40], [40, 50, 80, 90]],
+            [[-5, 0, 30, 40], [70, 80, 120, 130]],
+        ])
+        shape = (50, 60)  # Example image shape
+        clipped_bbox = bbox_clip_border(bbox, shape)
+
+        expected_bbox = np.array([
+            [[10, 20, 30, 40], [40, 50, 50, 60]],
+            [[0, 0, 30, 40], [50, 60, 50, 60]],
+        ])
+
+        self.assertTrue(np.array_equal(clipped_bbox, expected_bbox))
+
+
+class TestBBoxXYXY2Corner(TestCase):
+
+    def test_bbox_xyxy2corner_single(self):
+        bbox = np.array([0, 0, 100, 50])
+        corners = bbox_xyxy2corner(bbox)
+
+        expected_corners = np.array([[0, 0], [0, 50], [100, 0], [100, 50]])
+
+        self.assertTrue(np.array_equal(corners, expected_corners))
+
+    def test_bbox_xyxy2corner_multiple(self):
+        bboxes = np.array([[0, 0, 100, 50], [10, 20, 200, 150]])
+        corners = bbox_xyxy2corner(bboxes)
+
+        expected_corners = np.array([[[0, 0], [0, 50], [100, 0], [100, 50]],
+                                     [[10, 20], [10, 150], [200, 20],
+                                      [200, 150]]])
+
+        self.assertTrue(np.array_equal(corners, expected_corners))
+
+
+class TestBBoxCorner2XYXY(TestCase):
+
+    def test_bbox_corner2xyxy_single(self):
+
+        corners = np.array([[0, 0], [0, 50], [100, 0], [100, 50]])
+        xyxy = bbox_corner2xyxy(corners)
+        expected_xyxy = np.array([0, 0, 100, 50])
+
+        self.assertTrue(np.array_equal(xyxy, expected_xyxy))
+
+    def test_bbox_corner2xyxy_multiple(self):
+
+        corners = np.array([[[0, 0], [0, 50], [100, 0], [100, 50]],
+                            [[10, 20], [10, 150], [200, 20], [200, 150]]])
+        xyxy = bbox_corner2xyxy(corners)
+        expected_xyxy = np.array([[0, 0, 100, 50], [10, 20, 200, 150]])
+
+        self.assertTrue(np.array_equal(xyxy, expected_xyxy))
+
+
+class TestGetPersWarpMatrix(TestCase):
+
+    def test_get_pers_warp_matrix_identity(self):
+        center = np.array([0, 0])
+        translate = np.array([0, 0])
+        scale = 1.0
+        rot = 0.0
+        shear = np.array([0.0, 0.0])
+        warp_matrix = get_pers_warp_matrix(center, translate, scale, rot,
+                                           shear)
+
+        expected_matrix = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+                                   dtype=np.float32)
+
+        self.assertTrue(np.array_equal(warp_matrix, expected_matrix))
+
+    def test_get_pers_warp_matrix_translation(self):
+        center = np.array([0, 0])
+        translate = np.array([10, 20])
+        scale = 1.0
+        rot = 0.0
+        shear = np.array([0.0, 0.0])
+        warp_matrix = get_pers_warp_matrix(center, translate, scale, rot,
+                                           shear)
+
+        expected_matrix = np.array([[1, 0, 10], [0, 1, 20], [0, 0, 1]],
+                                   dtype=np.float32)
+
+        self.assertTrue(np.array_equal(warp_matrix, expected_matrix))
+
+    def test_get_pers_warp_matrix_scale_rotation_shear(self):
+        center = np.array([0, 0])
+        translate = np.array([0, 0])
+        scale = 1.5
+        rot = 45.0
+        shear = np.array([15.0, 30.0])
+        warp_matrix = get_pers_warp_matrix(center, translate, scale, rot,
+                                           shear)
+
+        expected_matrix = np.array([
+            [1.3448632, -0.77645713, 0.],
+            [1.6730325, 0.44828773, 0.],
+            [0., 0., 1.],
+        ],
+                                   dtype=np.float32)
+
+        # Use np.allclose to compare floating-point arrays within a tolerance
+        self.assertTrue(
+            np.allclose(warp_matrix, expected_matrix, rtol=1e-3, atol=1e-3))
diff --git a/tests/test_structures/test_keypoint/test_keypoint_transforms.py b/tests/test_structures/test_keypoint/test_keypoint_transforms.py
new file mode 100644
index 0000000000..5384ce2b14
--- /dev/null
+++ b/tests/test_structures/test_keypoint/test_keypoint_transforms.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import numpy as np
+
+from mmpose.structures import keypoint_clip_border
+
+
+class TestKeypointClipBorder(TestCase):
+
+    def test_keypoint_clip_border(self):
+        keypoints = np.array([[[10, 20], [30, 40], [-5, 25], [50, 60]]])
+        keypoints_visible = np.array([[1.0, 0.8, 0.5, 1.0]])
+        shape = (50, 50)  # Example frame shape
+
+        clipped_keypoints, clipped_keypoints_visible = keypoint_clip_border(
+            keypoints, keypoints_visible, shape)
+
+        # Check if keypoints outside the frame have visibility set to 0.0
+        self.assertEqual(clipped_keypoints_visible[0, 2], 0.0)
+        self.assertEqual(clipped_keypoints_visible[0, 3], 0.0)
+
+        # Check if keypoints inside the frame have unchanged visibility values
+        self.assertEqual(clipped_keypoints_visible[0, 0], 1.0)
+        self.assertEqual(clipped_keypoints_visible[0, 1], 0.8)
+
+        # Check if keypoints array shapes remain unchanged
+        self.assertEqual(keypoints.shape, clipped_keypoints.shape)
+        self.assertEqual(keypoints_visible.shape,
+                         clipped_keypoints_visible.shape)
+
+        keypoints = np.array([[[10, 20], [30, 40], [-5, 25], [50, 60]]])
+        keypoints_visible = np.array([[1.0, 0.8, 0.5, 1.0]])
+        keypoints_visible_weight = np.array([[1.0, 0.0, 1.0, 1.0]])
+        keypoints_visible = np.stack(
+            (keypoints_visible, keypoints_visible_weight), axis=-1)
+        shape = (50, 50)  # Example frame shape
+
+        clipped_keypoints, clipped_keypoints_visible = keypoint_clip_border(
+            keypoints, keypoints_visible, shape)
+
+        # Check if keypoints array shapes remain unchanged
+        self.assertEqual(keypoints.shape, clipped_keypoints.shape)
+        self.assertEqual(keypoints_visible.shape,
+                         clipped_keypoints_visible.shape)
+
+        # Check if keypoints outside the frame have visibility set to 0.0
+        self.assertEqual(clipped_keypoints_visible[0, 2, 0], 0.0)
+        self.assertEqual(clipped_keypoints_visible[0, 3, 0], 0.0)
+
+        # Check if keypoints inside the frame have unchanged visibility values
+        self.assertEqual(clipped_keypoints_visible[0, 0, 0], 1.0)
+        self.assertEqual(clipped_keypoints_visible[0, 1, 0], 0.8)
+
+        # Check if the visibility weights remain unchanged
+        self.assertSequenceEqual(clipped_keypoints_visible[..., 1].tolist(),
+                                 keypoints_visible[..., 1].tolist())

From ab39ce7505b0cc74cc76355b766987d460c07e73 Mon Sep 17 00:00:00 2001
From: Peng Lu
Date: Thu, 14 Sep 2023 17:44:54 +0800
Subject: [PATCH 3/4] [Fix] Fix typo in COCOMetric (#2691)

---
 mmpose/evaluation/metrics/coco_metric.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmpose/evaluation/metrics/coco_metric.py b/mmpose/evaluation/metrics/coco_metric.py
index d1c7191338..8fc32dd809 100644
--- a/mmpose/evaluation/metrics/coco_metric.py
+++ b/mmpose/evaluation/metrics/coco_metric.py
@@ -526,7 +526,7 @@ def results2json(self, keypoints: Dict[int, list],
                 'score': float(img_kpt['score']),
             }
             if 'bbox' in img_kpt:
-                res['bbox'] = img_kpt['bbox'].tolist(),
+                res['bbox'] = img_kpt['bbox'].tolist()
             result.append(res)
 
         cat_results.extend(result)

From 5530c3b3a1f044bacd4d403a9b7cccf7729539b0 Mon Sep 17 00:00:00 2001
From: Peng Lu
Date: Fri, 15 Sep 2023 16:27:58 +0800
Subject: [PATCH 4/4] [Fix] Fix bug raised by changing bbox_center to input_center (#2693)

---
 mmpose/structures/utils.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/mmpose/structures/utils.py b/mmpose/structures/utils.py
index 882cda8603..616b139c54 100644
--- a/mmpose/structures/utils.py
+++ b/mmpose/structures/utils.py
@@ -50,8 +50,7 @@ def merge_data_samples(data_samples: List[PoseDataSample]) -> PoseDataSample:
             0].pred_fields:
         reverted_heatmaps = [
             revert_heatmap(data_sample.pred_fields.heatmaps,
-                           data_sample.gt_instances.bbox_centers,
-                           data_sample.gt_instances.bbox_scales,
+                           data_sample.input_center, data_sample.input_scale,
                            data_sample.ori_shape)
             for data_sample in data_samples
         ]
@@ -65,8 +64,7 @@ def merge_data_samples(data_samples: List[PoseDataSample]) -> PoseDataSample:
             0].gt_fields:
         reverted_heatmaps = [
             revert_heatmap(data_sample.gt_fields.heatmaps,
-                           data_sample.gt_instances.bbox_centers,
-                           data_sample.gt_instances.bbox_scales,
+                           data_sample.input_center, data_sample.input_scale,
                            data_sample.ori_shape)
             for data_sample in data_samples
         ]
@@ -79,13 +77,13 @@ def merge_data_samples(data_samples: List[PoseDataSample]) -> PoseDataSample:
     return merged
 
 
-def revert_heatmap(heatmap, bbox_center, bbox_scale, img_shape):
+def revert_heatmap(heatmap, input_center, input_scale, img_shape):
     """Revert predicted heatmap on the original image.
 
     Args:
         heatmap (np.ndarray or torch.tensor): predicted heatmap.
-        bbox_center (np.ndarray): bounding box center coordinate.
-        bbox_scale (np.ndarray): bounding box scale.
+        input_center (np.ndarray): center of the model input region.
+        input_scale (np.ndarray): scale (size) of the model input region.
         img_shape (tuple or list): size of original image.
     """
    if torch.is_tensor(heatmap):
@@ -99,8 +97,8 @@ def revert_heatmap(heatmap, bbox_center, bbox_scale, img_shape):
     hm_h, hm_w = heatmap.shape[:2]
     img_h, img_w = img_shape
     warp_mat = get_warp_matrix(
-        bbox_center.reshape((2, )),
-        bbox_scale.reshape((2, )),
+        input_center.reshape((2, )),
+        input_scale.reshape((2, )),
         rot=0,
         output_size=(hm_w, hm_h),
         inv=True)
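
A closing note on the PATCH 4/4 fix: after the rename, revert_heatmap reads the model-input geometry (input_center, input_scale) recorded on the data sample during preprocessing, instead of the old gt_instances.bbox_centers / bbox_scales. A minimal usage sketch follows; the values are purely illustrative, not taken from a real pipeline, where these fields are filled in by the input-preparing transforms:

    import numpy as np
    from mmpose.structures.utils import revert_heatmap

    heatmap = np.random.rand(64, 48).astype(np.float32)  # (H, W) heatmap in model-input space
    input_center = np.array([160.0, 120.0])  # illustrative center (x, y) of the input region
    input_scale = np.array([192.0, 256.0])   # illustrative size (w, h) of the input region
    ori_shape = (240, 320)                   # original image shape (h, w)

    # Builds the inverse warp from heatmap space back to the original image
    # plane and applies it to the heatmap.
    reverted = revert_heatmap(heatmap, input_center, input_scale, ori_shape)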