From 5b56752593e1bdb40ada4628584ffe75bee5183f Mon Sep 17 00:00:00 2001 From: Tau-J <674106399@qq.com> Date: Thu, 6 Jul 2023 11:51:36 +0800 Subject: [PATCH 01/37] update --- projects/README.md | 2 +- projects/rtmpose/README.md | 19 ++++++++++++++----- projects/rtmpose/README_CN.md | 11 +++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/projects/README.md b/projects/README.md index a10ccad65a..eacc217388 100644 --- a/projects/README.md +++ b/projects/README.md @@ -42,7 +42,7 @@ We also provide some documentation listed below to help you get started:
-- **[:bulb:YOLOX-Pose](./yolox-pose)**: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss +- **[:bulb:YOLOX-Pose](./yolox_pose)**: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss
diff --git a/projects/rtmpose/README.md b/projects/rtmpose/README.md index dc5b0dbe23..41c77a0731 100644 --- a/projects/rtmpose/README.md +++ b/projects/rtmpose/README.md @@ -689,9 +689,10 @@ Before starting the deployment, please make sure you install MMPose and MMDeploy Depending on the deployment backend, some backends require compilation of custom operators, so please refer to the corresponding document to ensure the environment is built correctly according to your needs: -- [ONNX RUNTIME SUPPORT](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/onnxruntime.html) -- [TENSORRT SUPPORT](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/tensorrt.html) -- [OPENVINO SUPPORT](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/openvino.html) +- [ONNX](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/onnxruntime.html) +- [TensorRT](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/tensorrt.html) +- [OpenVINO](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/openvino.html) +- [TorchScript](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/torchscript.html) - [More](https://github.com/open-mmlab/mmdeploy/tree/main/docs/en/05-supported-backends) ### 🛠️ Step2. Convert Model @@ -702,12 +703,20 @@ The detailed model conversion tutorial please refer to the [MMDeploy document](h Here we take converting RTMDet-nano and RTMPose-m to ONNX/TensorRT as an example. -- If you only want to use ONNX, please use: +- ONNX - [`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmdet/detection/detection_onnxruntime_static.py) for RTMDet. - [`pose-detection_simcc_onnxruntime_dynamic.py`](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py) for RTMPose. -- If you want to use TensorRT, please use: +- TensorRT - [`detection_tensorrt_static-320x320.py`](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmdet/detection/detection_tensorrt_static-320x320.py) for RTMDet. - [`pose-detection_simcc_tensorrt_dynamic-256x192.py`](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py) for RTMPose. +- More + | Backend | Config | + | :---------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------: | + | ncnn-fp16 | [pose-detection_simcc_ncnn-fp16_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_ncnn-fp16_static-256x192.py) | + | CoreML | [pose-detection_simcc_coreml_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_coreml_static-256x192.py) | + | OpenVINO | [pose-detection_simcc_openvino_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_openvino_static-256x192.py) | + | RKNN | [pose-detection_simcc_rknn-fp16_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_rknn-fp16_static-256x192.py) | + | TorchScript | [pose-detection_torchscript.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_torchscript.py) | If you want to customize the settings in the deployment config for your requirements, please refer to [MMDeploy config tutorial](https://mmdeploy.readthedocs.io/en/latest/02-how-to-run/write_config.html). 
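As a quick illustration of the conversion step described above, below is a minimal sketch that drives the export from Python instead of MMDeploy's command-line tools. It assumes the `mmdeploy.apis.torch2onnx` helper; the image, config and checkpoint paths are placeholders, and the exact argument list should be verified against the installed MMDeploy version.

```python
# Minimal sketch: export RTMPose-m to ONNX with MMDeploy's Python API.
# All paths below are placeholders; adjust them to your local setup.
from mmdeploy.apis import torch2onnx

img = 'demo.jpg'                      # any sample image used for tracing
work_dir = 'work_dir/rtmpose-m-onnx'  # where the exported model is saved
save_file = 'end2end.onnx'            # name of the exported ONNX file
deploy_cfg = 'configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py'
model_cfg = ('projects/rtmpose/rtmpose/body_2d_keypoint/'
             'rtmpose-m_8xb256-420e_coco-256x192.py')
model_checkpoint = 'rtmpose-m.pth'    # local RTMPose-m checkpoint (placeholder)
device = 'cpu'

# convert the PyTorch model to ONNX using the chosen deploy config
torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg,
           model_checkpoint, device)
```

The other deploy configs listed above (TensorRT, ncnn, CoreML, etc.) are selected in the same way, by passing the corresponding config file to MMDeploy.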
diff --git a/projects/rtmpose/README_CN.md b/projects/rtmpose/README_CN.md index 30bddf9ecd..bf134ab260 100644 --- a/projects/rtmpose/README_CN.md +++ b/projects/rtmpose/README_CN.md @@ -683,6 +683,7 @@ python demo/topdown_demo_with_mmdet.py \ - [ONNX](https://mmdeploy.readthedocs.io/zh_CN/latest/05-supported-backends/onnxruntime.html) - [TensorRT](https://mmdeploy.readthedocs.io/zh_CN/latest/05-supported-backends/tensorrt.html) - [OpenVINO](https://mmdeploy.readthedocs.io/zh_CN/latest/05-supported-backends/openvino.html) +- [TorchScript](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/torchscript.html) - [更多](https://github.com/open-mmlab/mmdeploy/tree/main/docs/en/05-supported-backends) ### 🛠️ 模型转换 @@ -703,6 +704,16 @@ python demo/topdown_demo_with_mmdet.py \ \- RTMPose:[`pose-detection_simcc_tensorrt_dynamic-256x192.py`](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py) +- 更多 + + | Backend | Config | + | :---------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------: | + | ncnn-fp16 | [pose-detection_simcc_ncnn-fp16_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_ncnn-fp16_static-256x192.py) | + | CoreML | [pose-detection_simcc_coreml_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_coreml_static-256x192.py) | + | OpenVINO | [pose-detection_simcc_openvino_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_openvino_static-256x192.py) | + | RKNN | [pose-detection_simcc_rknn-fp16_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_rknn-fp16_static-256x192.py) | + | TorchScript | [pose-detection_torchscript.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_torchscript.py) | + 如果你需要对部署配置进行修改,请参考 [MMDeploy config tutorial](https://mmdeploy.readthedocs.io/zh_CN/latest/02-how-to-run/write_config.html). 
本教程中使用的文件结构如下: From 964bca7cbd8550fdc6a8d53cb9811916eac5a5c0 Mon Sep 17 00:00:00 2001 From: Tau Date: Mon, 10 Jul 2023 11:46:42 +0800 Subject: [PATCH 02/37] [Fix] Fix HRFormer log link --- configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md index 87309d2e7c..ef793f06fc 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md @@ -40,4 +40,4 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | [pose_hrformer_small](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py) | 256x192 | 0.738 | 0.904 | 0.812 | 0.793 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192-5310d898_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192_20220316.log.json) | | [pose_hrformer_small](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py) | 384x288 | 0.757 | 0.905 | 0.824 | 0.807 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288-98d237ed_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288_20220316.log.json) | | [pose_hrformer_base](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py) | 256x192 | 0.754 | 0.906 | 0.827 | 0.807 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192-6f5f1169_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192_20220316.log.json) | -| [pose_hrformer_base](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py) | 384x288 | 0.774 | 0.909 | 0.842 | 0.823 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288-ecf0758d_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192_20220316.log.json) | +| [pose_hrformer_base](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py) | 384x288 | 0.774 | 0.909 | 0.842 | 0.823 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288-ecf0758d_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288_20220316.log.json) | From 45835ac86f0bd8c8b130d8aa5065ac1df226763b Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Mon, 10 Jul 2023 15:56:28 +0800 Subject: [PATCH 03/37] [Feature] Add Application 'Just dance' (#2528) --- projects/README.md | 6 + projects/just_dance/README.md | 34 + projects/just_dance/app.py | 109 +++ projects/just_dance/calculate_similarity.py | 105 +++ projects/just_dance/configs/_base_ | 1 + .../configs/rtmdet-nano_one-person.py | 3 + projects/just_dance/just_dance_demo.ipynb | 712 ++++++++++++++++++ projects/just_dance/process_video.py | 229 ++++++ projects/just_dance/utils.py | 106 +++ 9 files changed, 1305 insertions(+) create mode 100644 projects/just_dance/README.md create mode 100644 projects/just_dance/app.py create mode 100644 projects/just_dance/calculate_similarity.py create mode 120000 projects/just_dance/configs/_base_ create mode 
100644 projects/just_dance/configs/rtmdet-nano_one-person.py create mode 100644 projects/just_dance/just_dance_demo.ipynb create mode 100644 projects/just_dance/process_video.py create mode 100644 projects/just_dance/utils.py diff --git a/projects/README.md b/projects/README.md index eacc217388..1089af8194 100644 --- a/projects/README.md +++ b/projects/README.md @@ -54,4 +54,10 @@ We also provide some documentation listed below to help you get started:

+- **[💃Just-Dance](./just_dance)**: A dance scoring system for comparing dance performances in videos + +
+ +

+ - **What's next? Join the rank of *MMPose contributors* by creating a new project**! diff --git a/projects/just_dance/README.md b/projects/just_dance/README.md new file mode 100644 index 0000000000..1255996766 --- /dev/null +++ b/projects/just_dance/README.md @@ -0,0 +1,34 @@ +# Just Dance - A Simple Implementation + +This project presents a dance scoring system based on RTMPose. Users can compare the similarity between two dancers in different videos: one referred to as the "teacher video" and the other as the "student video." + +Here is an example of the output dance comparison: + +![output](https://github.com/open-mmlab/mmpose/assets/26127467/56d5c4d1-55d8-4222-b481-2418cc29a8d4) + +## Usage + +### Jupyter Notebook + +We provide a Jupyter Notebook [`just_dance_demo.ipynb`](./just_dance_demo.ipynb) that contains the complete process of dance comparison. It includes steps such as video FPS adjustment, pose estimation, snippet alignment, scoring, and the generation of the merged video. + +### CLI tool + +Users can simply run the following command to generate the comparison video: + +```shell +python process_video ${TEACHER_VIDEO} ${STUDENT_VIDEO} +``` + +### Gradio + +Users can also utilize Gradio to build an application using this system. We provide the script [`app.py`](./app.py). This application supports webcam input in addition to existing videos. To build this application, please follow these two steps: + +1. Install Gradio + ```shell + pip install gradio + ``` +2. Run the script [`app.py`](./app.py) + ```shell + python app.py + ``` diff --git a/projects/just_dance/app.py b/projects/just_dance/app.py new file mode 100644 index 0000000000..9b40c64fdd --- /dev/null +++ b/projects/just_dance/app.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import sys +from functools import partial +from typing import Optional + +project_path = os.path.join(os.path.dirname(os.path.abspath(__file__))) +mmpose_path = project_path.split('/projects', 1)[0] + +os.system('python -m pip install Openmim') +os.system('python -m mim install "mmcv>=2.0.0"') +os.system('python -m mim install mmengine') +os.system('python -m mim install "mmdet>=3.0.0"') +os.system(f'python -m mim install -e {mmpose_path}') + +os.environ['PATH'] = f"{os.environ['PATH']}:{project_path}" +os.environ[ + 'PYTHONPATH'] = f"{os.environ.get('PYTHONPATH', '.')}:{project_path}" +sys.path.append(project_path) + +import gradio as gr # noqa +from mmengine.utils import mkdir_or_exist # noqa +from process_video import VideoProcessor # noqa + + +def process_video( + teacher_video: Optional[str] = None, + student_video: Optional[str] = None, +): + print(teacher_video) + print(student_video) + + video_processor = VideoProcessor() + if student_video is None and teacher_video is not None: + # Pre-process the teacher video when users record the student video + # using a webcam. This allows users to view the teacher video and + # follow the dance moves while recording the student video. 
+ _ = video_processor.get_keypoints_from_video(teacher_video) + return teacher_video + elif teacher_video is None and student_video is not None: + _ = video_processor.get_keypoints_from_video(student_video) + return student_video + elif teacher_video is None and student_video is None: + return None + + return video_processor.run(teacher_video, student_video) + + +# download video resources +mkdir_or_exist(os.path.join(project_path, 'resources')) +os.system( + f'wget -O {project_path}/resources/tom.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/tom.mp4' # noqa +) +os.system( + f'wget -O {project_path}/resources/idol_producer.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/idol_producer.mp4' # noqa +) +os.system( + f'wget -O {project_path}/resources/tsinghua_30fps.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/tsinghua_30fps.mp4' # noqa +) + +with gr.Blocks() as demo: + with gr.Tab('Upload-Video'): + with gr.Row(): + with gr.Column(): + gr.Markdown('Student Video') + student_video = gr.Video(type='mp4') + gr.Examples([ + os.path.join(project_path, 'resources/tom.mp4'), + os.path.join(project_path, 'resources/tsinghua_30fps.mp4') + ], student_video) + with gr.Column(): + gr.Markdown('Teacher Video') + teacher_video = gr.Video(type='mp4') + gr.Examples([ + os.path.join(project_path, 'resources/idol_producer.mp4') + ], teacher_video) + + button = gr.Button('Grading', variant='primary') + gr.Markdown('## Display') + out_video = gr.Video() + + button.click( + partial(process_video), [teacher_video, student_video], out_video) + + with gr.Tab('Webcam-Video'): + with gr.Row(): + with gr.Column(): + gr.Markdown('Student Video') + student_video = gr.Video(source='webcam', type='mp4') + with gr.Column(): + gr.Markdown('Teacher Video') + teacher_video = gr.Video(type='mp4') + gr.Examples([ + os.path.join(project_path, 'resources/idol_producer.mp4') + ], teacher_video) + button_upload = gr.Button('Upload', variant='primary') + + button = gr.Button('Grading', variant='primary') + gr.Markdown('## Display') + out_video = gr.Video() + + button_upload.click( + partial(process_video), [teacher_video, student_video], out_video) + button.click( + partial(process_video), [teacher_video, student_video], out_video) + +gr.close_all() +demo.queue() +demo.launch() diff --git a/projects/just_dance/calculate_similarity.py b/projects/just_dance/calculate_similarity.py new file mode 100644 index 0000000000..0465dbffaa --- /dev/null +++ b/projects/just_dance/calculate_similarity.py @@ -0,0 +1,105 @@ +import numpy as np +import torch + +flip_indices = np.array( + [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]) +valid_indices = np.array([0] + list(range(5, 17))) + + +@torch.no_grad() +def _calculate_similarity(tch_kpts: np.ndarray, stu_kpts: np.ndarray): + + stu_kpts = torch.from_numpy(stu_kpts[:, None, valid_indices]) + tch_kpts = torch.from_numpy(tch_kpts[None, :, valid_indices]) + stu_kpts = stu_kpts.expand(stu_kpts.shape[0], tch_kpts.shape[1], + stu_kpts.shape[2], 3) + tch_kpts = tch_kpts.expand(stu_kpts.shape[0], tch_kpts.shape[1], + stu_kpts.shape[2], 3) + + matrix = torch.stack((stu_kpts, tch_kpts), dim=4) + if torch.cuda.is_available(): + matrix = matrix.cuda() + mask = torch.logical_and(matrix[:, :, :, 2, 0] > 0.3, + matrix[:, :, :, 2, 1] > 0.3) + matrix[~mask] = 0.0 + + matrix_ = matrix.clone() + matrix_[matrix == 0] = 256 + x_min = matrix_.narrow(3, 0, 1).min(dim=2).values + y_min = matrix_.narrow(3, 1, 1).min(dim=2).values + matrix_ = matrix.clone() 
+ # matrix_[matrix == 0] = 0 + x_max = matrix_.narrow(3, 0, 1).max(dim=2).values + y_max = matrix_.narrow(3, 1, 1).max(dim=2).values + + matrix_ = matrix.clone() + matrix_[:, :, :, 0] = (matrix_[:, :, :, 0] - x_min) / ( + x_max - x_min + 1e-4) + matrix_[:, :, :, 1] = (matrix_[:, :, :, 1] - y_min) / ( + y_max - y_min + 1e-4) + matrix_[:, :, :, 2] = (matrix_[:, :, :, 2] > 0.3).float() + xy_dist = matrix_[..., :2, 0] - matrix_[..., :2, 1] + score = matrix_[..., 2, 0] * matrix_[..., 2, 1] + + similarity = (torch.exp(-50 * xy_dist.pow(2).sum(dim=-1)) * + score).sum(dim=-1) / ( + score.sum(dim=-1) + 1e-6) + num_visible_kpts = score.sum(dim=-1) + similarity = similarity * torch.log( + (1 + (num_visible_kpts - 1) * 10).clamp(min=1)) / np.log(161) + + similarity[similarity.isnan()] = 0 + + return similarity + + +@torch.no_grad() +def calculate_similarity(tch_kpts: np.ndarray, stu_kpts: np.ndarray): + assert tch_kpts.shape[1] == 17 + assert tch_kpts.shape[2] == 3 + assert stu_kpts.shape[1] == 17 + assert stu_kpts.shape[2] == 3 + + similarity1 = _calculate_similarity(tch_kpts, stu_kpts) + + stu_kpts_flip = stu_kpts[:, flip_indices] + stu_kpts_flip[..., 0] = 191.5 - stu_kpts_flip[..., 0] + similarity2 = _calculate_similarity(tch_kpts, stu_kpts_flip) + + similarity = torch.stack((similarity1, similarity2)).max(dim=0).values + + return similarity + + +@torch.no_grad() +def select_piece_from_similarity(similarity): + m, n = similarity.size() + row_indices = torch.arange(m).view(-1, 1).expand(m, n).to(similarity) + col_indices = torch.arange(n).view(1, -1).expand(m, n).to(similarity) + diagonal_indices = similarity.size(0) - 1 - row_indices + col_indices + unique_diagonal_indices, inverse_indices = torch.unique( + diagonal_indices, return_inverse=True) + + diagonal_sums_list = torch.zeros( + unique_diagonal_indices.size(0), + dtype=similarity.dtype, + device=similarity.device) + diagonal_sums_list.scatter_add_(0, inverse_indices.view(-1), + similarity.view(-1)) + diagonal_sums_list[:min(m, n) // 4] = 0 + diagonal_sums_list[-min(m, n) // 4:] = 0 + index = diagonal_sums_list.argmax().item() + + similarity_smooth = torch.nn.functional.max_pool2d( + similarity[None], (1, 11), stride=(1, 1), padding=(0, 5))[0] + similarity_vec = similarity_smooth.diagonal(offset=index - m + + 1).cpu().numpy() + + stu_start = max(0, m - 1 - index) + tch_start = max(0, index - m + 1) + + return dict( + stu_start=stu_start, + tch_start=tch_start, + length=len(similarity_vec), + similarity=similarity_vec) diff --git a/projects/just_dance/configs/_base_ b/projects/just_dance/configs/_base_ new file mode 120000 index 0000000000..3bd06d44a7 --- /dev/null +++ b/projects/just_dance/configs/_base_ @@ -0,0 +1 @@ +../../../configs/_base_ \ No newline at end of file diff --git a/projects/just_dance/configs/rtmdet-nano_one-person.py b/projects/just_dance/configs/rtmdet-nano_one-person.py new file mode 100644 index 0000000000..a838522918 --- /dev/null +++ b/projects/just_dance/configs/rtmdet-nano_one-person.py @@ -0,0 +1,3 @@ +_base_ = '../../rtmpose/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py' + +model = dict(test_cfg=dict(nms_pre=1, score_thr=0.0, max_per_img=1)) diff --git a/projects/just_dance/just_dance_demo.ipynb b/projects/just_dance/just_dance_demo.ipynb new file mode 100644 index 0000000000..45a16e4b8c --- /dev/null +++ b/projects/just_dance/just_dance_demo.ipynb @@ -0,0 +1,712 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "6d999c38-2087-4250-b6a4-a30cf8b44ec0", + "metadata": { + 
"ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T13:11:38.997916Z", + "iopub.status.busy": "2023-07-05T13:11:38.997587Z", + "iopub.status.idle": "2023-07-05T13:11:39.001928Z", + "shell.execute_reply": "2023-07-05T13:11:39.001429Z", + "shell.execute_reply.started": "2023-07-05T13:11:38.997898Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import os.path as osp\n", + "import torch\n", + "import numpy as np\n", + "import mmcv\n", + "import cv2\n", + "from mmengine.utils import track_iter_progress" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfa9bf9b-dc2c-4803-a034-8ae8778113e0", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T12:42:15.884465Z", + "iopub.status.busy": "2023-07-05T12:42:15.884167Z", + "iopub.status.idle": "2023-07-05T12:42:19.774569Z", + "shell.execute_reply": "2023-07-05T12:42:19.774020Z", + "shell.execute_reply.started": "2023-07-05T12:42:15.884448Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# download example videos\n", + "from mmengine.utils import mkdir_or_exist\n", + "mkdir_or_exist('resources')\n", + "! wget -O resources/student_video.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/tom.mp4 \n", + "! wget -O resources/teacher_video.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/idol_producer.mp4 \n", + "# ! wget -O resources/student_video.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/tsinghua_30fps.mp4 \n", + "\n", + "student_video = 'resources/student_video.mp4'\n", + "teacher_video = 'resources/teacher_video.mp4'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "652b6b91-e1c0-461b-90e5-653bc35ec380", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T12:42:20.693931Z", + "iopub.status.busy": "2023-07-05T12:42:20.693353Z", + "iopub.status.idle": "2023-07-05T12:43:14.533985Z", + "shell.execute_reply": "2023-07-05T12:43:14.533431Z", + "shell.execute_reply.started": "2023-07-05T12:42:20.693910Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# convert the fps of videos to 30\n", + "from mmcv import VideoReader\n", + "\n", + "if VideoReader(student_video) != 30:\n", + " # ffmpeg is required to convert the video fps\n", + " # which can be installed via `sudo apt install ffmpeg` on ubuntu\n", + " student_video_30fps = student_video.replace(\n", + " f\".{student_video.rsplit('.', 1)[1]}\",\n", + " f\"_30fps.{student_video.rsplit('.', 1)[1]}\"\n", + " )\n", + " !ffmpeg -i {student_video} -vf \"minterpolate='fps=30'\" {student_video_30fps}\n", + " student_video = student_video_30fps\n", + " \n", + "if VideoReader(teacher_video) != 30:\n", + " teacher_video_30fps = teacher_video.replace(\n", + " f\".{teacher_video.rsplit('.', 1)[1]}\",\n", + " f\"_30fps.{teacher_video.rsplit('.', 1)[1]}\"\n", + " )\n", + " !ffmpeg -i {teacher_video} -vf \"minterpolate='fps=30'\" {teacher_video_30fps}\n", + " teacher_video = teacher_video_30fps " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a4e141d-ee4a-4e06-a380-230418c9b936", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T12:45:01.672054Z", + "iopub.status.busy": "2023-07-05T12:45:01.671727Z", + "iopub.status.idle": "2023-07-05T12:45:02.417026Z", + 
"shell.execute_reply": "2023-07-05T12:45:02.416567Z", + "shell.execute_reply.started": "2023-07-05T12:45:01.672035Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# init pose estimator\n", + "from mmpose.apis.inferencers import Pose2DInferencer\n", + "pose_estimator = Pose2DInferencer(\n", + " 'rtmpose-t_8xb256-420e_aic-coco-256x192',\n", + " det_model='configs/rtmdet-nano_one-person.py',\n", + " det_weights='https://download.openmmlab.com/mmpose/v1/projects/' \n", + " 'rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth'\n", + ")\n", + "pose_estimator.model.test_cfg['flip_test'] = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "879ba5c0-4d2d-4cca-92d7-d4f94e04a821", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T12:45:05.192437Z", + "iopub.status.busy": "2023-07-05T12:45:05.191982Z", + "iopub.status.idle": "2023-07-05T12:45:05.197379Z", + "shell.execute_reply": "2023-07-05T12:45:05.196780Z", + "shell.execute_reply.started": "2023-07-05T12:45:05.192417Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "@torch.no_grad()\n", + "def get_keypoints_from_frame(image, pose_estimator):\n", + " \"\"\"Extract keypoints from a single video frame.\"\"\"\n", + "\n", + " det_results = pose_estimator.detector(\n", + " image, return_datasample=True)['predictions']\n", + " pred_instance = det_results[0].pred_instances.numpy()\n", + "\n", + " if len(pred_instance) == 0 or pred_instance.scores[0] < 0.2:\n", + " return np.zeros((1, 17, 3), dtype=np.float32)\n", + "\n", + " data_info = dict(\n", + " img=image,\n", + " bbox=pred_instance.bboxes[:1],\n", + " bbox_score=pred_instance.scores[:1])\n", + "\n", + " data_info.update(pose_estimator.model.dataset_meta)\n", + " data = pose_estimator.collate_fn(\n", + " [pose_estimator.pipeline(data_info)])\n", + "\n", + " # custom forward\n", + " data = pose_estimator.model.data_preprocessor(data, False)\n", + " feats = pose_estimator.model.extract_feat(data['inputs'])\n", + " pred_instances = pose_estimator.model.head.predict(\n", + " feats,\n", + " data['data_samples'],\n", + " test_cfg=pose_estimator.model.test_cfg)[0]\n", + " keypoints = np.concatenate(\n", + " (pred_instances.keypoints, pred_instances.keypoint_scores[...,\n", + " None]),\n", + " axis=-1)\n", + "\n", + " return keypoints " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31e5bd4c-4c2b-4fe0-b64c-1afed67b7688", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T12:47:55.564788Z", + "iopub.status.busy": "2023-07-05T12:47:55.564450Z", + "iopub.status.idle": "2023-07-05T12:49:37.222662Z", + "shell.execute_reply": "2023-07-05T12:49:37.222028Z", + "shell.execute_reply.started": "2023-07-05T12:47:55.564770Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# pose estimation in two videos\n", + "student_poses, teacher_poses = [], []\n", + "for frame in VideoReader(student_video):\n", + " student_poses.append(get_keypoints_from_frame(frame, pose_estimator))\n", + "for frame in VideoReader(teacher_video):\n", + " teacher_poses.append(get_keypoints_from_frame(frame, pose_estimator))\n", + " \n", + "student_poses = np.concatenate(student_poses)\n", + "teacher_poses = np.concatenate(teacher_poses)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38a8d7a5-17ed-4ce2-bb8b-d1637cb49578", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + 
"iopub.execute_input": "2023-07-05T12:55:09.342432Z", + "iopub.status.busy": "2023-07-05T12:55:09.342185Z", + "iopub.status.idle": "2023-07-05T12:55:09.350522Z", + "shell.execute_reply": "2023-07-05T12:55:09.350099Z", + "shell.execute_reply.started": "2023-07-05T12:55:09.342416Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "valid_indices = np.array([0] + list(range(5, 17)))\n", + "\n", + "@torch.no_grad()\n", + "def _calculate_similarity(tch_kpts: np.ndarray, stu_kpts: np.ndarray):\n", + "\n", + " stu_kpts = torch.from_numpy(stu_kpts[:, None, valid_indices])\n", + " tch_kpts = torch.from_numpy(tch_kpts[None, :, valid_indices])\n", + " stu_kpts = stu_kpts.expand(stu_kpts.shape[0], tch_kpts.shape[1],\n", + " stu_kpts.shape[2], 3)\n", + " tch_kpts = tch_kpts.expand(stu_kpts.shape[0], tch_kpts.shape[1],\n", + " stu_kpts.shape[2], 3)\n", + "\n", + " matrix = torch.stack((stu_kpts, tch_kpts), dim=4)\n", + " if torch.cuda.is_available():\n", + " matrix = matrix.cuda()\n", + " # only consider visible keypoints\n", + " mask = torch.logical_and(matrix[:, :, :, 2, 0] > 0.3,\n", + " matrix[:, :, :, 2, 1] > 0.3)\n", + " matrix[~mask] = 0.0\n", + "\n", + " matrix_ = matrix.clone()\n", + " matrix_[matrix == 0] = 256\n", + " x_min = matrix_.narrow(3, 0, 1).min(dim=2).values\n", + " y_min = matrix_.narrow(3, 1, 1).min(dim=2).values\n", + " matrix_ = matrix.clone()\n", + " x_max = matrix_.narrow(3, 0, 1).max(dim=2).values\n", + " y_max = matrix_.narrow(3, 1, 1).max(dim=2).values\n", + "\n", + " matrix_ = matrix.clone()\n", + " matrix_[:, :, :, 0] = (matrix_[:, :, :, 0] - x_min) / (\n", + " x_max - x_min + 1e-4)\n", + " matrix_[:, :, :, 1] = (matrix_[:, :, :, 1] - y_min) / (\n", + " y_max - y_min + 1e-4)\n", + " matrix_[:, :, :, 2] = (matrix_[:, :, :, 2] > 0.3).float()\n", + " xy_dist = matrix_[..., :2, 0] - matrix_[..., :2, 1]\n", + " score = matrix_[..., 2, 0] * matrix_[..., 2, 1]\n", + "\n", + " similarity = (torch.exp(-50 * xy_dist.pow(2).sum(dim=-1)) *\n", + " score).sum(dim=-1) / (\n", + " score.sum(dim=-1) + 1e-6)\n", + " num_visible_kpts = score.sum(dim=-1)\n", + " similarity = similarity * torch.log(\n", + " (1 + (num_visible_kpts - 1) * 10).clamp(min=1)) / np.log(161)\n", + "\n", + " similarity[similarity.isnan()] = 0\n", + "\n", + " return similarity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "658bcf89-df06-4c73-9323-8973a49c14c3", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-05T12:55:31.978675Z", + "iopub.status.busy": "2023-07-05T12:55:31.978219Z", + "iopub.status.idle": "2023-07-05T12:55:32.149624Z", + "shell.execute_reply": "2023-07-05T12:55:32.148568Z", + "shell.execute_reply.started": "2023-07-05T12:55:31.978657Z" + } + }, + "outputs": [], + "source": [ + "# compute similarity without flip\n", + "similarity1 = _calculate_similarity(teacher_poses, student_poses)\n", + "\n", + "# compute similarity with flip\n", + "flip_indices = np.array(\n", + " [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15])\n", + "student_poses_flip = student_poses[:, flip_indices]\n", + "student_poses_flip[..., 0] = 191.5 - student_poses_flip[..., 0]\n", + "similarity2 = _calculate_similarity(teacher_poses, student_poses_flip)\n", + "\n", + "# select the larger similarity\n", + "similarity = torch.stack((similarity1, similarity2)).max(dim=0).values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f981410d-4585-47c1-98c0-6946f948487d", + "metadata": { + "ExecutionIndicator": { + "show": false + }, + "execution": { + 
"iopub.execute_input": "2023-07-05T12:55:57.321845Z", + "iopub.status.busy": "2023-07-05T12:55:57.321530Z", + "iopub.status.idle": "2023-07-05T12:55:57.582879Z", + "shell.execute_reply": "2023-07-05T12:55:57.582425Z", + "shell.execute_reply.started": "2023-07-05T12:55:57.321826Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# visualize the similarity\n", + "plt.imshow(similarity.cpu().numpy())\n", + "\n", + "# there is an apparent diagonal in the figure\n", + "# we can select matched video snippets with this diagonal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13c189e5-fc53-46a2-9057-f0f2ffc1f46d", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-05T12:58:16.913855Z", + "iopub.status.busy": "2023-07-05T12:58:16.913529Z", + "iopub.status.idle": "2023-07-05T12:58:16.919972Z", + "shell.execute_reply": "2023-07-05T12:58:16.919005Z", + "shell.execute_reply.started": "2023-07-05T12:58:16.913837Z" + } + }, + "outputs": [], + "source": [ + "@torch.no_grad()\n", + "def select_piece_from_similarity(similarity):\n", + " m, n = similarity.size()\n", + " row_indices = torch.arange(m).view(-1, 1).expand(m, n).to(similarity)\n", + " col_indices = torch.arange(n).view(1, -1).expand(m, n).to(similarity)\n", + " diagonal_indices = similarity.size(0) - 1 - row_indices + col_indices\n", + " unique_diagonal_indices, inverse_indices = torch.unique(\n", + " diagonal_indices, return_inverse=True)\n", + "\n", + " diagonal_sums_list = torch.zeros(\n", + " unique_diagonal_indices.size(0),\n", + " dtype=similarity.dtype,\n", + " device=similarity.device)\n", + " diagonal_sums_list.scatter_add_(0, inverse_indices.view(-1),\n", + " similarity.view(-1))\n", + " diagonal_sums_list[:min(m, n) // 4] = 0\n", + " diagonal_sums_list[-min(m, n) // 4:] = 0\n", + " index = diagonal_sums_list.argmax().item()\n", + "\n", + " similarity_smooth = torch.nn.functional.max_pool2d(\n", + " similarity[None], (1, 11), stride=(1, 1), padding=(0, 5))[0]\n", + " similarity_vec = similarity_smooth.diagonal(offset=index - m +\n", + " 1).cpu().numpy()\n", + "\n", + " stu_start = max(0, m - 1 - index)\n", + " tch_start = max(0, index - m + 1)\n", + "\n", + " return dict(\n", + " stu_start=stu_start,\n", + " tch_start=tch_start,\n", + " length=len(similarity_vec),\n", + " similarity=similarity_vec)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c0e19df-949d-471d-804d-409b3b9ddf7d", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T12:58:44.860190Z", + "iopub.status.busy": "2023-07-05T12:58:44.859878Z", + "iopub.status.idle": "2023-07-05T12:58:44.888465Z", + "shell.execute_reply": "2023-07-05T12:58:44.887917Z", + "shell.execute_reply.started": "2023-07-05T12:58:44.860173Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "matched_piece_info = select_piece_from_similarity(similarity)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51b0a2bd-253c-4a8f-a82a-263e18a4703e", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T13:01:19.061408Z", + "iopub.status.busy": "2023-07-05T13:01:19.060857Z", + "iopub.status.idle": "2023-07-05T13:01:19.293742Z", + "shell.execute_reply": "2023-07-05T13:01:19.293298Z", + "shell.execute_reply.started": "2023-07-05T13:01:19.061378Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "plt.imshow(similarity.cpu().numpy())\n", + "plt.plot((matched_piece_info['tch_start'], 
\n", + " matched_piece_info['tch_start']+matched_piece_info['length']-1),\n", + " (matched_piece_info['stu_start'],\n", + " matched_piece_info['stu_start']+matched_piece_info['length']-1), 'r')" + ] + }, + { + "cell_type": "markdown", + "id": "ffcde4e7-ff50-483a-b515-604c1d8f121a", + "metadata": {}, + "source": [ + "# Generate Output Video" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72171a0c-ab33-45bb-b84c-b15f0816ed3a", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T13:11:50.063595Z", + "iopub.status.busy": "2023-07-05T13:11:50.063259Z", + "iopub.status.idle": "2023-07-05T13:11:50.070929Z", + "shell.execute_reply": "2023-07-05T13:11:50.070411Z", + "shell.execute_reply.started": "2023-07-05T13:11:50.063574Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Tuple\n", + "\n", + "def resize_image_to_fixed_height(image: np.ndarray,\n", + " fixed_height: int) -> np.ndarray:\n", + " \"\"\"Resizes an input image to a specified fixed height while maintaining its\n", + " aspect ratio.\n", + "\n", + " Args:\n", + " image (np.ndarray): Input image as a numpy array [H, W, C]\n", + " fixed_height (int): Desired fixed height of the output image.\n", + "\n", + " Returns:\n", + " Resized image as a numpy array (fixed_height, new_width, channels).\n", + " \"\"\"\n", + " original_height, original_width = image.shape[:2]\n", + "\n", + " scale_ratio = fixed_height / original_height\n", + " new_width = int(original_width * scale_ratio)\n", + " resized_image = cv2.resize(image, (new_width, fixed_height))\n", + "\n", + " return resized_image\n", + "\n", + "def blend_images(img1: np.ndarray,\n", + " img2: np.ndarray,\n", + " blend_ratios: Tuple[float, float] = (1, 1)) -> np.ndarray:\n", + " \"\"\"Blends two input images with specified blend ratios.\n", + "\n", + " Args:\n", + " img1 (np.ndarray): First input image as a numpy array [H, W, C].\n", + " img2 (np.ndarray): Second input image as a numpy array [H, W, C]\n", + " blend_ratios (tuple): A tuple of two floats representing the blend\n", + " ratios for the two input images.\n", + "\n", + " Returns:\n", + " Blended image as a numpy array [H, W, C]\n", + " \"\"\"\n", + "\n", + " def normalize_image(image: np.ndarray) -> np.ndarray:\n", + " if image.dtype == np.uint8:\n", + " return image.astype(np.float32) / 255.0\n", + " return image\n", + "\n", + " img1 = normalize_image(img1)\n", + " img2 = normalize_image(img2)\n", + "\n", + " blended_image = img1 * blend_ratios[0] + img2 * blend_ratios[1]\n", + " blended_image = blended_image.clip(min=0, max=1)\n", + " blended_image = (blended_image * 255).astype(np.uint8)\n", + "\n", + " return blended_image\n", + "\n", + "def get_smoothed_kpt(kpts, index, sigma=5):\n", + " \"\"\"Smooths keypoints using a Gaussian filter.\"\"\"\n", + " assert kpts.shape[1] == 17\n", + " assert kpts.shape[2] == 3\n", + " assert sigma % 2 == 1\n", + "\n", + " num_kpts = len(kpts)\n", + "\n", + " start_idx = max(0, index - sigma // 2)\n", + " end_idx = min(num_kpts, index + sigma // 2 + 1)\n", + "\n", + " # Extract a piece of the keypoints array to apply the filter\n", + " piece = kpts[start_idx:end_idx].copy()\n", + " original_kpt = kpts[index]\n", + "\n", + " # Split the piece into coordinates and scores\n", + " coords, scores = piece[..., :2], piece[..., 2]\n", + "\n", + " # Calculate the Gaussian ratio for each keypoint\n", + " gaussian_ratio = np.arange(len(scores)) + start_idx - index\n", + " gaussian_ratio = 
np.exp(-gaussian_ratio**2 / 2)\n", + "\n", + " # Update scores using the Gaussian ratio\n", + " scores *= gaussian_ratio[:, None]\n", + "\n", + " # Compute the smoothed coordinates\n", + " smoothed_coords = (coords * scores[..., None]).sum(axis=0) / (\n", + " scores[..., None].sum(axis=0) + 1e-4)\n", + "\n", + " original_kpt[..., :2] = smoothed_coords\n", + "\n", + " return original_kpt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "609b5adc-e176-4bf9-b9a4-506f72440017", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T13:12:46.198835Z", + "iopub.status.busy": "2023-07-05T13:12:46.198268Z", + "iopub.status.idle": "2023-07-05T13:12:46.202273Z", + "shell.execute_reply": "2023-07-05T13:12:46.200881Z", + "shell.execute_reply.started": "2023-07-05T13:12:46.198815Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "score, last_vis_score = 0, 0\n", + "video_writer = None\n", + "output_file = 'output.mp4'\n", + "stu_kpts = student_poses\n", + "tch_kpts = teacher_poses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a264405a-5d50-49de-8637-2d1f67cb0a70", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2023-07-05T13:13:11.334760Z", + "iopub.status.busy": "2023-07-05T13:13:11.334433Z", + "iopub.status.idle": "2023-07-05T13:13:17.264181Z", + "shell.execute_reply": "2023-07-05T13:13:17.262931Z", + "shell.execute_reply.started": "2023-07-05T13:13:11.334742Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from mmengine.structures import InstanceData\n", + "\n", + "tch_video_reader = VideoReader(teacher_video)\n", + "stu_video_reader = VideoReader(student_video)\n", + "for _ in range(matched_piece_info['tch_start']):\n", + " _ = next(tch_video_reader)\n", + "for _ in range(matched_piece_info['stu_start']):\n", + " _ = next(stu_video_reader)\n", + " \n", + "for i in track_iter_progress(range(matched_piece_info['length'])):\n", + " tch_frame = mmcv.bgr2rgb(next(tch_video_reader))\n", + " stu_frame = mmcv.bgr2rgb(next(stu_video_reader))\n", + " tch_frame = resize_image_to_fixed_height(tch_frame, 300)\n", + " stu_frame = resize_image_to_fixed_height(stu_frame, 300)\n", + "\n", + " stu_kpt = get_smoothed_kpt(stu_kpts, matched_piece_info['stu_start'] + i,\n", + " 5)\n", + " tch_kpt = get_smoothed_kpt(tch_kpts, matched_piece_info['tch_start'] + i,\n", + " 5)\n", + "\n", + " # draw pose\n", + " stu_kpt[..., 1] += (300 - 256)\n", + " tch_kpt[..., 0] += (256 - 192)\n", + " tch_kpt[..., 1] += (300 - 256)\n", + " stu_inst = InstanceData(\n", + " keypoints=stu_kpt[None, :, :2],\n", + " keypoint_scores=stu_kpt[None, :, 2])\n", + " tch_inst = InstanceData(\n", + " keypoints=tch_kpt[None, :, :2],\n", + " keypoint_scores=tch_kpt[None, :, 2])\n", + " \n", + " stu_out_img = pose_estimator.visualizer._draw_instances_kpts(\n", + " np.zeros((300, 256, 3)), stu_inst)\n", + " tch_out_img = pose_estimator.visualizer._draw_instances_kpts(\n", + " np.zeros((300, 256, 3)), tch_inst)\n", + " out_img = blend_images(\n", + " stu_out_img, tch_out_img, blend_ratios=(1, 0.3))\n", + "\n", + " # draw score\n", + " score_frame = matched_piece_info['similarity'][i]\n", + " score += score_frame * 1000\n", + " if score - last_vis_score > 1500:\n", + " last_vis_score = score\n", + " pose_estimator.visualizer.set_image(out_img)\n", + " pose_estimator.visualizer.draw_texts(\n", + " 'score: ', (60, 30),\n", + " font_sizes=15,\n", + " colors=(255, 255, 
255),\n", + " vertical_alignments='bottom')\n", + " pose_estimator.visualizer.draw_texts(\n", + " f'{int(last_vis_score)}', (115, 30),\n", + " font_sizes=30 * max(0.4, score_frame),\n", + " colors=(255, 255, 255),\n", + " vertical_alignments='bottom')\n", + " out_img = pose_estimator.visualizer.get_image() \n", + " \n", + " # concatenate\n", + " concatenated_image = np.hstack((stu_frame, out_img, tch_frame))\n", + " if video_writer is None:\n", + " video_writer = cv2.VideoWriter(output_file,\n", + " cv2.VideoWriter_fourcc(*'mp4v'),\n", + " 30,\n", + " (concatenated_image.shape[1],\n", + " concatenated_image.shape[0]))\n", + " video_writer.write(mmcv.rgb2bgr(concatenated_image))\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "745fdd75-6ed4-4cae-9f21-c2cd486ee918", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-05T13:13:18.704492Z", + "iopub.status.busy": "2023-07-05T13:13:18.704179Z", + "iopub.status.idle": "2023-07-05T13:13:18.714843Z", + "shell.execute_reply": "2023-07-05T13:13:18.713866Z", + "shell.execute_reply.started": "2023-07-05T13:13:18.704472Z" + } + }, + "outputs": [], + "source": [ + "if video_writer is not None:\n", + " video_writer.release() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cb0bc99-ca19-44f1-bc0a-38e14afa980f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/projects/just_dance/process_video.py b/projects/just_dance/process_video.py new file mode 100644 index 0000000000..7f1d48b922 --- /dev/null +++ b/projects/just_dance/process_video.py @@ -0,0 +1,229 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import tempfile + +import cv2 +import mmcv +import numpy as np +import torch +from mmengine.structures import InstanceData +from mmengine.utils import track_iter_progress + +from mmpose.apis import Pose2DInferencer +from mmpose.datasets.datasets.utils import parse_pose_metainfo +from mmpose.visualization import PoseLocalVisualizer + +try: + from .calculate_similarity import (calculate_similarity, + select_piece_from_similarity) + from .utils import (blend_images, convert_video_fps, get_smoothed_kpt, + resize_image_to_fixed_height) +except ImportError: + from calculate_similarity import (calculate_similarity, + select_piece_from_similarity) + from utils import (blend_images, convert_video_fps, get_smoothed_kpt, + resize_image_to_fixed_height) + +det_config = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'configs/rtmdet-nano_one-person.py') +det_weights = 'https://download.openmmlab.com/mmpose/v1/projects/' \ + 'rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth' + + +class VideoProcessor: + """A class to process videos for pose estimation and visualization.""" + + @property + def pose_estimator(self) -> Pose2DInferencer: + if not hasattr(self, '_pose_estimator'): + self._pose_estimator = Pose2DInferencer( + 'rtmpose-t_8xb256-420e_aic-coco-256x192', + det_model=det_config, + det_weights=det_weights) + self._pose_estimator.model.test_cfg['flip_test'] = False + return self._pose_estimator + + @property + def visualizer(self) -> PoseLocalVisualizer: + if hasattr(self, '_visualizer'): + return self._visualizer + elif hasattr(self, '_pose_estimator'): + return self._pose_estimator.visualizer + + # init visualizer + self._visualizer = PoseLocalVisualizer() + metainfo_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)).rsplit(os.sep, 1)[0], + 'configs/_base_/datasets/coco.py') + metainfo = parse_pose_metainfo(dict(from_file=metainfo_file)) + self._visualizer.set_dataset_meta(metainfo) + return self._visualizer + + @torch.no_grad() + def get_keypoints_from_frame(self, image: np.ndarray) -> np.ndarray: + """Extract keypoints from a single video frame.""" + + det_results = self.pose_estimator.detector( + image, return_datasample=True)['predictions'] + pred_instance = det_results[0].pred_instances + + if len(pred_instance) == 0: + return np.zeros((1, 17, 3), dtype=np.float32) + + # only select the most significant person + data_info = dict( + img=image, + bbox=pred_instance.bboxes.cpu().numpy()[:1], + bbox_score=pred_instance.scores.cpu().numpy()[:1]) + + if data_info['bbox_score'] < 0.2: + return np.zeros((1, 17, 3), dtype=np.float32) + + data_info.update(self.pose_estimator.model.dataset_meta) + data = self.pose_estimator.collate_fn( + [self.pose_estimator.pipeline(data_info)]) + + # custom forward + data = self.pose_estimator.model.data_preprocessor(data, False) + feats = self.pose_estimator.model.extract_feat(data['inputs']) + pred_instances = self.pose_estimator.model.head.predict( + feats, + data['data_samples'], + test_cfg=self.pose_estimator.model.test_cfg)[0] + keypoints = np.concatenate( + (pred_instances.keypoints, pred_instances.keypoint_scores[..., + None]), + axis=-1) + + return keypoints + + @torch.no_grad() + def get_keypoints_from_video(self, video: str) -> np.ndarray: + """Extract keypoints from a video.""" + + video_fname = video.rsplit('.', 1)[0] + if os.path.exists(f'{video_fname}_kpts.pth'): + keypoints = torch.load(f'{video_fname}_kpts.pth') + return keypoints + + video_reader = mmcv.VideoReader(video) + + if video_reader.fps 
!= 30: + video_reader = mmcv.VideoReader(convert_video_fps(video)) + + assert video_reader.fps == 30, f'only support videos with 30 FPS, ' \ + f'but the video {video_fname} has {video_reader.fps} fps' + keypoints_list = [] + for i, frame in enumerate(video_reader): + keypoints = self.get_keypoints_from_frame(frame) + keypoints_list.append(keypoints) + keypoints = np.concatenate(keypoints_list) + torch.save(keypoints, f'{video_fname}_kpts.pth') + return keypoints + + @torch.no_grad() + def run(self, tch_video: str, stu_video: str): + # extract human poses + tch_kpts = self.get_keypoints_from_video(tch_video) + stu_kpts = self.get_keypoints_from_video(stu_video) + + # compute similarity + similarity = calculate_similarity(tch_kpts, stu_kpts) + + # select piece + piece_info = select_piece_from_similarity(similarity) + + # output + tch_name = os.path.basename(tch_video).rsplit('.', 1)[0] + stu_name = os.path.basename(stu_video).rsplit('.', 1)[0] + fname = f'{tch_name}-{stu_name}.mp4' + output_file = os.path.join(tempfile.mkdtemp(), fname) + return self.generate_output_video(tch_video, stu_video, output_file, + tch_kpts, stu_kpts, piece_info) + + def generate_output_video(self, tch_video: str, stu_video: str, + output_file: str, tch_kpts: np.ndarray, + stu_kpts: np.ndarray, piece_info: dict) -> str: + """Generate an output video with keypoints overlay.""" + + tch_video_reader = mmcv.VideoReader(tch_video) + stu_video_reader = mmcv.VideoReader(stu_video) + for _ in range(piece_info['tch_start']): + _ = next(tch_video_reader) + for _ in range(piece_info['stu_start']): + _ = next(stu_video_reader) + + score, last_vis_score = 0, 0 + video_writer = None + for i in track_iter_progress(range(piece_info['length'])): + tch_frame = mmcv.bgr2rgb(next(tch_video_reader)) + stu_frame = mmcv.bgr2rgb(next(stu_video_reader)) + tch_frame = resize_image_to_fixed_height(tch_frame, 300) + stu_frame = resize_image_to_fixed_height(stu_frame, 300) + + stu_kpt = get_smoothed_kpt(stu_kpts, piece_info['stu_start'] + i, + 5) + tch_kpt = get_smoothed_kpt(tch_kpts, piece_info['tch_start'] + i, + 5) + + # draw pose + stu_kpt[..., 1] += (300 - 256) + tch_kpt[..., 0] += (256 - 192) + tch_kpt[..., 1] += (300 - 256) + stu_inst = InstanceData( + keypoints=stu_kpt[None, :, :2], + keypoint_scores=stu_kpt[None, :, 2]) + tch_inst = InstanceData( + keypoints=tch_kpt[None, :, :2], + keypoint_scores=tch_kpt[None, :, 2]) + + stu_out_img = self.visualizer._draw_instances_kpts( + np.zeros((300, 256, 3)), stu_inst) + tch_out_img = self.visualizer._draw_instances_kpts( + np.zeros((300, 256, 3)), tch_inst) + out_img = blend_images( + stu_out_img, tch_out_img, blend_ratios=(1, 0.3)) + + # draw score + score_frame = piece_info['similarity'][i] + score += score_frame * 1000 + if score - last_vis_score > 1500: + last_vis_score = score + self.visualizer.set_image(out_img) + self.visualizer.draw_texts( + 'score: ', (60, 30), + font_sizes=15, + colors=(255, 255, 255), + vertical_alignments='bottom') + self.visualizer.draw_texts( + f'{int(last_vis_score)}', (115, 30), + font_sizes=30 * max(0.4, score_frame), + colors=(255, 255, 255), + vertical_alignments='bottom') + out_img = self.visualizer.get_image() + + # concatenate + concatenated_image = np.hstack((stu_frame, out_img, tch_frame)) + if video_writer is None: + video_writer = cv2.VideoWriter(output_file, + cv2.VideoWriter_fourcc(*'mp4v'), + 30, + (concatenated_image.shape[1], + concatenated_image.shape[0])) + video_writer.write(mmcv.rgb2bgr(concatenated_image)) + + if video_writer is not None: + 
video_writer.release() + return output_file + + +if __name__ == '__main__': + from argparse import ArgumentParser + parser = ArgumentParser() + parser.add_argument('teacher_video', help='Path to the Teacher Video') + parser.add_argument('student_video', help='Path to the Student Video') + args = parser.parse_args() + + processor = VideoProcessor() + processor.run(args.teacher_video, args.student_video) diff --git a/projects/just_dance/utils.py b/projects/just_dance/utils.py new file mode 100644 index 0000000000..cd150bb1be --- /dev/null +++ b/projects/just_dance/utils.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from typing import Tuple + +import cv2 +import numpy as np + + +def resize_image_to_fixed_height(image: np.ndarray, + fixed_height: int) -> np.ndarray: + """Resizes an input image to a specified fixed height while maintaining its + aspect ratio. + + Args: + image (np.ndarray): Input image as a numpy array [H, W, C] + fixed_height (int): Desired fixed height of the output image. + + Returns: + Resized image as a numpy array (fixed_height, new_width, channels). + """ + original_height, original_width = image.shape[:2] + + scale_ratio = fixed_height / original_height + new_width = int(original_width * scale_ratio) + resized_image = cv2.resize(image, (new_width, fixed_height)) + + return resized_image + + +def blend_images(img1: np.ndarray, + img2: np.ndarray, + blend_ratios: Tuple[float, float] = (1, 1)) -> np.ndarray: + """Blends two input images with specified blend ratios. + + Args: + img1 (np.ndarray): First input image as a numpy array [H, W, C]. + img2 (np.ndarray): Second input image as a numpy array [H, W, C] + blend_ratios (tuple): A tuple of two floats representing the blend + ratios for the two input images. 
+ + Returns: + Blended image as a numpy array [H, W, C] + """ + + def normalize_image(image: np.ndarray) -> np.ndarray: + if image.dtype == np.uint8: + return image.astype(np.float32) / 255.0 + return image + + img1 = normalize_image(img1) + img2 = normalize_image(img2) + + blended_image = img1 * blend_ratios[0] + img2 * blend_ratios[1] + blended_image = blended_image.clip(min=0, max=1) + blended_image = (blended_image * 255).astype(np.uint8) + + return blended_image + + +def convert_video_fps(video): + + input_video = video + video_name, post_fix = input_video.rsplit('.', 1) + output_video = f'{video_name}_30fps.{post_fix}' + if os.path.exists(output_video): + return output_video + + os.system( + f"ffmpeg -i {input_video} -vf \"minterpolate='fps=30'\" {output_video}" + ) + + return output_video + + +def get_smoothed_kpt(kpts, index, sigma=5): + """Smooths keypoints using a Gaussian filter.""" + assert kpts.shape[1] == 17 + assert kpts.shape[2] == 3 + assert sigma % 2 == 1 + + num_kpts = len(kpts) + + start_idx = max(0, index - sigma // 2) + end_idx = min(num_kpts, index + sigma // 2 + 1) + + # Extract a piece of the keypoints array to apply the filter + piece = kpts[start_idx:end_idx].copy() + original_kpt = kpts[index] + + # Split the piece into coordinates and scores + coords, scores = piece[..., :2], piece[..., 2] + + # Calculate the Gaussian ratio for each keypoint + gaussian_ratio = np.arange(len(scores)) + start_idx - index + gaussian_ratio = np.exp(-gaussian_ratio**2 / 2) + + # Update scores using the Gaussian ratio + scores *= gaussian_ratio[:, None] + + # Compute the smoothed coordinates + smoothed_coords = (coords * scores[..., None]).sum(axis=0) / ( + scores[..., None].sum(axis=0) + 1e-4) + + original_kpt[..., :2] = smoothed_coords + + return original_kpt From d458942852ac87ce5919f37998df48fb20cb0334 Mon Sep 17 00:00:00 2001 From: Tau Date: Thu, 13 Jul 2023 18:00:12 +0800 Subject: [PATCH 04/37] [Docs] Add advanced tutorial of implement new model. (#2539) --- .../advanced_guides/implement_new_models.md | 80 +++++++++++++++++- docs/en/guide_to_framework.md | 38 +++++---- .../advanced_guides/implement_new_models.md | 81 ++++++++++++++++++- docs/zh_cn/guide_to_framework.md | 40 ++++----- 4 files changed, 196 insertions(+), 43 deletions(-) diff --git a/docs/en/advanced_guides/implement_new_models.md b/docs/en/advanced_guides/implement_new_models.md index 4a10b0c3c9..8d2809421e 100644 --- a/docs/en/advanced_guides/implement_new_models.md +++ b/docs/en/advanced_guides/implement_new_models.md @@ -1,3 +1,81 @@ # Implement New Models -Coming soon. +This tutorial will introduce how to implement your own models in MMPose. After summarizing, we split the need to implement new models into two categories: + +1. Based on the algorithm paradigm supported by MMPose, customize the modules (backbone, neck, head, codec, etc.) in the model +2. Implement new algorithm paradigm + +## Basic Concepts + +What you want to implement is one of the above, and this section is important to you because it is the basic principle of building models in the OpenMMLab. 
+ +In MMPose, all the code related to the implementation of the model structure is stored in the [models directory](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models): + +```shell +mmpose +|----models + |----backbones # + |----data_preprocessors # image normalization + |----heads # + |----losses # loss functions + |----necks # + |----pose_estimators # algorithm paradigm + |----utils # +``` + +You can refer to the following flow chart to locate the module you need to implement: + +![image](https://github.com/open-mmlab/mmpose/assets/13503330/f4eeb99c-e2a1-4907-9d46-f110c51f0814) + +## Pose Estimators + +A pose estimator defines the inference process of a model and decodes the model outputs in `predict()`: the results are first transformed from the `output space` to the `input image space` using the [codec](./codecs.md), and then mapped to the `original image space` with the meta information. + +![pose_estimator_en](https://github.com/open-mmlab/mmpose/assets/13503330/48c3813e-b977-4215-b5bc-e7379cfd2bce) + +Currently, MMPose supports the following types of pose estimator: + +1. [Top-down](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/topdown.py): The input of the pose model is a cropped image of a single target (animal, human body, human face, human hand, plant, clothes, etc.), and the output is the keypoint prediction result of the target +2. [Bottom-up](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/bottomup.py): The input of the pose model is an image containing any number of targets, and the output is the keypoint prediction results of all targets in the image +3. [Pose Lifting](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/pose_lifter.py): The input of the pose model is a 2D keypoint coordinate array, and the output is a 3D keypoint coordinate array + +If the model you want to implement does not belong to any of the above algorithm paradigms, you need to inherit the [BasePoseEstimator](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/base.py) class to define your own paradigm. + +## Backbones + +If you want to implement a new backbone network, you need to create a new file in the [backbones directory](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/backbones) to define it. + +The new backbone network needs to inherit the [BaseBackbone](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/backbones/base_backbone.py) class; apart from that, it is no different from creating a module by inheriting `nn.Module`. + +After completing the implementation of the backbone network, you need to use `MODELS` to register it: + +```Python3 +from mmpose.registry import MODELS +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YourNewBackbone(BaseBackbone): +``` + +Finally, please remember to import your new backbone network in [`__init__.py`](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/backbones/__init__.py). + +## Heads + +The addition of a new prediction head is similar to the backbone network process. You need to create a new file in the [heads directory](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/heads) to define it, and then inherit [BaseHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/base_head.py). + +One thing to note is that in MMPose, the loss function is calculated in the Head.
According to the different training and evaluation stages, `loss()` and `predict()` are executed respectively. + +In `predict()`, the model will call the `decode()` method of the corresponding codec to transform the model output result from `output space` to `input image space`. + +After completing the implementation of the prediction head, you need to use `MODELS` to register it: + +```Python3 +from mmpose.registry import MODELS +from ..base_head import BaseHead + +@MODELS.register_module() +class YourNewHead(BaseHead): +``` + +Finally, please remember to import your new prediction head in `[__init__.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/__init__.py)` . diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md index 1bfe7d3b59..bb1efed08f 100644 --- a/docs/en/guide_to_framework.md +++ b/docs/en/guide_to_framework.md @@ -69,9 +69,9 @@ The organization of data in MMPose contains: ### Dataset Meta Information -The meta information of a pose dataset usually includes the definition of keypoints and skeleton, symmetrical characteristic, and keypoint properties (e.g. belonging to upper or lower body, weights and sigmas). These information is important in data preprocessing, model training and evaluation. In MMpose, the dataset meta information is stored in configs files under `$MMPOSE/configs/_base_/datasets/`. +The meta information of a pose dataset usually includes the definition of keypoints and skeleton, symmetrical characteristic, and keypoint properties (e.g. belonging to upper or lower body, weights and sigmas). These information is important in data preprocessing, model training and evaluation. In MMpose, the dataset meta information is stored in configs files under [$MMPOSE/configs/_base_/datasets](https://github.com/open-mmlab/mmpose/tree/main/configs/_base_/datasets). -To use a custom dataset in MMPose, you need to add a new config file of the dataset meta information. Take the MPII dataset (`$MMPOSE/configs/_base_/datasets/mpii.py`) as an example. Here is its dataset information: +To use a custom dataset in MMPose, you need to add a new config file of the dataset meta information. Take the MPII dataset ([$MMPOSE/configs/_base_/datasets/mpii.py](https://github.com/open-mmlab/mmpose/blob/main/configs/_base_/datasets/mpii.py)) as an example. Here is its dataset information: ```Python dataset_info = dict( @@ -111,7 +111,7 @@ dataset_info = dict( ]) ``` -In the model config, the user needs to specify the metainfo path of the custom dataset (e.g. `$MMPOSE/configs/_base_/datasets/custom.py`) as follows:\`\`\` +In the model config, the user needs to specify the metainfo path of the custom dataset (e.g. `$MMPOSE/configs/_base_/datasets/custom.py`) as follows: ```python # dataset and dataloader settings @@ -148,17 +148,15 @@ test_dataloader = val_dataloader To use custom dataset in MMPose, we recommend converting the annotations into a supported format (e.g. COCO or MPII) and directly using our implementation of the corresponding dataset. If this is not applicable, you may need to implement your own dataset class. -Most 2D keypoint datasets in MMPose **organize the annotations in a COCO-like style**. Thus we provide a base class [BaseCocoStyleDataset](mmpose/datasets/datasets/base/base_coco_style_dataset.py) for these datasets. We recommend that users subclass `BaseCocoStyleDataset` and override the methods as needed (usually `__init__()` and `_load_annotations()`) to extend to a new custom 2D keypoint dataset. 
+Most 2D keypoint datasets in MMPose **organize the annotations in a COCO-like style**. Thus we provide a base class [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) for these datasets. We recommend that users subclass [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) and override the methods as needed (usually `__init__()` and `_load_annotations()`) to extend to a new custom 2D keypoint dataset. ```{note} Please refer to [COCO](./dataset_zoo/2d_body_keypoint.md) for more details about the COCO data format. ``` -```{note} -The bbox format in MMPose is in `xyxy` instead of `xywh`, which is consistent with the format used in other OpenMMLab projects like [MMDetection](https://github.com/open-mmlab/mmdetection). We provide useful utils for bbox format conversion, such as `bbox_xyxy2xywh`, `bbox_xywh2xyxy`, `bbox_xyxy2cs`, etc., which are defined in `$MMPOSE/mmpose/structures/bbox/transforms.py`. -``` +The bbox format in MMPose is in `xyxy` instead of `xywh`, which is consistent with the format used in other OpenMMLab projects like [MMDetection](https://github.com/open-mmlab/mmdetection). We provide useful utils for bbox format conversion, such as `bbox_xyxy2xywh`, `bbox_xywh2xyxy`, `bbox_xyxy2cs`, etc., which are defined in [$MMPOSE/mmpose/structures/bbox/transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/structures/bbox/transforms.py). -Let's take the implementation of the MPII dataset (`$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py`) as an example. +Let's take the implementation of the MPII dataset ([$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/body/mpii_dataset.py)) as an example. ```Python @DATASETS.register_module() @@ -264,7 +262,7 @@ class MpiiDataset(BaseCocoStyleDataset): When supporting MPII dataset, since we need to use `head_size` to calculate `PCKh`, we add `headbox_file` to `__init__()` and override`_load_annotations()`. -To support a dataset that is beyond the scope of `BaseCocoStyleDataset`, you may need to subclass from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to the [documents](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. +To support a dataset that is beyond the scope of [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py), you may need to subclass from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to the [documents](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. 
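For the common COCO-style case described earlier in this section, the minimal subclass can be as small as the sketch below, where the class name and metainfo path are placeholders and `_load_annotations()` only needs to be overridden when the raw annotation format deviates from COCO (as in the MPII example above):

```Python
# Minimal sketch of a custom COCO-style dataset; names and paths are placeholders.
from mmpose.datasets.datasets.base import BaseCocoStyleDataset
from mmpose.registry import DATASETS


@DATASETS.register_module()
class MyCustomDataset(BaseCocoStyleDataset):
    """A COCO-style keypoint dataset that reuses the default loading logic."""

    # meta information config of the custom dataset
    METAINFO: dict = dict(from_file='configs/_base_/datasets/custom.py')
```

Remember to import the new class in the datasets package's `__init__.py` (or add it via `custom_imports` in the config) so that the `DATASETS` registry can find it.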
### Pipeline @@ -302,13 +300,13 @@ Here is a diagram to show the workflow of data transformation among the three sc ![migration-en](https://user-images.githubusercontent.com/13503330/187190213-cad87b5f-0a95-4f1f-b722-15896914ded4.png) -In MMPose, the modules used for data transformation are under `$MMPOSE/mmpose/datasets/transforms`, and their workflow is shown as follows: +In MMPose, the modules used for data transformation are under `[$MMPOSE/mmpose/datasets/transforms](https://github.com/open-mmlab/mmpose/tree/main/mmpose/datasets/transforms)`, and their workflow is shown as follows: ![transforms-en](https://user-images.githubusercontent.com/13503330/187190352-a7662346-b8da-4256-9192-c7a84b15cbb5.png) #### i. Augmentation -Commonly used transforms are defined in `$MMPOSE/mmpose/datasets/transforms/common_transforms.py`, such as `RandomFlip`, `RandomHalfBody`, etc. +Commonly used transforms are defined in [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py), such as `RandomFlip`, `RandomHalfBody`, etc. For top-down methods, `Shift`, `Rotate`and `Resize` are implemented by `RandomBBoxTransform`**.** For bottom-up methods, `BottomupRandomAffine` is used. @@ -352,7 +350,7 @@ Note that we unify the data format of top-down and bottom-up methods, which mean - Bottom-up: `[B, N, K, D]` -The provided codecs are stored under `$MMPOSE/mmpose/codecs`. +The provided codecs are stored under [$MMPOSE/mmpose/codecs](https://github.com/open-mmlab/mmpose/tree/main/mmpose/codecs). ```{note} If you wish to customize a new codec, you can refer to [Codec](./user_guides/codecs.md) for more details. @@ -360,7 +358,7 @@ If you wish to customize a new codec, you can refer to [Codec](./user_guides/cod #### iv. Packing -After the data is transformed, you need to pack it using `PackPoseInputs`. +After the data is transformed, you need to pack it using [PackPoseInputs](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/formatting.py). This method converts the data stored in the dictionary `results` into standard data structures in MMPose, such as `InstanceData`, `PixelData`, `PoseDataSample`, etc. @@ -425,7 +423,7 @@ In MMPose 1.0, the model consists of the following components: - **Head**: used to implement the core algorithm and loss function -We define a base class `BasePoseEstimator` for the model in `$MMPOSE/models/pose_estimators/base.py`. All models, e.g. `TopdownPoseEstimator`, should inherit from this base class and override the corresponding methods. +We define a base class `BasePoseEstimator` for the model in [$MMPOSE/models/pose_estimators/base.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/base.py). All models, e.g. `TopdownPoseEstimator`, should inherit from this base class and override the corresponding methods. Three modes are provided in `forward()` of the estimator: @@ -477,7 +475,7 @@ It will transpose the channel order of the input image from `bgr` to `rgb` and n ### Backbone -MMPose provides some commonly used backbones under `$MMPOSE/mmpose/models/backbones`. +MMPose provides some commonly used backbones under [$MMPOSE/mmpose/models/backbones](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/backbones). In practice, developers often use pre-trained backbone weights for transfer learning, which can improve the performance of the model on small datasets. 
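For reference, a minimal configuration sketch of this (the backbone type and checkpoint path below are placeholders):

```Python
# Sketch: load pre-trained weights into the backbone through `init_cfg`.
model = dict(
    type='TopdownPoseEstimator',
    backbone=dict(
        type='HRNet',
        init_cfg=dict(
            type='Pretrained',
            # a local path or URL to the pre-trained weights
            checkpoint='PATH/TO/YOUR_MODEL_WEIGHTS.pth',
        ),
    ),
    # data_preprocessor, head, etc. omitted
)
```

If the checkpoint stores a whole pose model rather than a bare backbone, adding `prefix='backbone.'` to `init_cfg` keeps only the backbone weights.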
@@ -515,7 +513,7 @@ It should be emphasized that if you add a new backbone, you need to register it class YourBackbone(BaseBackbone): ``` -Besides, import it in `$MMPOSE/mmpose/models/backbones/__init__.py`, and add it to `__all__`. +Besides, import it in [$MMPOSE/mmpose/models/backbones/\_\_init\_\_.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/backbones/__init__.py), and add it to `__all__`. ### Neck @@ -559,7 +557,7 @@ Neck is usually a module between Backbone and Head, which is used in some algori Generally speaking, Head is often the core of an algorithm, which is used to make predictions and perform loss calculation. -Modules related to Head in MMPose are defined under `$MMPOSE/mmpose/models/heads`, and developers need to inherit the base class `BaseHead` when customizing Head and override the following methods: +Modules related to Head in MMPose are defined under [$MMPOSE/mmpose/models/heads](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/heads), and developers need to inherit the base class `BaseHead` when customizing Head and override the following methods: - forward() @@ -567,13 +565,13 @@ Modules related to Head in MMPose are defined under `$MMPOSE/mmpose/models/heads - loss() -Specifically, `predict()` method needs to return pose predictions in the image space, which is obtained from the model output though the decoding function provided by the codec. We implement this process in `BaseHead.decode()`. +Specifically, `predict()` method needs to return pose predictions in the image space, which is obtained from the model output though the decoding function provided by the codec. We implement this process in [BaseHead.decode()](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/base_head.py). On the other hand, we will perform test-time augmentation(TTA) in `predict()`. A commonly used TTA is `flip_test`, namely, an image and its flipped version are sent into the model to inference, and the output of the flipped version will be flipped back, then average them to stabilize the prediction. -Here is an example of `predict()` in `RegressionHead`: +Here is an example of `predict()` in [RegressionHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/regression_heads/regression_head.py): ```Python def predict(self, @@ -627,7 +625,7 @@ keypoint_weights = torch.cat([ ]) ``` -Here is the complete implementation of `loss()` in `RegressionHead`: +Here is the complete implementation of `loss()` in [RegressionHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/regression_heads/regression_head.py): ```Python def loss(self, diff --git a/docs/zh_cn/advanced_guides/implement_new_models.md b/docs/zh_cn/advanced_guides/implement_new_models.md index 4a10b0c3c9..e233850277 100644 --- a/docs/zh_cn/advanced_guides/implement_new_models.md +++ b/docs/zh_cn/advanced_guides/implement_new_models.md @@ -1,3 +1,80 @@ -# Implement New Models +# 实现新模型 -Coming soon. +本教程将介绍如何在 MMPose 中实现你自己的模型。我们经过总结,将实现新模型这一需求拆分为两类: + +1. 基于 MMPose 中已支持的算法范式,对模型中的模块(骨干网络、颈部、预测头、编解码器等)进行自定义 +2. 
实现新的算法范式 + +## 基础知识 + +不论你想实现的模型是以上哪一种,这一节的内容都对你很重要,因为它是 OpenMMLab 系列算法库构建模型的基本原则。 +在 MMPose 中,所有与模型结构实现相关的代码都存放在 [models 目录](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models)下: + +```shell +mmpose +|----models + |----backbones # 骨干网络 + |----data_preprocessors # 数据预处理,如:图片归一化 + |----heads # 预测头 + |----losses # 损失函数 + |----necks # 颈部 + |----pose_estimators # 姿态估计算法范式 + |----utils # 工具方法 +``` + +你可以参考以下流程图来定位你所需要实现的模块: + +![image](https://github.com/open-mmlab/mmpose/assets/13503330/f4eeb99c-e2a1-4907-9d46-f110c51f0814) + +## 姿态估计算法范式 + +在姿态估计范式中,我们会定义一个模型的推理流程,并在 `predict()` 中对模型输出结果进行解码,先将其从 `输出尺度空间` 用 [编解码器](./codecs.md) 变换到 `输入图片空间`,然后再结合元信息变换到 `原始图片空间`。 + +![image](https://github.com/open-mmlab/mmpose/assets/13503330/e3e700ac-a047-4cff-9017-67f83676b8cb) + +当前 MMPose 已支持以下几类算法范式: + +1. [Top-down](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/topdown.py):Pose 模型的输入为经过裁剪的单个目标(动物、人体、人脸、人手、植物、衣服等)图片,输出为这个目标的关键点预测结果 +2. [Bottom-up](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/bottomup.py):Pose 模型的输入为包含任意个目标的图片,输出为图片中所有目标的关键点预测结果 +3. [Pose Lifting](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/pose_lifter.py):Pose 模型的输入为 2D 关键点坐标数组,输出为 3D 关键点坐标数组 + +如果你要实现的模型不属于以上算法范式,那么你需要继承 [BasePoseEstimator](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/base.py) 类来定义你自己的算法范式。 + +## 骨干网络 + +如果希望实现一个新的骨干网络,你需要在 [backbones 目录](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/backbones) 下新建一个文件进行定义。 + +新建的骨干网络需要继承 [BaseBackbone](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/backbones/base_backbone.py) 类,其他方面与你继承 nn.Module 来创建没有任何不同。 + +在完成骨干网络的实现后,你需要使用 `MODELS` 来对其进行注册: + +```Python3 +from mmpose.registry import MODELS +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YourNewBackbone(BaseBackbone): +``` + +最后,请记得在 [backbones/\_\_init\_\_.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/backbones/__init__.py) 中导入你的新骨干网络。 + +## 预测头部 + +新的预测头部的加入与骨干网络流程类似,你需要在 [heads 目录](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/heads) 下新建一个文件进行定义,然后继承 [BaseHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/base_head.py)。 + +需要特别注意的一点是,在 MMPose 中会在 Head 里进行损失函数的计算。根据训练与评测阶段的不同,分别执行 `loss()` 和 `predict()`。 + +在 `predict()` 中,模型会调用对应编解码器的 `decode()` 方法,将模型输出的结果从 `输出尺度空间` 转换到 `输入图片空间` 。 + +在完成预测头部的实现后,你需要使用 `MODELS` 来对其进行注册: + +```Python3 +from mmpose.registry import MODELS +from ..base_head import BaseHead + +@MODELS.register_module() +class YourNewHead(BaseHead): +``` + +最后,请记得在 [heads/\_\_init\_\_.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/__init__.py) 中导入你的新预测头部。 diff --git a/docs/zh_cn/guide_to_framework.md b/docs/zh_cn/guide_to_framework.md index 349abf2358..132233fd5d 100644 --- a/docs/zh_cn/guide_to_framework.md +++ b/docs/zh_cn/guide_to_framework.md @@ -71,9 +71,9 @@ MMPose 数据的组织主要包含三个方面: ### 数据集元信息 -元信息指具体标注之外的数据集信息。姿态估计数据集的元信息通常包括:关键点和骨骼连接的定义、对称性、关键点性质(如关键点权重、标注标准差、所属上下半身)等。这些信息在数据在数据处理、模型训练和测试中有重要作用。在 MMPose 中,数据集的元信息使用 python 格式的配置文件保存,位于 `$MMPOSE/configs/_base_/datasets` 目录下。 +元信息指具体标注之外的数据集信息。姿态估计数据集的元信息通常包括:关键点和骨骼连接的定义、对称性、关键点性质(如关键点权重、标注标准差、所属上下半身)等。这些信息在数据在数据处理、模型训练和测试中有重要作用。在 MMPose 中,数据集的元信息使用 python 格式的配置文件保存,位于 [$MMPOSE/configs/_base_/datasets](https://github.com/open-mmlab/mmpose/tree/main/configs/_base_/datasets) 目录下。 -在 MMPose 中使用自定义数据集时,你需要增加对应的元信息配置文件。以 MPII 数据集(`$MMPOSE/configs/_base_/datasets/mpii.py`)为例: +在 MMPose 
中使用自定义数据集时,你需要增加对应的元信息配置文件。以 MPII 数据集([$MMPOSE/configs/_base_/datasets/mpii.py](https://github.com/open-mmlab/mmpose/blob/main/configs/_base_/datasets/mpii.py))为例: ```Python dataset_info = dict( @@ -147,17 +147,15 @@ test_dataloader = val_dataloader 在 MMPose 中使用自定义数据集时,我们推荐将数据转化为已支持的格式(如 COCO 或 MPII),并直接使用我们提供的对应数据集实现。如果这种方式不可行,则用户需要实现自己的数据集类。 -MMPose 中的大部分 2D 关键点数据集**以 COCO 形式组织**,为此我们提供了基类 [BaseCocoStyleDataset](/mmpose/datasets/datasets/base/base_coco_style_dataset.py)。我们推荐用户继承该基类,并按需重写它的方法(通常是 `__init__()` 和 `_load_annotations()` 方法),以扩展到新的 2D 关键点数据集。 +MMPose 中的大部分 2D 关键点数据集**以 COCO 形式组织**,为此我们提供了基类 [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py)。我们推荐用户继承该基类,并按需重写它的方法(通常是 `__init__()` 和 `_load_annotations()` 方法),以扩展到新的 2D 关键点数据集。 ```{note} 关于COCO数据格式的详细说明请参考 [COCO](./dataset_zoo/2d_body_keypoint.md) 。 ``` -```{note} -在 MMPose 中 bbox 的数据格式采用 `xyxy`,而不是 `xywh`,这与 [MMDetection](https://github.com/open-mmlab/mmdetection) 等其他 OpenMMLab 成员保持一致。为了实现不同 bbox 格式之间的转换,我们提供了丰富的函数:`bbox_xyxy2xywh`、`bbox_xywh2xyxy`、`bbox_xyxy2cs`等。这些函数定义在`$MMPOSE/mmpose/structures/bbox/transforms.py`。 -``` +在 MMPose 中 bbox 的数据格式采用 `xyxy`,而不是 `xywh`,这与 [MMDetection](https://github.com/open-mmlab/mmdetection) 等其他 OpenMMLab 成员保持一致。为了实现不同 bbox 格式之间的转换,我们提供了丰富的函数:`bbox_xyxy2xywh`、`bbox_xywh2xyxy`、`bbox_xyxy2cs`等。这些函数定义在 [$MMPOSE/mmpose/structures/bbox/transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/structures/bbox/transforms.py)。 -下面我们以MPII数据集的实现(`$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py`)为例: +下面我们以MPII数据集的实现([$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/body/mpii_dataset.py))为例: ```Python @DATASETS.register_module() @@ -260,9 +258,9 @@ class MpiiDataset(BaseCocoStyleDataset): return data_list ``` -在对MPII数据集进行支持时,由于MPII需要读入 `head_size` 信息来计算 `PCKh`,因此我们在`__init__()`中增加了 `headbox_file`,并重载了 `_load_annotations()` 来完成数据组织。 +在对MPII数据集进行支持时,由于MPII需要读入 `head_size` 信息来计算 `PCKh`,因此我们在 `__init__()` 中增加了 `headbox_file`,并重载了 `_load_annotations()` 来完成数据组织。 -如果自定义数据集无法被 `BaseCocoStyleDataset` 支持,你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html)。 +如果自定义数据集无法被 [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) 支持,你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html)。 ### 数据流水线 @@ -300,13 +298,13 @@ test_pipeline = [ ![migration-cn](https://user-images.githubusercontent.com/13503330/187831574-13804daf-f498-47c2-ba43-64b8e6ffe3dd.png) -在MMPose中,数据变换所需要的模块在`$MMPOSE/mmpose/datasets/transforms`目录下,它们的工作流程如图所示: +在MMPose中,数据变换所需要的模块在 `[$MMPOSE/mmpose/datasets/transforms](https://github.com/open-mmlab/mmpose/tree/main/mmpose/datasets/transforms)` 目录下,它们的工作流程如图所示: ![transforms-cn](https://user-images.githubusercontent.com/13503330/187831611-8db89e20-95c7-42bc-8b0d-700fadf60328.png) #### i. 
数据增强 -数据增强中常用的变换存放在 `$MMPOSE/mmpose/datasets/transforms/common_transforms.py` 中,如 `RandomFlip`、`RandomHalfBody` 等。 +数据增强中常用的变换存放在 [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py) 中,如 `RandomFlip`、`RandomHalfBody` 等。 对于 top-down 方法,`Shift`、`Rotate`、`Resize` 操作由 `RandomBBoxTransform`来实现;对于 bottom-up 方法,这些则是由 `BottomupRandomAffine` 实现。 @@ -372,11 +370,11 @@ class GenerateTarget(BaseTransform): - Bottom-up: `[B, N, K, D]` -当前已经支持的编解码器定义在 `$MMPOSE/mmpose/codecs` 目录下,如果你需要自定新的编解码器,可以前往[编解码器](./user_guides/codecs.md)了解更多详情。 +当前已经支持的编解码器定义在 [$MMPOSE/mmpose/codecs](https://github.com/open-mmlab/mmpose/tree/main/mmpose/codecs) 目录下,如果你需要自定新的编解码器,可以前往[编解码器](./user_guides/codecs.md)了解更多详情。 #### iv. 数据打包 -数据经过前处理变换后,最终需要通过 `PackPoseInputs` 打包成数据样本。该操作定义在 `$MMPOSE/mmpose/datasets/transforms/formatting.py` 中。 +数据经过前处理变换后,最终需要通过 `PackPoseInputs` 打包成数据样本。该操作定义在 [$MMPOSE/mmpose/datasets/transforms/formatting.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/formatting.py) 中。 打包过程会将数据流水线中用字典 `results` 存储的数据转换成用 MMPose 所需的标准数据结构, 如 `InstanceData`,`PixelData`,`PoseDataSample` 等。 @@ -443,7 +441,7 @@ def get_pose_data_sample(self): - **预测头(Head)**:用于实现核心算法功能和损失函数定义 -我们在 `$MMPOSE/models/pose_estimators/base.py` 下为姿态估计模型定义了一个基类 `BasePoseEstimator`,所有的模型(如 `TopdownPoseEstimator`)都需要继承这个基类,并重载对应的方法。 +我们在 [$MMPOSE/models/pose_estimators/base.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/base.py) 下为姿态估计模型定义了一个基类 `BasePoseEstimator`,所有的模型(如 `TopdownPoseEstimator`)都需要继承这个基类,并重载对应的方法。 在模型的 `forward()` 方法中提供了三种不同的模式: @@ -495,7 +493,7 @@ data_preprocessor=dict( ### 主干网络(Backbone) -MMPose 实现的主干网络存放在 `$MMPOSE/mmpose/models/backbones` 目录下。 +MMPose 实现的主干网络存放在 [$MMPOSE/mmpose/models/backbones](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/backbones) 目录下。 在实际开发中,开发者经常会使用预训练的网络权重进行迁移学习,这能有效提升模型在小数据集上的性能。 在 MMPose 中,只需要在配置文件 `backbone` 的 `init_cfg` 中设置: @@ -531,10 +529,12 @@ init_cfg=dict( class YourBackbone(BaseBackbone): ``` -同时在 `$MMPOSE/mmpose/models/backbones/__init__.py` 下进行 `import`,并加入到 `__all__` 中,才能被配置文件正确地调用。 +同时在 [$MMPOSE/mmpose/models/backbones/\_\_init\_\_.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/backbones/__init__.py) 下进行 `import`,并加入到 `__all__` 中,才能被配置文件正确地调用。 ### 颈部模块(Neck) +MMPose 中 Neck 相关的模块定义在 [$MMPOSE/mmpose/models/necks](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/necks) 目录下. 
+ 颈部模块通常是介于主干网络和预测头之间的模块,在部分模型算法中会用到,常见的颈部模块有: - Global Average Pooling (GAP) @@ -575,7 +575,7 @@ class YourBackbone(BaseBackbone): 通常来说,预测头是模型算法实现的核心,用于控制模型的输出,并进行损失函数计算。 -MMPose 中 Head 相关的模块定义在 `$MMPOSE/mmpose/models/heads` 目录下,开发者在自定义预测头时需要继承我们提供的基类 `BaseHead`,并重载以下三个方法对应模型推理的三种模式: +MMPose 中 Head 相关的模块定义在 [$MMPOSE/mmpose/models/heads](https://github.com/open-mmlab/mmpose/tree/main/mmpose/models/heads) 目录下,开发者在自定义预测头时需要继承我们提供的基类 [BaseHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/base_head.py),并重载以下三个方法对应模型推理的三种模式: - forward() @@ -583,11 +583,11 @@ MMPose 中 Head 相关的模块定义在 `$MMPOSE/mmpose/models/heads` 目录下 - loss() -具体而言,`predict()` 返回的应是输入图片尺度下的结果,因此需要调用 `self.decode()` 对网络输出进行解码,这一过程实现在 `BaseHead` 中已经实现,它会调用编解码器提供的 `decode()` 方法来完成解码。 +具体而言,`predict()` 返回的应是输入图片尺度下的结果,因此需要调用 `self.decode()` 对网络输出进行解码,这一过程实现在 [BaseHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/base_head.py) 中已经实现,它会调用编解码器提供的 `decode()` 方法来完成解码。 另一方面,我们会在 `predict()` 中进行测试时增强。在进行预测时,一个常见的测试时增强技巧是进行翻转集成。即,将一张图片先进行一次推理,再将图片水平翻转进行一次推理,推理的结果再次水平翻转回去,对两次推理的结果进行平均。这个技巧能有效提升模型的预测稳定性。 -下面是在 `RegressionHead` 中定义 `predict()` 的例子: +下面是在 [RegressionHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/regression_heads/regression_head.py) 中定义 `predict()` 的例子: ```Python def predict(self, @@ -641,7 +641,7 @@ keypoint_weights = torch.cat([ ]) ``` -以下为 `RegressionHead` 中完整的 `loss()` 实现: +以下为 [RegressionHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/regression_heads/regression_head.py) 中完整的 `loss()` 实现: ```Python def loss(self, From 6a23e2c23425a8cce39271110bb2b46156c4f2ec Mon Sep 17 00:00:00 2001 From: Tau Date: Fri, 14 Jul 2023 11:08:27 +0800 Subject: [PATCH 05/37] [Doc] Update img (#2541) --- docs/en/advanced_guides/codecs.md | 2 +- docs/en/advanced_guides/implement_new_models.md | 2 +- docs/en/guide_to_framework.md | 2 +- docs/zh_cn/advanced_guides/codecs.md | 2 +- docs/zh_cn/advanced_guides/implement_new_models.md | 2 +- docs/zh_cn/guide_to_framework.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/advanced_guides/codecs.md b/docs/en/advanced_guides/codecs.md index 610bd83a57..7c98ba31d9 100644 --- a/docs/en/advanced_guides/codecs.md +++ b/docs/en/advanced_guides/codecs.md @@ -8,7 +8,7 @@ MMPose 1.0 introduced a new module **Codec** to integrate the encoding and decod Here is a diagram to show where the `Codec` is: -![codec-en](https://user-images.githubusercontent.com/13503330/187112635-c01f13d1-a07e-420f-be50-3b8818524dec.png) +![pose_estimator_en](https://github.com/open-mmlab/mmpose/assets/13503330/0764baab-41c7-4a1d-ab64-5d7f9dfc8eec) A typical codec consists of two parts: diff --git a/docs/en/advanced_guides/implement_new_models.md b/docs/en/advanced_guides/implement_new_models.md index 8d2809421e..da46a99e39 100644 --- a/docs/en/advanced_guides/implement_new_models.md +++ b/docs/en/advanced_guides/implement_new_models.md @@ -31,7 +31,7 @@ You can refer to the following flow chart to locate the module you need to imple In pose estimatiors, we will define the inference process of a model, and decode the model output results in `predict()`, first transform it from `output space` to `input image space` using the [codec](./codecs.md), and then combine the meta information to transform to `original image space`. 
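As a rough sketch of those two steps for a top-down model (all shapes and numbers below are illustrative, not an exact reproduction of the implementation): the codec's `decode()` maps model outputs to pixel coordinates in the network input, and the bounding-box meta information then maps them back to the original image:

```Python
import numpy as np

# output space -> input image space: handled by the codec's decode()
normalized_coords = np.random.rand(17, 2)       # toy model output in [0, 1]
input_size = np.array([192, 256])               # (w, h) of the model input
kpts_input = normalized_coords * input_size     # pixel coords in the input image

# input image space -> original image space: handled with bbox meta information
bbox_center = np.array([320.0, 240.0])          # bbox center in the original image
bbox_scale = np.array([384.0, 512.0])           # bbox size (w, h) after padding
kpts_original = (kpts_input / input_size - 0.5) * bbox_scale + bbox_center
```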
-![pose_estimator_en](https://github.com/open-mmlab/mmpose/assets/13503330/48c3813e-b977-4215-b5bc-e7379cfd2bce) +![pose_estimator_en](https://github.com/open-mmlab/mmpose/assets/13503330/0764baab-41c7-4a1d-ab64-5d7f9dfc8eec) Currently, MMPose supports the following types of pose estimator: diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md index bb1efed08f..fceb37a142 100644 --- a/docs/en/guide_to_framework.md +++ b/docs/en/guide_to_framework.md @@ -298,7 +298,7 @@ In a keypoint detection task, data will be transformed among three scale spaces: Here is a diagram to show the workflow of data transformation among the three scale spaces: -![migration-en](https://user-images.githubusercontent.com/13503330/187190213-cad87b5f-0a95-4f1f-b722-15896914ded4.png) +![tour_en](https://github.com/open-mmlab/mmpose/assets/13503330/e82710e6-4181-4eb0-8185-7075b43dbec3) In MMPose, the modules used for data transformation are under `[$MMPOSE/mmpose/datasets/transforms](https://github.com/open-mmlab/mmpose/tree/main/mmpose/datasets/transforms)`, and their workflow is shown as follows: diff --git a/docs/zh_cn/advanced_guides/codecs.md b/docs/zh_cn/advanced_guides/codecs.md index 85d4d2e54b..60c588a239 100644 --- a/docs/zh_cn/advanced_guides/codecs.md +++ b/docs/zh_cn/advanced_guides/codecs.md @@ -8,7 +8,7 @@ MMPose 1.0 中引入了新模块 **编解码器(Codec)** ,将关键点数 编解码器在工作流程中所处的位置如下所示: -![codec-cn](https://user-images.githubusercontent.com/13503330/187829784-4d5939de-97d7-43cc-b934-c6d17c02d589.png) +![pose_estimator_cn](https://github.com/open-mmlab/mmpose/assets/13503330/0c048f66-b889-4268-937f-71b8753b505f) 一个编解码器主要包含两个部分: diff --git a/docs/zh_cn/advanced_guides/implement_new_models.md b/docs/zh_cn/advanced_guides/implement_new_models.md index e233850277..d3ed96bd37 100644 --- a/docs/zh_cn/advanced_guides/implement_new_models.md +++ b/docs/zh_cn/advanced_guides/implement_new_models.md @@ -30,7 +30,7 @@ mmpose 在姿态估计范式中,我们会定义一个模型的推理流程,并在 `predict()` 中对模型输出结果进行解码,先将其从 `输出尺度空间` 用 [编解码器](./codecs.md) 变换到 `输入图片空间`,然后再结合元信息变换到 `原始图片空间`。 -![image](https://github.com/open-mmlab/mmpose/assets/13503330/e3e700ac-a047-4cff-9017-67f83676b8cb) +![pose_estimator_cn](https://github.com/open-mmlab/mmpose/assets/13503330/0c048f66-b889-4268-937f-71b8753b505f) 当前 MMPose 已支持以下几类算法范式: diff --git a/docs/zh_cn/guide_to_framework.md b/docs/zh_cn/guide_to_framework.md index 132233fd5d..b4c44192a5 100644 --- a/docs/zh_cn/guide_to_framework.md +++ b/docs/zh_cn/guide_to_framework.md @@ -296,7 +296,7 @@ test_pipeline = [ 数据在三个空间中变换的流程如图所示: -![migration-cn](https://user-images.githubusercontent.com/13503330/187831574-13804daf-f498-47c2-ba43-64b8e6ffe3dd.png) +![tour_cn](https://github.com/open-mmlab/mmpose/assets/13503330/4c989d86-e824-49ea-9ba8-b3978548db37) 在MMPose中,数据变换所需要的模块在 `[$MMPOSE/mmpose/datasets/transforms](https://github.com/open-mmlab/mmpose/tree/main/mmpose/datasets/transforms)` 目录下,它们的工作流程如图所示: From b5bb116a2539fc92548cfb1a4694e9f334042231 Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Fri, 14 Jul 2023 17:20:29 +0800 Subject: [PATCH 06/37] [Feature] Support MotionBERT (#2482) --- configs/body_3d_keypoint/pose_lift/README.md | 36 +-- .../pose_lift/h36m/motionbert_h36m.md | 53 +++ .../pose_lift/h36m/motionbert_h36m.yml | 34 ++ ...-lift_motionbert-243frm_8xb32-120e_h36m.py | 140 ++++++++ ...se3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py} | 2 +- ...deopose3d-243frm-supv_8xb128-160e_h36m.py} | 2 +- ...ideopose3d-27frm-supv_8xb128-120e_h36m.py} | 2 +- ...ideopose3d-81frm-supv_8xb128-160e_h36m.py} | 2 +- 
.../pose_lift/h36m/videopose3d_h36m.md | 18 +- mmpose/apis/inference_3d.py | 6 +- mmpose/apis/inferencers/pose3d_inferencer.py | 4 +- mmpose/codecs/__init__.py | 4 +- mmpose/codecs/image_pose_lifting.py | 83 +++-- mmpose/codecs/motionbert_label.py | 218 +++++++++++++ mmpose/codecs/utils/__init__.py | 4 +- .../codecs/utils/camera_image_projection.py | 69 ++++ mmpose/codecs/video_pose_lifting.py | 51 ++- .../datasets/base/base_mocap_dataset.py | 27 +- .../datasets/datasets/body3d/h36m_dataset.py | 98 ++++-- mmpose/datasets/transforms/formatting.py | 6 +- .../datasets/transforms/pose3d_transforms.py | 18 +- .../evaluation/metrics/keypoint_3d_metrics.py | 40 ++- mmpose/models/backbones/__init__.py | 3 +- mmpose/models/backbones/dstformer.py | 304 ++++++++++++++++++ mmpose/models/heads/__init__.py | 5 +- .../models/heads/regression_heads/__init__.py | 10 +- .../motion_regression_head.py | 176 ++++++++++ .../temporal_regression_head.py | 2 +- .../trajectory_regression_head.py | 2 +- mmpose/models/losses/regression_loss.py | 78 +++++ tests/test_codecs/test_image_pose_lifting.py | 67 ++-- tests/test_codecs/test_motionbert_label.py | 159 +++++++++ tests/test_codecs/test_video_pose_lifting.py | 109 +++++-- .../test_body_datasets/test_h36m_dataset.py | 11 + .../test_transforms/test_pose3d_transforms.py | 39 ++- .../test_metrics/test_keypoint_3d_metrics.py | 11 +- .../test_backbones/test_dstformer.py | 36 +++ 37 files changed, 1738 insertions(+), 191 deletions(-) create mode 100644 configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md create mode 100644 configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml create mode 100644 configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py => pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py} (98%) rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py => pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py} (98%) rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py => pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py} (98%) rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py => pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py} (98%) create mode 100644 mmpose/codecs/motionbert_label.py create mode 100644 mmpose/codecs/utils/camera_image_projection.py create mode 100644 mmpose/models/backbones/dstformer.py create mode 100644 mmpose/models/heads/regression_heads/motion_regression_head.py create mode 100644 tests/test_codecs/test_motionbert_label.py create mode 100644 tests/test_models/test_backbones/test_dstformer.py diff --git a/configs/body_3d_keypoint/pose_lift/README.md b/configs/body_3d_keypoint/pose_lift/README.md index 7e5f9f7e2a..e3e6ff7176 100644 --- a/configs/body_3d_keypoint/pose_lift/README.md +++ b/configs/body_3d_keypoint/pose_lift/README.md @@ -16,23 +16,19 @@ For single-person 3D pose estimation from a monocular camera, existing works can #### Human3.6m Dataset -| Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | - -| :------------------------------------------------------ | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------: | :-----------------------------------------------------: | - -| 
[VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | - -| [VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | - -| [VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | | | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | - -| [VideoPose3D-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | - -| [VideoPose3D-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | | | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | - -| [VideoPose3D-semi-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | - -| [VideoPose3D-semi-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | +| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | Details and Download | +| :-------------------------------------------- | :---: | :-----: | :-----: | :-------------------------------------------: | :------------------------------------------: | :---------------------------------------------: | +| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 40.1 | 30.1 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | 
[log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 39.1 | 29.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 37.6 | 28.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 53.0 | 41.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 47.9 | 38.0 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 27.7 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 21.6 | / | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | + +*Models with * are converted from the official repo. The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* ## Image-based Single-view 3D Human Body Pose Estimation @@ -46,6 +42,6 @@ For single-person 3D pose estimation from a monocular camera, existing works can #### Human3.6m Dataset -| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | -| :------------------------------------------------------ | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------: | :-----------------------------------------------------: | -| [SimpleBaseline3D-tcn](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_simplebaseline3d_8xb64-200e_h36m.py) | 43.4 | 34.3 | /|[ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) | +| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | Details and Download | +| :---------------------------------------- | :---: | :-----: | :-----: | :---------------------------------------: | :---------------------------------------: | :--------------------------------------------------------: | +| [SimpleBaseline3D-tcn](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_simplebaseline3d_8xb64-200e_h36m.py) | 43.4 | 34.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) | [simplebaseline3d_h36m.md](./h36m/simplebaseline3d_h36m.md) | diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md new file mode 100644 index 0000000000..d830d65c18 --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md @@ -0,0 +1,53 @@ + + +
+MotionBERT (2022) + +```bibtex + @misc{Zhu_Ma_Liu_Liu_Wu_Wang_2022, + title={Learning Human Motion Representations: A Unified Perspective}, + author={Zhu, Wentao and Ma, Xiaoxuan and Liu, Zhaoyang and Liu, Libin and Wu, Wayne and Wang, Yizhou}, + year={2022}, + month={Oct}, + language={en-US} + } +``` + +
+ + + +
+Human3.6M (TPAMI'2014) + +```bibtex +@article{h36m_pami, +author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian}, +title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}, +journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, +publisher = {IEEE Computer Society}, +volume = {36}, +number = {7}, +pages = {1325-1339}, +month = {jul}, +year = {2014} +} +``` + +
+ +Testing results on Human3.6M dataset with ground truth 2D detections + +| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | +| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 35.3 | 27.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 27.4 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | + +Testing results on Human3.6M dataset from the [official repo](https://github.com/Walter0807/MotionBERT) with ground truth 2D detections + +| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | +| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 40.5 | 39.9 | 34.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 38.2 | 37.7 | 32.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | + +*Models with * are converted from the [official repo](https://github.com/Walter0807/MotionBERT). The config files of these models are only for validation. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml new file mode 100644 index 0000000000..7257fea5a6 --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml @@ -0,0 +1,34 @@ +Collections: +- Name: MotionBERT + Paper: + Title: "Learning Human Motion Representations: A Unified Perspective" + URL: https://arxiv.org/abs/2210.06551 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/motionbert.md +Models: +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert_8xb32-120e_h36m.py + In Collection: MotionBERT + Metadata: + Architecture: &id001 + - MotionBERT + Training Data: Human3.6M + Name: vid_pl_motionbert_8xb32-120e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 35.3 + P-MPJPE: 27.7 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert_8xb32-120e_h36m.py + In Collection: MotionBERT + Metadata: + Architecture: *id001 + Training Data: Human3.6M + Name: vid_pl_motionbert-finetuned_8xb32-120e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 27.5 + P-MPJPE: 21.6 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py new file mode 100644 index 0000000000..88f6c3897d --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py @@ -0,0 +1,140 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=120, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', + num_keypoints=17, + concat_vis=True, + rootrel=True, + factor_label=False) +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), +) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg={}, + target_flip_cfg={}, + flip_image=True), + dict(type='GenerateTarget', encoder=train_codec), + dict( + 
type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + multiple_target=243, + multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py index 0cbf89142d..c1190fe83e 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=80, val_interval=10) +train_cfg = dict(max_epochs=160, val_interval=10) # optimizer optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-4)) diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py index 0f311ac5cf..0d241c498f 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=80, val_interval=10) +train_cfg = dict(max_epochs=160, val_interval=10) # optimizer optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) diff --git 
a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py index 2589b493a6..803f907b7b 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=80, val_interval=10) +train_cfg = dict(max_epochs=160, val_interval=10) # optimizer optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py index f2c27e423d..4b370fe76e 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=80, val_interval=10) +train_cfg = dict(max_epochs=160, val_interval=10) # optimizer optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) diff --git a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md index f1c75d786a..48502c7b09 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md +++ b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md @@ -41,27 +41,27 @@ Testing results on Human3.6M dataset with ground truth 2D detections, supervised | Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log | | :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | | | 
[ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | 37.6 | 28.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | Testing results on Human3.6M dataset with CPN 2D detections1, supervised training | Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log | | :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | -| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | | | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | +| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | +| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | 47.9 | 38.0 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | Testing results on Human3.6M dataset with 
ground truth 2D detections, semi-supervised training | Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | | :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: | -| 10% S1 | [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | +| 10% S1 | [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | Testing results on Human3.6M dataset with CPN 2D detections1, semi-supervised training -| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | -| :------------ | :----------------------------: | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | -| 10% S1 | [VideoPose3D](/configs/xxx.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | +| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | +| :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: | +| 10% S1 | [VideoPose3D-semi-supervised-CPN-27frm](/configs/xxx.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | 1 CPN 2D detections are provided by [official repo](https://github.com/facebookresearch/VideoPose3D/blob/master/DATASETS.md). The reformatted version used in this repository can be downloaded from [train_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_train.npy) and [test_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_test.npy). 
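For reference, the config/checkpoint pairs listed in the tables above can be combined directly when building a pose lifter in Python. The sketch below is illustrative only and is not part of this patch: it assumes `mmpose.apis.init_model` accepts a config path and a checkpoint URL (as in current MMPose releases) and uses the 243-frame supervised entry from the first table; the actual 2D-to-3D lifting goes through `inference_pose_lifter_model`, whose input placeholders are updated in the first hunk below.

```python
# Illustrative sketch (not part of this patch): build a VideoPose3D lifter from
# one of the config/checkpoint pairs listed above. Assumes mmpose.apis.init_model
# is available; lifting 2D keypoint sequences would then go through
# inference_pose_lifter_model (see the inference_3d.py hunk below).
from mmpose.apis import init_model

config = ('configs/body_3d_keypoint/pose_lift/h36m/'
          'pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py')
checkpoint = ('https://download.openmmlab.com/mmpose/body3d/videopose/'
              'videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth')

# Loads the architecture from the config and the weights from the URL.
lifter = init_model(config, checkpoint, device='cpu')
print(type(lifter).__name__)  # expected to be a pose-lifter style model
```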
diff --git a/mmpose/apis/inference_3d.py b/mmpose/apis/inference_3d.py index d5bb753945..d4b9623b86 100644 --- a/mmpose/apis/inference_3d.py +++ b/mmpose/apis/inference_3d.py @@ -316,8 +316,10 @@ def inference_pose_lifter_model(model, T, K, ), dtype=np.float32) - data_info['lifting_target'] = np.zeros((K, 3), dtype=np.float32) - data_info['lifting_target_visible'] = np.ones((K, 1), dtype=np.float32) + data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32) + data_info['factor'] = np.zeros((T, ), dtype=np.float32) + data_info['lifting_target_visible'] = np.ones((1, K, 1), + dtype=np.float32) if image_size is not None: assert len(image_size) == 2 diff --git a/mmpose/apis/inferencers/pose3d_inferencer.py b/mmpose/apis/inferencers/pose3d_inferencer.py index 0fe66ac72b..819273af66 100644 --- a/mmpose/apis/inferencers/pose3d_inferencer.py +++ b/mmpose/apis/inferencers/pose3d_inferencer.py @@ -271,8 +271,8 @@ def preprocess_single(self, K, ), dtype=np.float32) - data_info['lifting_target'] = np.zeros((K, 3), dtype=np.float32) - data_info['lifting_target_visible'] = np.ones((K, 1), + data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32) + data_info['lifting_target_visible'] = np.ones((1, K, 1), dtype=np.float32) data_info['camera_param'] = dict(w=width, h=height) diff --git a/mmpose/codecs/__init__.py b/mmpose/codecs/__init__.py index cdbd8feb0c..1a48b7f851 100644 --- a/mmpose/codecs/__init__.py +++ b/mmpose/codecs/__init__.py @@ -4,6 +4,7 @@ from .image_pose_lifting import ImagePoseLifting from .integral_regression_label import IntegralRegressionLabel from .megvii_heatmap import MegviiHeatmap +from .motionbert_label import MotionBERTLabel from .msra_heatmap import MSRAHeatmap from .regression_label import RegressionLabel from .simcc_label import SimCCLabel @@ -14,5 +15,6 @@ __all__ = [ 'MSRAHeatmap', 'MegviiHeatmap', 'UDPHeatmap', 'RegressionLabel', 'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR', - 'DecoupledHeatmap', 'VideoPoseLifting', 'ImagePoseLifting' + 'DecoupledHeatmap', 'VideoPoseLifting', 'ImagePoseLifting', + 'MotionBERTLabel' ] diff --git a/mmpose/codecs/image_pose_lifting.py b/mmpose/codecs/image_pose_lifting.py index 64bf925997..aae6c3b5be 100644 --- a/mmpose/codecs/image_pose_lifting.py +++ b/mmpose/codecs/image_pose_lifting.py @@ -25,6 +25,10 @@ class ImagePoseLifting(BaseKeypointCodec): Default: ``False``. save_index (bool): If true, store the root position separated from the original pose. Default: ``False``. + reshape_keypoints (bool): If true, reshape the keypoints into shape + (-1, N). Default: ``True``. + concat_vis (bool): If true, concat the visibility item of keypoints. + Default: ``False``. keypoints_mean (np.ndarray, optional): Mean values of keypoints coordinates in shape (K, D). 
keypoints_std (np.ndarray, optional): Std values of keypoints @@ -42,6 +46,8 @@ def __init__(self, root_index: int, remove_root: bool = False, save_index: bool = False, + reshape_keypoints: bool = True, + concat_vis: bool = False, keypoints_mean: Optional[np.ndarray] = None, keypoints_std: Optional[np.ndarray] = None, target_mean: Optional[np.ndarray] = None, @@ -52,9 +58,23 @@ def __init__(self, self.root_index = root_index self.remove_root = remove_root self.save_index = save_index - if keypoints_mean is not None and keypoints_std is not None: + self.reshape_keypoints = reshape_keypoints + self.concat_vis = concat_vis + if keypoints_mean is not None: + keypoints_mean = np.array( + keypoints_mean, + dtype=np.float32).reshape(1, num_keypoints, -1) + keypoints_std = np.array( + keypoints_std, dtype=np.float32).reshape(1, num_keypoints, -1) + assert keypoints_std is not None assert keypoints_mean.shape == keypoints_std.shape - if target_mean is not None and target_std is not None: + if target_mean is not None: + target_dim = num_keypoints - 1 if remove_root else num_keypoints + target_mean = np.array( + target_mean, dtype=np.float32).reshape(1, target_dim, -1) + target_std = np.array( + target_std, dtype=np.float32).reshape(1, target_dim, -1) + assert target_std is not None assert target_mean.shape == target_std.shape self.keypoints_mean = keypoints_mean self.keypoints_std = keypoints_std @@ -73,15 +93,17 @@ def encode(self, keypoints_visible (np.ndarray, optional): Keypoint visibilities in shape (N, K). lifting_target (np.ndarray, optional): 3d target coordinate in - shape (K, C). + shape (T, K, C). lifting_target_visible (np.ndarray, optional): Target coordinate in - shape (K, ). + shape (T, K, ). Returns: encoded (dict): Contains the following items: - keypoint_labels (np.ndarray): The processed keypoints in - shape (K * D, N) where D is 2 for 2d coordinates. + shape like (N, K, D) or (K * D, N). + - keypoint_labels_visible (np.ndarray): The processed + keypoints' weights in shape (N, K, ) or (N-1, K, ). - lifting_target_label: The processed target coordinate in shape (K, C) or (K-1, C). - lifting_target_weights (np.ndarray): The target weights in @@ -93,18 +115,20 @@ def encode(self, In addition, there are some optional items it may contain: + - target_root (np.ndarray): The root coordinate of target in + shape (C, ). Exists if ``zero_center`` is ``True``. - target_root_removed (bool): Indicate whether the root of - pose lifting target is removed. Added if ``self.remove_root`` - is ``True``. + pose-lifitng target is removed. Exists if + ``remove_root`` is ``True``. - target_root_index (int): An integer indicating the index of - root. Added if ``self.remove_root`` and ``self.save_index`` + root. Exists if ``remove_root`` and ``save_index`` are ``True``. 
""" if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) if lifting_target is None: - lifting_target = keypoints[0] + lifting_target = [keypoints[0]] # set initial value for `lifting_target_weights` # and `trajectory_weights` @@ -126,13 +150,16 @@ def encode(self, f'Got invalid joint shape {lifting_target.shape}' root = lifting_target[..., self.root_index, :] - lifting_target_label = lifting_target - root + lifting_target_label = lifting_target - lifting_target[ + ..., self.root_index:self.root_index + 1, :] if self.remove_root: lifting_target_label = np.delete( lifting_target_label, self.root_index, axis=-2) - assert lifting_target_weights.ndim in {1, 2} - axis_to_remove = -2 if lifting_target_weights.ndim == 2 else -1 + lifting_target_visible = np.delete( + lifting_target_visible, self.root_index, axis=-2) + assert lifting_target_weights.ndim in {2, 3} + axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1 lifting_target_weights = np.delete( lifting_target_weights, self.root_index, axis=axis_to_remove) # Add a flag to avoid latter transforms that rely on the root @@ -145,15 +172,17 @@ def encode(self, # Normalize the 2D keypoint coordinate with mean and std keypoint_labels = keypoints.copy() - if self.keypoints_mean is not None and self.keypoints_std is not None: - keypoints_shape = keypoints.shape - assert self.keypoints_mean.shape == keypoints_shape[1:] + if self.keypoints_mean is not None: + assert self.keypoints_mean.shape[1:] == keypoints.shape[1:] + encoded['keypoints_mean'] = self.keypoints_mean.copy() + encoded['keypoints_std'] = self.keypoints_std.copy() keypoint_labels = (keypoint_labels - self.keypoints_mean) / self.keypoints_std - if self.target_mean is not None and self.target_std is not None: - target_shape = lifting_target_label.shape - assert self.target_mean.shape == target_shape + if self.target_mean is not None: + assert self.target_mean.shape == lifting_target_label.shape + encoded['target_mean'] = self.target_mean.copy() + encoded['target_std'] = self.target_std.copy() lifting_target_label = (lifting_target_label - self.target_mean) / self.target_std @@ -163,7 +192,19 @@ def encode(self, if keypoint_labels.ndim == 2: keypoint_labels = keypoint_labels[None, ...] 
+ if self.concat_vis: + keypoints_visible_ = keypoints_visible + if keypoints_visible.ndim == 2: + keypoints_visible_ = keypoints_visible[..., None] + keypoint_labels = np.concatenate( + (keypoint_labels, keypoints_visible_), axis=2) + + if self.reshape_keypoints: + N = keypoint_labels.shape[0] + keypoint_labels = keypoint_labels.transpose(1, 2, 0).reshape(-1, N) + encoded['keypoint_labels'] = keypoint_labels + encoded['keypoint_labels_visible'] = keypoints_visible encoded['lifting_target_label'] = lifting_target_label encoded['lifting_target_weights'] = lifting_target_weights encoded['trajectory_weights'] = trajectory_weights @@ -190,11 +231,11 @@ def decode(self, keypoints = encoded.copy() if self.target_mean is not None and self.target_std is not None: - assert self.target_mean.shape == keypoints.shape[1:] + assert self.target_mean.shape == keypoints.shape keypoints = keypoints * self.target_std + self.target_mean - if target_root.size > 0: - keypoints = keypoints + np.expand_dims(target_root, axis=0) + if target_root is not None and target_root.size > 0: + keypoints = keypoints + target_root if self.remove_root: keypoints = np.insert( keypoints, self.root_index, target_root, axis=1) diff --git a/mmpose/codecs/motionbert_label.py b/mmpose/codecs/motionbert_label.py new file mode 100644 index 0000000000..d0c8cd0d40 --- /dev/null +++ b/mmpose/codecs/motionbert_label.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from copy import deepcopy +from typing import Optional, Tuple + +import numpy as np + +from mmpose.registry import KEYPOINT_CODECS +from .base import BaseKeypointCodec +from .utils import camera_to_image_coord + + +@KEYPOINT_CODECS.register_module() +class MotionBERTLabel(BaseKeypointCodec): + r"""Generate keypoint and label coordinates for `MotionBERT`_ by Zhu et al + (2022). + + Note: + + - instance number: N + - keypoint number: K + - keypoint dimension: D + - pose-lifitng target dimension: C + + Args: + num_keypoints (int): The number of keypoints in the dataset. + root_index (int): Root keypoint index in the pose. Default: 0. + remove_root (bool): If true, remove the root keypoint from the pose. + Default: ``False``. + save_index (bool): If true, store the root position separated from the + original pose, only takes effect if ``remove_root`` is ``True``. + Default: ``False``. + concat_vis (bool): If true, concat the visibility item of keypoints. + Default: ``False``. + rootrel (bool): If true, the root keypoint will be set to the + coordinate origin. Default: ``False``. + factor_label (bool): If true, the label will be multiplied by a factor. + Default: ``True``. 
+ """ + + auxiliary_encode_keys = { + 'lifting_target', 'lifting_target_visible', 'camera_param', 'factor' + } + + def __init__(self, + num_keypoints: int, + root_index: int = 0, + remove_root: bool = False, + save_index: bool = False, + concat_vis: bool = False, + rootrel: bool = False, + factor_label: bool = True): + super().__init__() + + self.num_keypoints = num_keypoints + self.root_index = root_index + self.remove_root = remove_root + self.save_index = save_index + self.concat_vis = concat_vis + self.rootrel = rootrel + self.factor_label = factor_label + + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None, + lifting_target: Optional[np.ndarray] = None, + lifting_target_visible: Optional[np.ndarray] = None, + camera_param: Optional[dict] = None, + factor: Optional[np.ndarray] = None) -> dict: + """Encoding keypoints from input image space to normalized space. + + Args: + keypoints (np.ndarray): Keypoint coordinates in shape (B, T, K, D). + keypoints_visible (np.ndarray, optional): Keypoint visibilities in + shape (B, T, K). + lifting_target (np.ndarray, optional): 3d target coordinate in + shape (T, K, C). + lifting_target_visible (np.ndarray, optional): Target coordinate in + shape (T, K, ). + camera_param (dict, optional): The camera parameter dictionary. + factor (np.ndarray, optional): The factor mapping camera and image + coordinate in shape (T, ). + + Returns: + encoded (dict): Contains the following items: + + - keypoint_labels (np.ndarray): The processed keypoints in + shape like (N, K, D). + - keypoint_labels_visible (np.ndarray): The processed + keypoints' weights in shape (N, K, ) or (N, K-1, ). + - lifting_target_label: The processed target coordinate in + shape (K, C) or (K-1, C). + - lifting_target_weights (np.ndarray): The target weights in + shape (K, ) or (K-1, ). + - trajectory_weights (np.ndarray): The trajectory weights in + shape (K, ). + - factor (np.ndarray): The factor mapping camera and image + coordinate in shape (T, 1). + """ + if keypoints_visible is None: + keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) + + if lifting_target is None: + lifting_target = [keypoints[..., 0, :, :]] + + # set initial value for `lifting_target_weights` + # and `trajectory_weights` + if lifting_target_visible is None: + lifting_target_visible = np.ones( + lifting_target.shape[:-1], dtype=np.float32) + lifting_target_weights = lifting_target_visible + trajectory_weights = (1 / lifting_target[:, 2]) + else: + valid = lifting_target_visible > 0.5 + lifting_target_weights = np.where(valid, 1., 0.).astype(np.float32) + trajectory_weights = lifting_target_weights + + if camera_param is None: + camera_param = dict() + + encoded = dict() + + lifting_target_label = lifting_target.copy() + keypoint_labels = keypoints.copy() + + assert keypoint_labels.ndim in {2, 3} + if keypoint_labels.ndim == 2: + keypoint_labels = keypoint_labels[None, ...] 
+ + # Normalize the 2D keypoint coordinate with image width and height + _camera_param = deepcopy(camera_param) + assert 'w' in _camera_param and 'h' in _camera_param + w, h = _camera_param['w'], _camera_param['h'] + keypoint_labels[ + ..., :2] = keypoint_labels[..., :2] / w * 2 - [1, h / w] + + # convert target to image coordinate + T = keypoint_labels.shape[0] + factor_ = np.array([4] * T, dtype=np.float32).reshape(T, ) + if 'f' in _camera_param and 'c' in _camera_param: + lifting_target_label, factor_ = camera_to_image_coord( + self.root_index, lifting_target_label, _camera_param) + lifting_target_label[..., :, :] = lifting_target_label[ + ..., :, :] - lifting_target_label[..., + self.root_index:self.root_index + + 1, :] + if factor is None or factor[0] == 0: + factor = factor_ + if factor.ndim == 1: + factor = factor[:, None] + if self.factor_label: + lifting_target_label *= factor[..., None] + + if self.concat_vis: + keypoints_visible_ = keypoints_visible + if keypoints_visible.ndim == 2: + keypoints_visible_ = keypoints_visible[..., None] + keypoint_labels = np.concatenate( + (keypoint_labels, keypoints_visible_), axis=2) + + encoded['keypoint_labels'] = keypoint_labels + encoded['keypoint_labels_visible'] = keypoints_visible + encoded['lifting_target_label'] = lifting_target_label + encoded['lifting_target_weights'] = lifting_target_weights + encoded['lifting_target'] = lifting_target_label + encoded['lifting_target_visible'] = lifting_target_visible + encoded['trajectory_weights'] = trajectory_weights + encoded['factor'] = factor + + return encoded + + def decode( + self, + encoded: np.ndarray, + w: Optional[np.ndarray] = None, + h: Optional[np.ndarray] = None, + factor: Optional[np.ndarray] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """Decode keypoint coordinates from normalized space to input image + space. + + Args: + encoded (np.ndarray): Coordinates in shape (N, K, C). + w (np.ndarray, optional): The image widths in shape (N, ). + Default: ``None``. + h (np.ndarray, optional): The image heights in shape (N, ). + Default: ``None``. + factor (np.ndarray, optional): The factor for projection in shape + (N, ). Default: ``None``. + + Returns: + keypoints (np.ndarray): Decoded coordinates in shape (N, K, C). + scores (np.ndarray): The keypoint scores in shape (N, K). + """ + keypoints = encoded.copy() + scores = np.ones(keypoints.shape[:-1], dtype=np.float32) + + if self.rootrel: + keypoints[..., 0, :] = 0 + + if w is not None and w.size > 0: + assert w.shape == h.shape + assert w.shape[0] == keypoints.shape[0] + assert w.ndim in {1, 2} + if w.ndim == 1: + w = w[:, None] + h = h[:, None] + trans = np.append( + np.ones((w.shape[0], 1)), h / w, axis=1)[:, None, :] + keypoints[..., :2] = (keypoints[..., :2] + trans) * w[:, None] / 2 + keypoints[..., 2:] = keypoints[..., 2:] * w[:, None] / 2 + if factor is not None and factor.size > 0: + assert factor.shape[0] == keypoints.shape[0] + keypoints *= factor[..., None] + keypoints[..., :, :] = keypoints[..., :, :] - keypoints[ + ..., self.root_index:self.root_index + 1, :] + keypoints /= 1000. + return keypoints, scores diff --git a/mmpose/codecs/utils/__init__.py b/mmpose/codecs/utils/__init__.py index eaa093f12b..38bbae5c39 100644 --- a/mmpose/codecs/utils/__init__.py +++ b/mmpose/codecs/utils/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .camera_image_projection import camera_to_image_coord, camera_to_pixel from .gaussian_heatmap import (generate_gaussian_heatmaps, generate_udp_gaussian_heatmaps, generate_unbiased_gaussian_heatmaps) @@ -19,5 +20,6 @@ 'batch_heatmap_nms', 'refine_keypoints', 'refine_keypoints_dark', 'refine_keypoints_dark_udp', 'generate_displacement_heatmap', 'refine_simcc_dark', 'gaussian_blur1d', 'get_diagonal_lengths', - 'get_instance_root', 'get_instance_bbox', 'get_simcc_normalized' + 'get_instance_root', 'get_instance_bbox', 'get_simcc_normalized', + 'camera_to_image_coord', 'camera_to_pixel' ] diff --git a/mmpose/codecs/utils/camera_image_projection.py b/mmpose/codecs/utils/camera_image_projection.py new file mode 100644 index 0000000000..5ed4d14109 --- /dev/null +++ b/mmpose/codecs/utils/camera_image_projection.py @@ -0,0 +1,69 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +import numpy as np + + +def camera_to_image_coord(root_index: int, kpts_3d_cam: np.ndarray, + camera_param: Dict) -> Tuple[np.ndarray, np.ndarray]: + """Project keypoints from camera space to image space and calculate factor. + + Args: + root_index (int): Index for root keypoint. + kpts_3d_cam (np.ndarray): Keypoint coordinates in camera space in + shape (N, K, D). + camera_param (dict): Parameters for the camera. + + Returns: + tuple: + - kpts_3d_image (np.ndarray): Keypoint coordinates in image space in + shape (N, K, D). + - factor (np.ndarray): The scaling factor that maps keypoints from + image space to camera space in shape (N, ). + """ + + root = kpts_3d_cam[..., root_index, :] + tl_kpt = root.copy() + tl_kpt[..., :2] -= 1.0 + br_kpt = root.copy() + br_kpt[..., :2] += 1.0 + tl_kpt = np.reshape(tl_kpt, (-1, 3)) + br_kpt = np.reshape(br_kpt, (-1, 3)) + fx, fy = camera_param['f'] / 1000. + cx, cy = camera_param['c'] / 1000. + + tl2d = camera_to_pixel(tl_kpt, fx, fy, cx, cy) + br2d = camera_to_pixel(br_kpt, fx, fy, cx, cy) + + rectangle_3d_size = 2.0 + kpts_3d_image = np.zeros_like(kpts_3d_cam) + kpts_3d_image[..., :2] = camera_to_pixel(kpts_3d_cam.copy(), fx, fy, cx, + cy) + ratio = (br2d[..., 0] - tl2d[..., 0] + 0.001) / rectangle_3d_size + factor = rectangle_3d_size / (br2d[..., 0] - tl2d[..., 0] + 0.001) + kpts_3d_depth = ratio[:, None] * ( + kpts_3d_cam[..., 2] - kpts_3d_cam[..., root_index:root_index + 1, 2]) + kpts_3d_image[..., 2] = kpts_3d_depth + return kpts_3d_image, factor + + +def camera_to_pixel(kpts_3d: np.ndarray, fx: float, fy: float, cx: float, + cy: float) -> np.ndarray: + """Project keypoints from camera space to image space. + + Args: + kpts_3d (np.ndarray): Keypoint coordinates in camera space. + fx (float): x-coordinate of camera's focal length. + fy (float): y-coordinate of camera's focal length. + cx (float): x-coordinate of image center. + cy (float): y-coordinate of image center. + + Returns: + pose_2d (np.ndarray): Projected keypoint coordinates in image space. + """ + pose_2d = kpts_3d[..., :2] / kpts_3d[..., 2:3] + pose_2d[..., 0] *= fx + pose_2d[..., 1] *= fy + pose_2d[..., 0] += cx + pose_2d[..., 1] += cy + return pose_2d diff --git a/mmpose/codecs/video_pose_lifting.py b/mmpose/codecs/video_pose_lifting.py index 56cf35fa2d..9e409a663c 100644 --- a/mmpose/codecs/video_pose_lifting.py +++ b/mmpose/codecs/video_pose_lifting.py @@ -30,6 +30,10 @@ class VideoPoseLifting(BaseKeypointCodec): save_index (bool): If true, store the root position separated from the original pose, only takes effect if ``remove_root`` is ``True``. Default: ``False``. 
+ reshape_keypoints (bool): If true, reshape the keypoints into shape + (-1, N). Default: ``True``. + concat_vis (bool): If true, concat the visibility item of keypoints. + Default: ``False``. normalize_camera (bool): Whether to normalize camera intrinsics. Default: ``False``. """ @@ -44,6 +48,8 @@ def __init__(self, root_index: int = 0, remove_root: bool = False, save_index: bool = False, + reshape_keypoints: bool = True, + concat_vis: bool = False, normalize_camera: bool = False): super().__init__() @@ -52,6 +58,8 @@ def __init__(self, self.root_index = root_index self.remove_root = remove_root self.save_index = save_index + self.reshape_keypoints = reshape_keypoints + self.concat_vis = concat_vis self.normalize_camera = normalize_camera def encode(self, @@ -67,16 +75,18 @@ def encode(self, keypoints_visible (np.ndarray, optional): Keypoint visibilities in shape (N, K). lifting_target (np.ndarray, optional): 3d target coordinate in - shape (K, C). + shape (T, K, C). lifting_target_visible (np.ndarray, optional): Target coordinate in - shape (K, ). + shape (T, K, ). camera_param (dict, optional): The camera parameter dictionary. Returns: encoded (dict): Contains the following items: - keypoint_labels (np.ndarray): The processed keypoints in - shape (K * D, N) where D is 2 for 2d coordinates. + shape like (N, K, D) or (K * D, N). + - keypoint_labels_visible (np.ndarray): The processed + keypoints' weights in shape (N, K, ) or (N-1, K, ). - lifting_target_label: The processed target coordinate in shape (K, C) or (K-1, C). - lifting_target_weights (np.ndarray): The target weights in @@ -87,21 +97,21 @@ def encode(self, In addition, there are some optional items it may contain: - target_root (np.ndarray): The root coordinate of target in - shape (C, ). Exists if ``self.zero_center`` is ``True``. + shape (C, ). Exists if ``zero_center`` is ``True``. - target_root_removed (bool): Indicate whether the root of pose-lifitng target is removed. Exists if - ``self.remove_root`` is ``True``. + ``remove_root`` is ``True``. - target_root_index (int): An integer indicating the index of - root. Exists if ``self.remove_root`` and ``self.save_index`` + root. Exists if ``remove_root`` and ``save_index`` are ``True``. - camera_param (dict): The updated camera parameter dictionary. - Exists if ``self.normalize_camera`` is ``True``. + Exists if ``normalize_camera`` is ``True``. 
""" if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) if lifting_target is None: - lifting_target = keypoints[0] + lifting_target = [keypoints[0]] # set initial value for `lifting_target_weights` # and `trajectory_weights` @@ -128,14 +138,17 @@ def encode(self, f'Got invalid joint shape {lifting_target.shape}' root = lifting_target[..., self.root_index, :] - lifting_target_label = lifting_target_label - root + lifting_target_label -= lifting_target_label[ + ..., self.root_index:self.root_index + 1, :] encoded['target_root'] = root if self.remove_root: lifting_target_label = np.delete( lifting_target_label, self.root_index, axis=-2) - assert lifting_target_weights.ndim in {1, 2} - axis_to_remove = -2 if lifting_target_weights.ndim == 2 else -1 + lifting_target_visible = np.delete( + lifting_target_visible, self.root_index, axis=-2) + assert lifting_target_weights.ndim in {2, 3} + axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1 lifting_target_weights = np.delete( lifting_target_weights, self.root_index, @@ -167,7 +180,19 @@ def encode(self, _camera_param['c'] = (_camera_param['c'] - center[:, None]) / scale encoded['camera_param'] = _camera_param + if self.concat_vis: + keypoints_visible_ = keypoints_visible + if keypoints_visible.ndim == 2: + keypoints_visible_ = keypoints_visible[..., None] + keypoint_labels = np.concatenate( + (keypoint_labels, keypoints_visible_), axis=2) + + if self.reshape_keypoints: + N = keypoint_labels.shape[0] + keypoint_labels = keypoint_labels.transpose(1, 2, 0).reshape(-1, N) + encoded['keypoint_labels'] = keypoint_labels + encoded['keypoints_visible'] = keypoints_visible encoded['lifting_target_label'] = lifting_target_label encoded['lifting_target_weights'] = lifting_target_weights encoded['trajectory_weights'] = trajectory_weights @@ -192,8 +217,8 @@ def decode(self, """ keypoints = encoded.copy() - if target_root.size > 0: - keypoints = keypoints + np.expand_dims(target_root, axis=0) + if target_root is not None and target_root.size > 0: + keypoints = keypoints + target_root if self.remove_root: keypoints = np.insert( keypoints, self.root_index, target_root, axis=1) diff --git a/mmpose/datasets/datasets/base/base_mocap_dataset.py b/mmpose/datasets/datasets/base/base_mocap_dataset.py index d671a6ae94..e08ba6ea45 100644 --- a/mmpose/datasets/datasets/base/base_mocap_dataset.py +++ b/mmpose/datasets/datasets/base/base_mocap_dataset.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import itertools import os.path as osp from copy import deepcopy from itertools import filterfalse, groupby @@ -21,6 +22,8 @@ class BaseMocapDataset(BaseDataset): Args: ann_file (str): Annotation file path. Default: ''. seq_len (int): Number of frames in a sequence. Default: 1. + multiple_target (int): If larger than 0, merge every + ``multiple_target`` sequence together. Default: 0. causal (bool): If set to ``True``, the rightmost input frame will be the target frame. Otherwise, the middle input frame will be the target frame. Default: ``True``. 
@@ -63,6 +66,7 @@ class BaseMocapDataset(BaseDataset): def __init__(self, ann_file: str = '', seq_len: int = 1, + multiple_target: int = 0, causal: bool = True, subset_frac: float = 1.0, camera_param_file: Optional[str] = None, @@ -102,6 +106,10 @@ def __init__(self, self.seq_len = seq_len self.causal = causal + self.multiple_target = multiple_target + if self.multiple_target: + assert (self.seq_len == 1) + assert 0 < subset_frac <= 1, ( f'Unsupported `subset_frac` {subset_frac}. Supported range ' 'is (0, 1].') @@ -241,6 +249,17 @@ def get_sequence_indices(self) -> List[List[int]]: sequence_indices = [[idx] for idx in range(num_imgs)] else: raise NotImplementedError('Multi-frame data sample unsupported!') + + if self.multiple_target > 0: + sequence_indices_merged = [] + for i in range(0, len(sequence_indices), self.multiple_target): + if i + self.multiple_target > len(sequence_indices): + break + sequence_indices_merged.append( + list( + itertools.chain.from_iterable( + sequence_indices[i:i + self.multiple_target]))) + sequence_indices = sequence_indices_merged return sequence_indices def _load_annotations(self) -> Tuple[List[dict], List[dict]]: @@ -274,7 +293,9 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: image_list = [] for idx, frame_ids in enumerate(self.sequence_indices): - assert len(frame_ids) == self.seq_len + assert len(frame_ids) == (self.multiple_target + if self.multiple_target else + self.seq_len), f'{len(frame_ids)}' _img_names = img_names[frame_ids] @@ -286,7 +307,9 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: keypoints_3d = _keypoints_3d[..., :3] keypoints_3d_visible = _keypoints_3d[..., 3] - target_idx = -1 if self.causal else int(self.seq_len) // 2 + target_idx = [-1] if self.causal else [int(self.seq_len) // 2] + if self.multiple_target: + target_idx = list(range(self.multiple_target)) instance_info = { 'num_keypoints': num_keypoints, diff --git a/mmpose/datasets/datasets/body3d/h36m_dataset.py b/mmpose/datasets/datasets/body3d/h36m_dataset.py index 60094aa254..b7a4f71d65 100644 --- a/mmpose/datasets/datasets/body3d/h36m_dataset.py +++ b/mmpose/datasets/datasets/body3d/h36m_dataset.py @@ -45,6 +45,10 @@ class Human36mDataset(BaseMocapDataset): seq_len (int): Number of frames in a sequence. Default: 1. seq_step (int): The interval for extracting frames from the video. Default: 1. + multiple_target (int): If larger than 0, merge every + ``multiple_target`` sequence together. Default: 0. + multiple_target_step (int): The interval for merging sequence. Only + valid when ``multiple_target`` is larger than 0. Default: 0. pad_video_seq (bool): Whether to pad the video so that poses will be predicted for every frame in the video. Default: ``False``. causal (bool): If set to ``True``, the rightmost input frame will be @@ -65,6 +69,9 @@ class Human36mDataset(BaseMocapDataset): If set, 2d keypoint loaded from this file will be used instead of ground-truth keypoints. This setting is only when ``keypoint_2d_src`` is ``'detection'``. Default: ``None``. + factor_file (str, optional): The projection factors' file. If set, + factor loaded from this file will be used instead of calculated + factors. Default: ``None``. camera_param_file (str): Cameras' parameters file. Default: ``None``. data_mode (str): Specifies the mode of data samples: ``'topdown'`` or ``'bottomup'``. 
In ``'topdown'`` mode, each data sample contains @@ -104,11 +111,14 @@ def __init__(self, ann_file: str = '', seq_len: int = 1, seq_step: int = 1, + multiple_target: int = 0, + multiple_target_step: int = 0, pad_video_seq: bool = False, causal: bool = True, subset_frac: float = 1.0, keypoint_2d_src: str = 'gt', keypoint_2d_det_file: Optional[str] = None, + factor_file: Optional[str] = None, camera_param_file: Optional[str] = None, data_mode: str = 'topdown', metainfo: Optional[dict] = None, @@ -138,9 +148,20 @@ def __init__(self, self.seq_step = seq_step self.pad_video_seq = pad_video_seq + if factor_file: + if not is_abs(factor_file): + factor_file = osp.join(data_root, factor_file) + assert exists(factor_file), 'Annotation file does not exist.' + self.factor_file = factor_file + + if multiple_target > 0 and multiple_target_step == 0: + multiple_target_step = multiple_target + self.multiple_target_step = multiple_target_step + super().__init__( ann_file=ann_file, seq_len=seq_len, + multiple_target=multiple_target, causal=causal, subset_frac=subset_frac, camera_param_file=camera_param_file, @@ -171,41 +192,55 @@ def get_sequence_indices(self) -> List[List[int]]: sequence_indices = [] _len = (self.seq_len - 1) * self.seq_step + 1 _step = self.seq_step - for _, _indices in sorted(video_frames.items()): - n_frame = len(_indices) - - if self.pad_video_seq: - # Pad the sequence so that every frame in the sequence will be - # predicted. - if self.causal: - frames_left = self.seq_len - 1 - frames_right = 0 - else: - frames_left = (self.seq_len - 1) // 2 - frames_right = frames_left - for i in range(n_frame): - pad_left = max(0, frames_left - i // _step) - pad_right = max(0, - frames_right - (n_frame - 1 - i) // _step) - start = max(i % _step, i - frames_left * _step) - end = min(n_frame - (n_frame - 1 - i) % _step, - i + frames_right * _step + 1) - sequence_indices.append([_indices[0]] * pad_left + - _indices[start:end:_step] + - [_indices[-1]] * pad_right) - else: + + if self.multiple_target: + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) seqs_from_video = [ - _indices[i:(i + _len):_step] - for i in range(0, n_frame - _len + 1) - ] + _indices[i:(i + self.multiple_target):_step] + for i in range(0, n_frame, self.multiple_target_step) + ][:(n_frame + self.multiple_target_step - + self.multiple_target) // self.multiple_target_step] sequence_indices.extend(seqs_from_video) + else: + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + + if self.pad_video_seq: + # Pad the sequence so that every frame in the sequence will + # be predicted. 
+ if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max( + 0, frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sequence_indices.append([_indices[0]] * pad_left + + _indices[start:end:_step] + + [_indices[-1]] * pad_right) + else: + seqs_from_video = [ + _indices[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1) + ] + sequence_indices.extend(seqs_from_video) + # reduce dataset size if needed subset_size = int(len(sequence_indices) * self.subset_frac) start = np.random.randint(0, len(sequence_indices) - subset_size + 1) end = start + subset_size - return sequence_indices[start:end] + sequence_indices = sequence_indices[start:end] + + return sequence_indices def _load_annotations(self) -> Tuple[List[dict], List[dict]]: instance_list, image_list = super()._load_annotations() @@ -230,6 +265,15 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 'keypoints_visible': keypoints_visible }) + if self.factor_file: + with get_local_path(self.factor_file) as local_path: + factors = np.load(local_path).astype(np.float32) + else: + factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) + assert factors.shape[0] == kpts_3d.shape[0] + for idx, frame_ids in enumerate(self.sequence_indices): + factor = factors[frame_ids].astype(np.float32) + instance_list[idx].update({'factor': factor}) return instance_list, image_list diff --git a/mmpose/datasets/transforms/formatting.py b/mmpose/datasets/transforms/formatting.py index 05aeef179f..d047cff3c3 100644 --- a/mmpose/datasets/transforms/formatting.py +++ b/mmpose/datasets/transforms/formatting.py @@ -51,8 +51,6 @@ def keypoints_to_tensor(keypoints: Union[np.ndarray, Sequence[np.ndarray]] """ if isinstance(keypoints, np.ndarray): keypoints = np.ascontiguousarray(keypoints) - N = keypoints.shape[0] - keypoints = keypoints.transpose(1, 2, 0).reshape(-1, N) tensor = torch.from_numpy(keypoints).contiguous() else: assert is_seq_of(keypoints, np.ndarray) @@ -209,9 +207,9 @@ def transform(self, results: dict) -> dict: for key, packed_key in self.label_mapping_table.items(): if key in results: # For pose-lifting, store only target-related fields - if 'lifting_target_label' in results and key in { + if 'lifting_target' in results and packed_key in { 'keypoint_labels', 'keypoint_weights', - 'transformed_keypoints_visible' + 'keypoints_visible' }: continue if isinstance(results[key], list): diff --git a/mmpose/datasets/transforms/pose3d_transforms.py b/mmpose/datasets/transforms/pose3d_transforms.py index e6559fa398..2149d7cb30 100644 --- a/mmpose/datasets/transforms/pose3d_transforms.py +++ b/mmpose/datasets/transforms/pose3d_transforms.py @@ -25,6 +25,8 @@ class RandomFlipAroundRoot(BaseTransform): flip_prob (float): Probability of flip. Default: 0.5. flip_camera (bool): Whether to flip horizontal distortion coefficients. Default: ``False``. + flip_image (bool): Whether to flip keypoints horizontally according + to image size. Default: ``False``. 
Required keys: keypoints @@ -39,14 +41,16 @@ def __init__(self, keypoints_flip_cfg, target_flip_cfg, flip_prob=0.5, - flip_camera=False): + flip_camera=False, + flip_image=False): self.keypoints_flip_cfg = keypoints_flip_cfg self.target_flip_cfg = target_flip_cfg self.flip_prob = flip_prob self.flip_camera = flip_camera + self.flip_image = flip_image def transform(self, results: Dict) -> dict: - """The transform function of :class:`ZeroCenterPose`. + """The transform function of :class:`RandomFlipAroundRoot`. See ``transform()`` method of :class:`BaseTransform` for details. @@ -76,6 +80,15 @@ def transform(self, results: Dict) -> dict: flip_indices = results['flip_indices'] # flip joint coordinates + _camera_param = deepcopy(results['camera_param']) + if self.flip_image: + assert 'camera_param' in results, \ + 'Camera parameters are missing.' + assert 'w' in _camera_param + w = _camera_param['w'] / 2 + self.keypoints_flip_cfg['center_x'] = w + self.target_flip_cfg['center_x'] = w + keypoints, keypoints_visible = flip_keypoints_custom_center( keypoints, keypoints_visible, flip_indices, **self.keypoints_flip_cfg) @@ -92,7 +105,6 @@ def transform(self, results: Dict) -> dict: if self.flip_camera: assert 'camera_param' in results, \ 'Camera parameters are missing.' - _camera_param = deepcopy(results['camera_param']) assert 'c' in _camera_param _camera_param['c'][0] *= -1 diff --git a/mmpose/evaluation/metrics/keypoint_3d_metrics.py b/mmpose/evaluation/metrics/keypoint_3d_metrics.py index e945650c30..fb3447bb3f 100644 --- a/mmpose/evaluation/metrics/keypoint_3d_metrics.py +++ b/mmpose/evaluation/metrics/keypoint_3d_metrics.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from collections import defaultdict from os import path as osp -from typing import Dict, Optional, Sequence +from typing import Dict, List, Optional, Sequence import numpy as np from mmengine.evaluator import BaseMetric @@ -38,6 +38,8 @@ class MPJPE(BaseMetric): names to disambiguate homonymous metrics of different evaluators. If prefix is not provided in the argument, ``self.default_prefix`` will be used instead. Default: ``None``. + skip_list (list, optional): The list of subject and action combinations + to be skipped. Default: []. """ ALIGNMENT = {'mpjpe': 'none', 'p-mpjpe': 'procrustes', 'n-mpjpe': 'scale'} @@ -45,7 +47,8 @@ class MPJPE(BaseMetric): def __init__(self, mode: str = 'mpjpe', collect_device: str = 'cpu', - prefix: Optional[str] = None) -> None: + prefix: Optional[str] = None, + skip_list: List[str] = []) -> None: super().__init__(collect_device=collect_device, prefix=prefix) allowed_modes = self.ALIGNMENT.keys() if mode not in allowed_modes: @@ -53,6 +56,7 @@ def __init__(self, f"'n-mpjpe', but got '{mode}'.") self.mode = mode + self.skip_list = skip_list def process(self, data_batch: Sequence[dict], data_samples: Sequence[dict]) -> None: @@ -67,24 +71,32 @@ def process(self, data_batch: Sequence[dict], the model. 
""" for data_sample in data_samples: - # predicted keypoints coordinates, [1, K, D] + # predicted keypoints coordinates, [T, K, D] pred_coords = data_sample['pred_instances']['keypoints'] + if pred_coords.ndim == 4: + pred_coords = np.squeeze(pred_coords, axis=0) # ground truth data_info gt = data_sample['gt_instances'] - # ground truth keypoints coordinates, [1, K, D] + # ground truth keypoints coordinates, [T, K, D] gt_coords = gt['lifting_target'] - # ground truth keypoints_visible, [1, K, 1] - mask = gt['lifting_target_visible'].astype(bool).reshape(1, -1) + # ground truth keypoints_visible, [T, K, 1] + mask = gt['lifting_target_visible'].astype(bool).reshape( + gt_coords.shape[0], -1) # instance action - img_path = data_sample['target_img_path'] + img_path = data_sample['target_img_path'][0] _, rest = osp.basename(img_path).split('_', 1) action, _ = rest.split('.', 1) + actions = np.array([action] * gt_coords.shape[0]) + + subj_act = osp.basename(img_path).split('.')[0] + if subj_act in self.skip_list: + continue result = { 'pred_coords': pred_coords, 'gt_coords': gt_coords, 'mask': mask, - 'action': action + 'actions': actions } self.results.append(result) @@ -104,16 +116,15 @@ def compute_metrics(self, results: list) -> Dict[str, float]: # pred_coords: [N, K, D] pred_coords = np.concatenate( [result['pred_coords'] for result in results]) - if pred_coords.ndim == 4 and pred_coords.shape[1] == 1: - pred_coords = np.squeeze(pred_coords, axis=1) # gt_coords: [N, K, D] - gt_coords = np.stack([result['gt_coords'] for result in results]) + gt_coords = np.concatenate([result['gt_coords'] for result in results]) # mask: [N, K] mask = np.concatenate([result['mask'] for result in results]) # action_category_indices: Dict[List[int]] action_category_indices = defaultdict(list) - for idx, result in enumerate(results): - action_category = result['action'].split('_')[0] + actions = np.concatenate([result['actions'] for result in results]) + for idx, action in enumerate(actions): + action_category = action.split('_')[0] action_category_indices[action_category].append(idx) error_name = self.mode.upper() @@ -126,6 +137,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: for action_category, indices in action_category_indices.items(): metrics[f'{error_name}_{action_category}'] = keypoint_mpjpe( - pred_coords[indices], gt_coords[indices], mask[indices]) + pred_coords[indices], gt_coords[indices], mask[indices], + self.ALIGNMENT[self.mode]) return metrics diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index cb2498560a..563264eecf 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .alexnet import AlexNet from .cpm import CPM +from .dstformer import DSTFormer from .hourglass import HourglassNet from .hourglass_ae import HourglassAENet from .hrformer import HRFormer @@ -33,5 +34,5 @@ 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', - 'PyramidVisionTransformerV2', 'SwinTransformer' + 'PyramidVisionTransformerV2', 'SwinTransformer', 'DSTFormer' ] diff --git a/mmpose/models/backbones/dstformer.py b/mmpose/models/backbones/dstformer.py new file mode 100644 index 0000000000..2ef13bdb02 --- /dev/null +++ b/mmpose/models/backbones/dstformer.py @@ -0,0 +1,304 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn.bricks import DropPath +from mmengine.model import BaseModule, constant_init +from mmengine.model.weight_init import trunc_normal_ + +from mmpose.registry import MODELS +from .base_backbone import BaseBackbone + + +class Attention(BaseModule): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + mode='spatial'): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.mode = mode + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj_drop = nn.Dropout(proj_drop) + + self.attn_count_s = None + self.attn_count_t = None + + def forward(self, x, seq_len=1): + B, N, C = x.shape + + if self.mode == 'temporal': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // + self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_temporal(q, k, v, seq_len=seq_len) + elif self.mode == 'spatial': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // + self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_spatial(q, k, v) + else: + raise NotImplementedError(self.mode) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def forward_spatial(self, q, k, v): + B, _, N, C = q.shape + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ v + x = x.transpose(1, 2).reshape(B, N, C * self.num_heads) + return x + + def forward_temporal(self, q, k, v, seq_len=8): + B, _, N, C = q.shape + qt = q.reshape(-1, seq_len, self.num_heads, N, + C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C) + kt = k.reshape(-1, seq_len, self.num_heads, N, + C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C) + vt = v.reshape(-1, seq_len, self.num_heads, N, + C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C) + + attn = (qt @ kt.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ vt # (B, H, N, T, C) + x = x.permute(0, 3, 2, 1, 4).reshape(B, N, C * self.num_heads) + return x + + +class AttentionBlock(BaseModule): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + mlp_out_ratio=1., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + st_mode='st'): + super().__init__() + + self.st_mode = st_mode + self.norm1_s = nn.LayerNorm(dim, eps=1e-06) + self.norm1_t = nn.LayerNorm(dim, eps=1e-06) + + self.attn_s = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + mode='spatial') + self.attn_t = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + mode='temporal') + + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2_s = nn.LayerNorm(dim, eps=1e-06) + self.norm2_t = nn.LayerNorm(dim, eps=1e-06) + + mlp_hidden_dim = int(dim * mlp_ratio) + mlp_out_dim = int(dim * mlp_out_ratio) + self.mlp_s = nn.Sequential( + nn.Linear(dim, mlp_hidden_dim), nn.GELU(), + nn.Linear(mlp_hidden_dim, mlp_out_dim), nn.Dropout(drop)) + self.mlp_t = nn.Sequential( + nn.Linear(dim, mlp_hidden_dim), nn.GELU(), + nn.Linear(mlp_hidden_dim, mlp_out_dim), nn.Dropout(drop)) + + def forward(self, x, seq_len=1): + if self.st_mode == 'st': + x = x + self.drop_path(self.attn_s(self.norm1_s(x), seq_len)) + x = x + self.drop_path(self.mlp_s(self.norm2_s(x))) + x = x + self.drop_path(self.attn_t(self.norm1_t(x), seq_len)) + x = x + self.drop_path(self.mlp_t(self.norm2_t(x))) + elif self.st_mode == 'ts': + x = x + self.drop_path(self.attn_t(self.norm1_t(x), seq_len)) + x = x + self.drop_path(self.mlp_t(self.norm2_t(x))) + x = x + self.drop_path(self.attn_s(self.norm1_s(x), seq_len)) + x = x + self.drop_path(self.mlp_s(self.norm2_s(x))) + else: + raise NotImplementedError(self.st_mode) + return x + + +@MODELS.register_module() +class DSTFormer(BaseBackbone): + """Dual-stream Spatio-temporal Transformer Module. + + Args: + in_channels (int): Number of input channels. + feat_size: Number of feature channels. Default: 256. + depth: The network depth. Default: 5. + num_heads: Number of heads in multi-Head self-attention blocks. + Default: 8. + mlp_ratio (int, optional): The expansion ratio of FFN. Default: 4. + num_keypoints: num_keypoints (int): Number of keypoints. Default: 17. + seq_len: The sequence length. Default: 243. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout ratio of input. Default: 0. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0. + drop_path_rate (float, optional): Stochastic depth rate. Default: 0. + att_fuse: Whether to fuse the results of attention blocks. + Default: True. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + + Example: + >>> from mmpose.models import DSTFormer + >>> import torch + >>> self = DSTFormer(in_channels=3) + >>> self.eval() + >>> inputs = torch.rand(1, 2, 17, 3) + >>> level_outputs = self.forward(inputs) + >>> print(tuple(level_outputs.shape)) + (1, 2, 17, 512) + """ + + def __init__(self, + in_channels, + feat_size=256, + depth=5, + num_heads=8, + mlp_ratio=4, + num_keypoints=17, + seq_len=243, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + att_fuse=True, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.in_channels = in_channels + self.feat_size = feat_size + + self.joints_embed = nn.Linear(in_channels, feat_size) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + self.blocks_st = nn.ModuleList([ + AttentionBlock( + dim=feat_size, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + st_mode='st') for i in range(depth) + ]) + self.blocks_ts = nn.ModuleList([ + AttentionBlock( + dim=feat_size, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + st_mode='ts') for i in range(depth) + ]) + + self.norm = nn.LayerNorm(feat_size, eps=1e-06) + + self.temp_embed = nn.Parameter(torch.zeros(1, seq_len, 1, feat_size)) + self.spat_embed = nn.Parameter( + torch.zeros(1, num_keypoints, feat_size)) + + trunc_normal_(self.temp_embed, std=.02) + trunc_normal_(self.spat_embed, std=.02) + + self.att_fuse = att_fuse + if self.att_fuse: + self.attn_regress = nn.ModuleList( + [nn.Linear(feat_size * 2, 2) for i in range(depth)]) + for i in range(depth): + self.attn_regress[i].weight.data.fill_(0) + self.attn_regress[i].bias.data.fill_(0.5) + + def forward(self, x): + if len(x.shape) == 3: + x = x[None, :] + assert len(x.shape) == 4 + + B, F, K, C = x.shape + x = x.reshape(-1, K, C) + BF = x.shape[0] + x = self.joints_embed(x) # (BF, K, feat_size) + x = x + self.spat_embed + _, K, C = x.shape + x = x.reshape(-1, F, K, C) + self.temp_embed[:, :F, :, :] + x = x.reshape(BF, K, C) # (BF, K, feat_size) + x = self.pos_drop(x) + + for idx, (blk_st, + blk_ts) in enumerate(zip(self.blocks_st, self.blocks_ts)): + x_st = blk_st(x, F) + x_ts = blk_ts(x, F) + if self.att_fuse: + att = self.attn_regress[idx] + alpha = torch.cat([x_st, x_ts], dim=-1) + BF, K = alpha.shape[:2] + alpha = att(alpha) + alpha = alpha.softmax(dim=-1) + x = x_st * alpha[:, :, 0:1] + x_ts * alpha[:, :, 1:2] + else: + x = (x_st + x_ts) * 0.5 + x = self.norm(x) # (BF, K, feat_size) + x = x.reshape(B, F, K, -1) + return x + + def init_weights(self): + """Initialize the weights in backbone.""" + super(DSTFormer, self).init_weights() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + return + + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + constant_init(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py index e01f2269e3..ef0e17d98e 100644 --- a/mmpose/models/heads/__init__.py +++ b/mmpose/models/heads/__init__.py @@ -5,7 +5,8 @@ HeatmapHead, MSPNHead, ViPNASHead) from .hybrid_heads import DEKRHead, 
VisPredictHead from .regression_heads import (DSNTHead, IntegralRegressionHead, - RegressionHead, RLEHead, TemporalRegressionHead, + MotionRegressionHead, RegressionHead, RLEHead, + TemporalRegressionHead, TrajectoryRegressionHead) __all__ = [ @@ -13,5 +14,5 @@ 'RegressionHead', 'IntegralRegressionHead', 'SimCCHead', 'RLEHead', 'DSNTHead', 'AssociativeEmbeddingHead', 'DEKRHead', 'VisPredictHead', 'CIDHead', 'RTMCCHead', 'TemporalRegressionHead', - 'TrajectoryRegressionHead' + 'TrajectoryRegressionHead', 'MotionRegressionHead' ] diff --git a/mmpose/models/heads/regression_heads/__init__.py b/mmpose/models/heads/regression_heads/__init__.py index ce9cd5e1b0..729d193b51 100644 --- a/mmpose/models/heads/regression_heads/__init__.py +++ b/mmpose/models/heads/regression_heads/__init__.py @@ -1,16 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. from .dsnt_head import DSNTHead from .integral_regression_head import IntegralRegressionHead +from .motion_regression_head import MotionRegressionHead from .regression_head import RegressionHead from .rle_head import RLEHead from .temporal_regression_head import TemporalRegressionHead from .trajectory_regression_head import TrajectoryRegressionHead __all__ = [ - 'RegressionHead', - 'IntegralRegressionHead', - 'DSNTHead', - 'RLEHead', - 'TemporalRegressionHead', - 'TrajectoryRegressionHead', + 'RegressionHead', 'IntegralRegressionHead', 'DSNTHead', 'RLEHead', + 'TemporalRegressionHead', 'TrajectoryRegressionHead', + 'MotionRegressionHead' ] diff --git a/mmpose/models/heads/regression_heads/motion_regression_head.py b/mmpose/models/heads/regression_heads/motion_regression_head.py new file mode 100644 index 0000000000..a0037180c7 --- /dev/null +++ b/mmpose/models/heads/regression_heads/motion_regression_head.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from typing import Tuple + +import numpy as np +import torch +from torch import Tensor, nn + +from mmpose.evaluation.functional import keypoint_mpjpe +from mmpose.registry import KEYPOINT_CODECS, MODELS +from mmpose.utils.tensor_utils import to_numpy +from mmpose.utils.typing import (ConfigType, OptConfigType, OptSampleList, + Predictions) +from ..base_head import BaseHead + + +@MODELS.register_module() +class MotionRegressionHead(BaseHead): + """Regression head of `MotionBERT`_ by Zhu et al (2022). + + Args: + in_channels (int): Number of input channels. Default: 256. + out_channels (int): Number of output channels. Default: 3. + embedding_size (int): Number of embedding channels. Default: 512. + loss (Config): Config for keypoint loss. Defaults to use + :class:`MSELoss` + decoder (Config, optional): The decoder config that controls decoding + keypoint coordinates from the network output. Defaults to ``None`` + init_cfg (Config, optional): Config to control the initialization. See + :attr:`default_init_cfg` for default settings + + .. 
_`MotionBERT`: https://arxiv.org/abs/2210.06551 + """ + + _version = 2 + + def __init__(self, + in_channels: int = 256, + out_channels: int = 3, + embedding_size: int = 512, + loss: ConfigType = dict( + type='MSELoss', use_target_weight=True), + decoder: OptConfigType = None, + init_cfg: OptConfigType = None): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = in_channels + self.out_channels = out_channels + self.loss_module = MODELS.build(loss) + if decoder is not None: + self.decoder = KEYPOINT_CODECS.build(decoder) + else: + self.decoder = None + + # Define fully-connected layers + self.pre_logits = nn.Sequential( + OrderedDict([('fc', nn.Linear(in_channels, embedding_size)), + ('act', nn.Tanh())])) + self.fc = nn.Linear( + embedding_size, + out_channels) if embedding_size > 0 else nn.Identity() + + def forward(self, feats: Tuple[Tensor]) -> Tensor: + """Forward the network. The input is multi scale feature maps and the + output is the coordinates. + + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + Tensor: Output coordinates (and sigmas[optional]). + """ + x = feats # (B, F, K, in_channels) + x = self.pre_logits(x) # (B, F, K, embedding_size) + x = self.fc(x) # (B, F, K, out_channels) + + return x + + def predict(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from outputs. + + Returns: + preds (sequence[InstanceData]): Prediction results. + Each contains the following fields: + + - keypoints: Predicted keypoints of shape (B, N, K, D). + - keypoint_scores: Scores of predicted keypoints of shape + (B, N, K). + """ + + batch_coords = self.forward(feats) # (B, K, D) + + # Restore global position with camera_param and factor + camera_param = batch_data_samples[0].metainfo.get('camera_param', None) + if camera_param is not None: + w = torch.stack([ + torch.from_numpy(np.array([b.metainfo['camera_param']['w']])) + for b in batch_data_samples + ]) + h = torch.stack([ + torch.from_numpy(np.array([b.metainfo['camera_param']['h']])) + for b in batch_data_samples + ]) + else: + w = torch.stack([ + torch.empty((0), dtype=torch.float32) + for _ in batch_data_samples + ]) + h = torch.stack([ + torch.empty((0), dtype=torch.float32) + for _ in batch_data_samples + ]) + + factor = batch_data_samples[0].metainfo.get('factor', None) + if factor is not None: + factor = torch.stack([ + torch.from_numpy(b.metainfo['factor']) + for b in batch_data_samples + ]) + else: + factor = torch.stack([ + torch.empty((0), dtype=torch.float32) + for _ in batch_data_samples + ]) + + preds = self.decode((batch_coords, w, h, factor)) + + return preds + + def loss(self, + inputs: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_outputs = self.forward(inputs) + + lifting_target_label = torch.stack([ + d.gt_instance_labels.lifting_target_label + for d in batch_data_samples + ]) + lifting_target_weights = torch.stack([ + d.gt_instance_labels.lifting_target_weights + for d in batch_data_samples + ]) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_outputs, lifting_target_label, + lifting_target_weights.unsqueeze(-1)) + + losses.update(loss_pose3d=loss) + + # calculate accuracy + mpjpe_err = keypoint_mpjpe( + pred=to_numpy(pred_outputs), + gt=to_numpy(lifting_target_label), + mask=to_numpy(lifting_target_weights) > 0) + + 
mpjpe_pose = torch.tensor( + mpjpe_err, device=lifting_target_label.device) + losses.update(mpjpe=mpjpe_pose) + + return losses + + @property + def default_init_cfg(self): + init_cfg = [dict(type='TruncNormal', layer=['Linear'], std=0.02)] + return init_cfg diff --git a/mmpose/models/heads/regression_heads/temporal_regression_head.py b/mmpose/models/heads/regression_heads/temporal_regression_head.py index ac76316842..9ed2e9f4fa 100644 --- a/mmpose/models/heads/regression_heads/temporal_regression_head.py +++ b/mmpose/models/heads/regression_heads/temporal_regression_head.py @@ -101,7 +101,7 @@ def predict(self, else: target_root = torch.stack([ torch.empty((0), dtype=torch.float32) - for _ in batch_data_samples[0].metainfo + for _ in batch_data_samples ]) preds = self.decode((batch_coords, target_root)) diff --git a/mmpose/models/heads/regression_heads/trajectory_regression_head.py b/mmpose/models/heads/regression_heads/trajectory_regression_head.py index adfd7353d3..a1608aaae7 100644 --- a/mmpose/models/heads/regression_heads/trajectory_regression_head.py +++ b/mmpose/models/heads/regression_heads/trajectory_regression_head.py @@ -101,7 +101,7 @@ def predict(self, else: target_root = torch.stack([ torch.empty((0), dtype=torch.float32) - for _ in batch_data_samples[0].metainfo + for _ in batch_data_samples ]) preds = self.decode((batch_coords, target_root)) diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py index 9a64a4adfe..b50ad99f04 100644 --- a/mmpose/models/losses/regression_loss.py +++ b/mmpose/models/losses/regression_loss.py @@ -365,6 +365,84 @@ def forward(self, output, target, target_weight=None): return loss * self.loss_weight +@MODELS.register_module() +class MPJPEVelocityJointLoss(nn.Module): + """MPJPE (Mean Per Joint Position Error) loss. + + Args: + loss_weight (float): Weight of the loss. Default: 1.0. + lambda_scale (float): Factor of the N-MPJPE loss. Default: 0.5. + lambda_3d_velocity (float): Factor of the velocity loss. Default: 20.0. + """ + + def __init__(self, + use_target_weight=False, + loss_weight=1., + lambda_scale=0.5, + lambda_3d_velocity=20.0): + super().__init__() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + self.lambda_scale = lambda_scale + self.lambda_3d_velocity = lambda_3d_velocity + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N,K,D]): + Weights across different joint types. 
+ """ + norm_output = torch.mean( + torch.sum(torch.square(output), dim=-1, keepdim=True), + dim=-2, + keepdim=True) + norm_target = torch.mean( + torch.sum(target * output, dim=-1, keepdim=True), + dim=-2, + keepdim=True) + + velocity_output = output[..., 1:, :, :] - output[..., :-1, :, :] + velocity_target = target[..., 1:, :, :] - target[..., :-1, :, :] + + if self.use_target_weight: + assert target_weight is not None + mpjpe = torch.mean( + torch.norm((output - target) * target_weight, dim=-1)) + + nmpjpe = torch.mean( + torch.norm( + (norm_target / norm_output * output - target) * + target_weight, + dim=-1)) + + loss_3d_velocity = torch.mean( + torch.norm( + (velocity_output - velocity_target) * target_weight, + dim=-1)) + else: + mpjpe = torch.mean(torch.norm(output - target, dim=-1)) + + nmpjpe = torch.mean( + torch.norm( + norm_target / norm_output * output - target, dim=-1)) + + loss_3d_velocity = torch.mean( + torch.norm(velocity_output - velocity_target, dim=-1)) + + loss = mpjpe + nmpjpe * self.lambda_scale + \ + loss_3d_velocity * self.lambda_3d_velocity + + return loss * self.loss_weight + + @MODELS.register_module() class MPJPELoss(nn.Module): """MPJPE (Mean Per Joint Position Error) loss. diff --git a/tests/test_codecs/test_image_pose_lifting.py b/tests/test_codecs/test_image_pose_lifting.py index bb94786c32..78b19ec59b 100644 --- a/tests/test_codecs/test_image_pose_lifting.py +++ b/tests/test_codecs/test_image_pose_lifting.py @@ -13,14 +13,18 @@ def setUp(self) -> None: keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [192, 256] keypoints = np.round(keypoints).astype(np.float32) keypoints_visible = np.random.randint(2, size=(1, 17)) - lifting_target = (0.1 + 0.8 * np.random.rand(17, 3)) - lifting_target_visible = np.random.randint(2, size=(17, )) + lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + lifting_target_visible = np.random.randint( + 2, size=( + 1, + 17, + )) encoded_wo_sigma = np.random.rand(1, 17, 3) self.keypoints_mean = np.random.rand(17, 2).astype(np.float32) self.keypoints_std = np.random.rand(17, 2).astype(np.float32) + 1e-6 - self.target_mean = np.random.rand(17, 3).astype(np.float32) - self.target_std = np.random.rand(17, 3).astype(np.float32) + 1e-6 + self.target_mean = np.random.rand(1, 17, 3).astype(np.float32) + self.target_std = np.random.rand(1, 17, 3).astype(np.float32) + 1e-6 self.data = dict( keypoints=keypoints, @@ -30,7 +34,11 @@ def setUp(self) -> None: encoded_wo_sigma=encoded_wo_sigma) def build_pose_lifting_label(self, **kwargs): - cfg = dict(type='ImagePoseLifting', num_keypoints=17, root_index=0) + cfg = dict( + type='ImagePoseLifting', + num_keypoints=17, + root_index=0, + reshape_keypoints=False) cfg.update(kwargs) return KEYPOINT_CODECS.build(cfg) @@ -50,10 +58,19 @@ def test_encode(self): lifting_target_visible) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, (17, )) - self.assertEqual(encoded['trajectory_weights'].shape, (17, )) - self.assertEqual(encoded['target_root'].shape, (3, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['target_root'].shape, ( + 1, + 3, + )) # test removing root codec = self.build_pose_lifting_label( @@ -63,10 +80,16 @@ def test_encode(self): 
self.assertTrue('target_root_removed' in encoded and 'target_root_index' in encoded) - self.assertEqual(encoded['lifting_target_weights'].shape, (16, )) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 16, + )) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (16, 3)) - self.assertEqual(encoded['target_root'].shape, (3, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 16, 3)) + self.assertEqual(encoded['target_root'].shape, ( + 1, + 3, + )) # test normalization codec = self.build_pose_lifting_label( @@ -78,7 +101,7 @@ def test_encode(self): lifting_target_visible) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (17, 3)) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) def test_decode(self): lifting_target = self.data['lifting_target'] @@ -112,12 +135,10 @@ def test_cicular_verification(self): lifting_target_visible) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) # test removing root codec = self.build_pose_lifting_label(remove_root=True) @@ -125,12 +146,10 @@ def test_cicular_verification(self): lifting_target_visible) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) # test normalization codec = self.build_pose_lifting_label( @@ -142,9 +161,7 @@ def test_cicular_verification(self): lifting_target_visible) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) diff --git a/tests/test_codecs/test_motionbert_label.py b/tests/test_codecs/test_motionbert_label.py new file mode 100644 index 0000000000..01c9c654a2 --- /dev/null +++ b/tests/test_codecs/test_motionbert_label.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
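The reshaped codec tests above all exercise the same encode/decode round trip. The sketch below condenses that pattern; it is illustrative only, assuming the `ImagePoseLifting` entry in `KEYPOINT_CODECS` and the (1, K, D) input shapes this patch introduces, with random arrays standing in for real annotations.

```python
import numpy as np
from mmpose.registry import KEYPOINT_CODECS

# Build the codec the same way the tests above do: one target frame,
# 17 keypoints, root at index 0, no flattening of the 2D inputs.
codec = KEYPOINT_CODECS.build(
    dict(
        type='ImagePoseLifting',
        num_keypoints=17,
        root_index=0,
        reshape_keypoints=False))

keypoints = (np.random.rand(1, 17, 2) * [192, 256]).astype(np.float32)
keypoints_visible = np.ones((1, 17), dtype=np.float32)
lifting_target = np.random.rand(1, 17, 3).astype(np.float32)
lifting_target_visible = np.ones((1, 17), dtype=np.float32)

# Encode: the 3D target is root-centered and the root is kept separately.
encoded = codec.encode(keypoints, keypoints_visible, lifting_target,
                       lifting_target_visible)
assert encoded['lifting_target_label'].shape == (1, 17, 3)
assert encoded['target_root'].shape == (1, 3)

# Decode: restore the global 3D pose from the stored root position.
decoded, scores = codec.decode(
    encoded['lifting_target_label'],
    target_root=lifting_target[..., 0, :])
assert decoded.shape == (1, 17, 3)
```

`MotionBERTLabel`, whose tests start below, follows the same interface but additionally consumes camera parameters and a per-sequence factor during encoding and decoding.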
+import os.path as osp +from unittest import TestCase + +import numpy as np +from mmengine.fileio import load + +from mmpose.codecs import MotionBERTLabel +from mmpose.registry import KEYPOINT_CODECS + + +class TestMotionBERTLabel(TestCase): + + def get_camera_param(self, imgname, camera_param) -> dict: + """Get camera parameters of a frame by its image name.""" + subj, rest = osp.basename(imgname).split('_', 1) + action, rest = rest.split('.', 1) + camera, rest = rest.split('_', 1) + return camera_param[(subj, camera)] + + def build_pose_lifting_label(self, **kwargs): + cfg = dict(type='MotionBERTLabel', num_keypoints=17) + cfg.update(kwargs) + return KEYPOINT_CODECS.build(cfg) + + def setUp(self) -> None: + keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [1000, 1002] + keypoints = np.round(keypoints).astype(np.float32) + keypoints_visible = np.random.randint(2, size=(1, 17)) + lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + lifting_target_visible = np.random.randint( + 2, size=( + 1, + 17, + )) + encoded_wo_sigma = np.random.rand(1, 17, 3) + + camera_param = load('tests/data/h36m/cameras.pkl') + camera_param = self.get_camera_param( + 'S1/S1_Directions_1.54138969/S1_Directions_1.54138969_000001.jpg', + camera_param) + factor = 0.1 + 5 * np.random.rand(1, ) + + self.data = dict( + keypoints=keypoints, + keypoints_visible=keypoints_visible, + lifting_target=lifting_target, + lifting_target_visible=lifting_target_visible, + camera_param=camera_param, + factor=factor, + encoded_wo_sigma=encoded_wo_sigma) + + def test_build(self): + codec = self.build_pose_lifting_label() + self.assertIsInstance(codec, MotionBERTLabel) + + def test_encode(self): + keypoints = self.data['keypoints'] + keypoints_visible = self.data['keypoints_visible'] + lifting_target = self.data['lifting_target'] + lifting_target_visible = self.data['lifting_target_visible'] + camera_param = self.data['camera_param'] + factor = self.data['factor'] + + # test default settings + codec = self.build_pose_lifting_label() + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param, factor) + + self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) + + # test concatenating visibility + codec = self.build_pose_lifting_label(concat_vis=True) + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param, factor) + + self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + + def test_decode(self): + encoded_wo_sigma = self.data['encoded_wo_sigma'] + camera_param = self.data['camera_param'] + + # test default settings + codec = self.build_pose_lifting_label() + + decoded, scores = codec.decode(encoded_wo_sigma) + + self.assertEqual(decoded.shape, (1, 17, 3)) + self.assertEqual(scores.shape, (1, 17)) + + # test denormalize according to image shape + codec = self.build_pose_lifting_label() + + decoded, scores = codec.decode( + encoded_wo_sigma, + w=np.array([camera_param['w']]), + h=np.array([camera_param['h']])) + + self.assertEqual(decoded.shape, (1, 17, 3)) + self.assertEqual(scores.shape, (1, 17)) + + # test with factor + codec = self.build_pose_lifting_label() + + decoded, scores = codec.decode( + encoded_wo_sigma, 
factor=np.array([0.23])) + + self.assertEqual(decoded.shape, (1, 17, 3)) + self.assertEqual(scores.shape, (1, 17)) + + def test_cicular_verification(self): + keypoints_visible = self.data['keypoints_visible'] + lifting_target = self.data['lifting_target'] + lifting_target_visible = self.data['lifting_target_visible'] + camera_param = self.data['camera_param'] + + # test denormalize according to image shape + keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + codec = self.build_pose_lifting_label() + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param) + + _keypoints, _ = codec.decode( + encoded['keypoint_labels'], + w=np.array([camera_param['w']]), + h=np.array([camera_param['h']])) + + keypoints[..., :, :] = keypoints[..., :, :] - keypoints[..., 0, :] + + self.assertTrue( + np.allclose(keypoints[..., :2] / 1000, _keypoints[..., :2])) + + # test with factor + keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + codec = self.build_pose_lifting_label() + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param) + + _keypoints, _ = codec.decode( + encoded['keypoint_labels'], + w=np.array([camera_param['w']]), + h=np.array([camera_param['h']]), + factor=encoded['factor']) + + keypoints *= encoded['factor'] + keypoints[..., :, :] = keypoints[..., :, :] - keypoints[..., 0, :] + + self.assertTrue( + np.allclose(keypoints[..., :2] / 1000, _keypoints[..., :2])) diff --git a/tests/test_codecs/test_video_pose_lifting.py b/tests/test_codecs/test_video_pose_lifting.py index cc58292d0c..31a095e927 100644 --- a/tests/test_codecs/test_video_pose_lifting.py +++ b/tests/test_codecs/test_video_pose_lifting.py @@ -19,7 +19,8 @@ def get_camera_param(self, imgname, camera_param) -> dict: return camera_param[(subj, camera)] def build_pose_lifting_label(self, **kwargs): - cfg = dict(type='VideoPoseLifting', num_keypoints=17) + cfg = dict( + type='VideoPoseLifting', num_keypoints=17, reshape_keypoints=False) cfg.update(kwargs) return KEYPOINT_CODECS.build(cfg) @@ -27,8 +28,12 @@ def setUp(self) -> None: keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [192, 256] keypoints = np.round(keypoints).astype(np.float32) keypoints_visible = np.random.randint(2, size=(1, 17)) - lifting_target = (0.1 + 0.8 * np.random.rand(17, 3)) - lifting_target_visible = np.random.randint(2, size=(17, )) + lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3)) + lifting_target_visible = np.random.randint( + 2, size=( + 1, + 17, + )) encoded_wo_sigma = np.random.rand(1, 17, 3) camera_param = load('tests/data/h36m/cameras.pkl') @@ -61,10 +66,19 @@ def test_encode(self): lifting_target_visible, camera_param) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, (17, )) - self.assertEqual(encoded['trajectory_weights'].shape, (17, )) - self.assertEqual(encoded['target_root'].shape, (3, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['target_root'].shape, ( + 1, + 3, + )) # test not zero-centering codec = self.build_pose_lifting_label(zero_center=False) @@ -72,9 +86,31 @@ def test_encode(self): lifting_target_visible, camera_param) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - 
self.assertEqual(encoded['lifting_target_label'].shape, (17, 3)) - self.assertEqual(encoded['lifting_target_weights'].shape, (17, )) - self.assertEqual(encoded['trajectory_weights'].shape, (17, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) + + # test reshape_keypoints + codec = self.build_pose_lifting_label(reshape_keypoints=True) + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param) + + self.assertEqual(encoded['keypoint_labels'].shape, (34, 1)) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 1, + 17, + )) # test removing root codec = self.build_pose_lifting_label( @@ -84,10 +120,16 @@ def test_encode(self): self.assertTrue('target_root_removed' in encoded and 'target_root_index' in encoded) - self.assertEqual(encoded['lifting_target_weights'].shape, (16, )) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 1, + 16, + )) self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(encoded['lifting_target_label'].shape, (16, 3)) - self.assertEqual(encoded['target_root'].shape, (3, )) + self.assertEqual(encoded['lifting_target_label'].shape, (1, 16, 3)) + self.assertEqual(encoded['target_root'].shape, ( + 1, + 3, + )) # test normalizing camera codec = self.build_pose_lifting_label(normalize_camera=True) @@ -102,6 +144,35 @@ def test_encode(self): encoded['camera_param']['f'], atol=4.)) + # test with multiple targets + keypoints = (0.1 + 0.8 * np.random.rand(2, 17, 2)) * [192, 256] + keypoints = np.round(keypoints).astype(np.float32) + keypoints_visible = np.random.randint(2, size=(2, 17)) + lifting_target = (0.1 + 0.8 * np.random.rand(2, 17, 3)) + lifting_target_visible = np.random.randint( + 2, size=( + 2, + 17, + )) + codec = self.build_pose_lifting_label() + encoded = codec.encode(keypoints, keypoints_visible, lifting_target, + lifting_target_visible, camera_param) + + self.assertEqual(encoded['keypoint_labels'].shape, (2, 17, 2)) + self.assertEqual(encoded['lifting_target_label'].shape, (2, 17, 3)) + self.assertEqual(encoded['lifting_target_weights'].shape, ( + 2, + 17, + )) + self.assertEqual(encoded['trajectory_weights'].shape, ( + 2, + 17, + )) + self.assertEqual(encoded['target_root'].shape, ( + 2, + 3, + )) + def test_decode(self): lifting_target = self.data['lifting_target'] encoded_wo_sigma = self.data['encoded_wo_sigma'] @@ -135,12 +206,10 @@ def test_cicular_verification(self): lifting_target_visible, camera_param) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) # test removing root codec = self.build_pose_lifting_label(remove_root=True) @@ -148,9 +217,7 @@ def test_cicular_verification(self): lifting_target_visible, camera_param) _keypoints, _ = codec.decode( - np.expand_dims(encoded['lifting_target_label'], axis=0), + encoded['lifting_target_label'], target_root=lifting_target[..., 0, :]) - self.assertTrue( - np.allclose( - np.expand_dims(lifting_target, axis=0), _keypoints, 
atol=5.)) + self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.)) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py index 88944dc11f..fd6cdf5f17 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py @@ -116,6 +116,17 @@ def test_topdown(self): self.assertEqual(len(dataset), 4) self.check_data_info_keys(dataset[0]) + dataset = self.build_h36m_dataset( + data_mode='topdown', + seq_len=1, + seq_step=1, + multiple_target=1, + causal=False, + pad_video_seq=True, + camera_param_file='cameras.pkl') + self.assertEqual(len(dataset), 4) + self.check_data_info_keys(dataset[0]) + # test topdown testing with 2d keypoint detection file and # sequence config dataset = self.build_h36m_dataset( diff --git a/tests/test_datasets/test_transforms/test_pose3d_transforms.py b/tests/test_datasets/test_transforms/test_pose3d_transforms.py index 5f5d5aa096..b87931bb74 100644 --- a/tests/test_datasets/test_transforms/test_pose3d_transforms.py +++ b/tests/test_datasets/test_transforms/test_pose3d_transforms.py @@ -35,7 +35,7 @@ def _parse_h36m_imgname(imgname): scales = data['scale'].astype(np.float32) idx = 0 - target_idx = 0 + target_idx = [0] data_info = { 'keypoints': keypoints[idx, :, :2].reshape(1, -1, 2), @@ -52,7 +52,6 @@ def _parse_h36m_imgname(imgname): 'sample_idx': idx, 'lifting_target': keypoints_3d[target_idx, :, :3], 'lifting_target_visible': keypoints_3d[target_idx, :, 3], - 'target_img_path': osp.join('tests/data/h36m', imgnames[target_idx]), } # add camera parameters @@ -108,9 +107,12 @@ def test_transform(self): tar_vis2 = results['lifting_target_visible'] self.assertEqual(kpts_vis2.shape, (1, 17)) - self.assertEqual(tar_vis2.shape, (17, )) + self.assertEqual(tar_vis2.shape, ( + 1, + 17, + )) self.assertEqual(kpts2.shape, (1, 17, 2)) - self.assertEqual(tar2.shape, (17, 3)) + self.assertEqual(tar2.shape, (1, 17, 3)) flip_indices = [ 0, 4, 5, 6, 1, 2, 3, 7, 8, 9, 10, 14, 15, 16, 11, 12, 13 @@ -121,12 +123,15 @@ def test_transform(self): self.assertTrue( np.allclose(kpts1[0][left][1:], kpts2[0][right][1:], atol=4.)) self.assertTrue( - np.allclose(tar1[left][1:], tar2[right][1:], atol=4.)) + np.allclose( + tar1[..., left, 1:], tar2[..., right, 1:], atol=4.)) self.assertTrue( - np.allclose(kpts_vis1[0][left], kpts_vis2[0][right], atol=4.)) + np.allclose( + kpts_vis1[..., left], kpts_vis2[..., right], atol=4.)) self.assertTrue( - np.allclose(tar_vis1[left], tar_vis2[right], atol=4.)) + np.allclose( + tar_vis1[..., left], tar_vis2[..., right], atol=4.)) # test camera flipping transform = RandomFlipAroundRoot( @@ -148,3 +153,23 @@ def test_transform(self): -self.data_info['camera_param']['p'][0], camera2['p'][0], atol=4.)) + + # test flipping w.r.t. 
image + transform = RandomFlipAroundRoot({}, {}, flip_prob=1, flip_image=True) + results = deepcopy(self.data_info) + results = transform(results) + kpts2 = results['keypoints'] + tar2 = results['lifting_target'] + + camera_param = results['camera_param'] + for left, right in enumerate(flip_indices): + self.assertTrue( + np.allclose( + camera_param['w'] - kpts1[0][left][:1], + kpts2[0][right][:1], + atol=4.)) + self.assertTrue( + np.allclose(kpts1[0][left][1:], kpts2[0][right][1:], atol=4.)) + self.assertTrue( + np.allclose( + tar1[..., left, 1:], tar2[..., right, 1:], atol=4.)) diff --git a/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py b/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py index 8289b09d0f..391b7b194a 100644 --- a/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py +++ b/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py @@ -20,9 +20,10 @@ def setUp(self): for i in range(self.batch_size): gt_instances = InstanceData() keypoints = np.random.random((1, num_keypoints, 3)) - gt_instances.lifting_target = np.random.random((num_keypoints, 3)) + gt_instances.lifting_target = np.random.random( + (1, num_keypoints, 3)) gt_instances.lifting_target_visible = np.ones( - (num_keypoints, 1)).astype(bool) + (1, num_keypoints, 1)).astype(bool) pred_instances = InstanceData() pred_instances.keypoints = keypoints + np.random.normal( @@ -32,8 +33,10 @@ def setUp(self): data_sample = PoseDataSample( gt_instances=gt_instances, pred_instances=pred_instances) data_sample.set_metainfo( - dict(target_img_path='tests/data/h36m/S7/' - 'S7_Greeting.55011271/S7_Greeting.55011271_000396.jpg')) + dict(target_img_path=[ + 'tests/data/h36m/S7/' + 'S7_Greeting.55011271/S7_Greeting.55011271_000396.jpg' + ])) self.data_batch.append(data) self.data_samples.append(data_sample.to_dict()) diff --git a/tests/test_models/test_backbones/test_dstformer.py b/tests/test_models/test_backbones/test_dstformer.py new file mode 100644 index 0000000000..966ed6f49b --- /dev/null +++ b/tests/test_models/test_backbones/test_dstformer.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
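At bottom, the reshaped 3D metric tests above reduce to a masked mean of per-joint Euclidean distances. Here is a minimal, self-contained sketch using `keypoint_mpjpe` (the same functional the new regression head calls for its training-time accuracy); the arrays are synthetic stand-ins, not real predictions.

```python
import numpy as np
from mmpose.evaluation.functional import keypoint_mpjpe

# Synthetic batch: N samples, K joints, 3D coordinates.
pred = np.random.rand(4, 17, 3)
gt = pred + np.random.normal(scale=0.01, size=pred.shape)  # gt close to pred
mask = np.ones((4, 17), dtype=bool)  # evaluate every joint of every sample

# Mean per-joint position error: average Euclidean distance over the mask.
error = keypoint_mpjpe(pred=pred, gt=gt, mask=mask)
print(f'MPJPE: {error:.4f}')
```

The P-MPJPE numbers reported alongside MPJPE apply a rigid (Procrustes) alignment between prediction and ground truth before taking the same distance.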
+from unittest import TestCase + +import torch + +from mmpose.models.backbones import DSTFormer +from mmpose.models.backbones.dstformer import AttentionBlock + + +class TestDSTFormer(TestCase): + + def test_attention_block(self): + # BasicTemporalBlock with causal == False + block = AttentionBlock(dim=256, num_heads=2) + x = torch.rand(2, 17, 256) + x_out = block(x) + self.assertEqual(x_out.shape, torch.Size([2, 17, 256])) + + def test_DSTFormer(self): + # Test DSTFormer with depth=2 + model = DSTFormer(in_channels=3, depth=2, seq_len=2) + pose3d = torch.rand((1, 2, 17, 3)) + feat = model(pose3d) + self.assertEqual(feat[0].shape, (2, 17, 256)) + + # Test DSTFormer with depth=4 and qkv_bias=False + model = DSTFormer(in_channels=3, depth=4, seq_len=2, qkv_bias=False) + pose3d = torch.rand((1, 2, 17, 3)) + feat = model(pose3d) + self.assertEqual(feat[0].shape, (2, 17, 256)) + + # Test DSTFormer with depth=4 and att_fuse=False + model = DSTFormer(in_channels=3, depth=4, seq_len=2, att_fuse=False) + pose3d = torch.rand((1, 2, 17, 3)) + feat = model(pose3d) + self.assertEqual(feat[0].shape, (2, 17, 256)) From 97823685777869fbc160f5fd0a971db4934c8a10 Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Fri, 14 Jul 2023 18:44:37 +0800 Subject: [PATCH 07/37] [Fix] Fix demo scripts (#2542) --- demo/body3d_pose_lifter_demo.py | 9 +++++---- demo/bottomup_demo.py | 9 +++++---- demo/topdown_demo_with_mmdet.py | 9 +++++---- projects/just_dance/README.md | 2 +- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/demo/body3d_pose_lifter_demo.py b/demo/body3d_pose_lifter_demo.py index 840cd4edc9..9802a0a5c8 100644 --- a/demo/body3d_pose_lifter_demo.py +++ b/demo/body3d_pose_lifter_demo.py @@ -452,10 +452,11 @@ def main(): video_writer.write(mmcv.rgb2bgr(frame_vis)) - # press ESC to exit - if cv2.waitKey(5) & 0xFF == 27: - break - time.sleep(args.show_interval) + if args.show: + # press ESC to exit + if cv2.waitKey(5) & 0xFF == 27: + break + time.sleep(args.show_interval) video.release() diff --git a/demo/bottomup_demo.py b/demo/bottomup_demo.py index 3d6fee7a03..c4b13eff55 100644 --- a/demo/bottomup_demo.py +++ b/demo/bottomup_demo.py @@ -196,11 +196,12 @@ def main(): video_writer.write(mmcv.rgb2bgr(frame_vis)) - # press ESC to exit - if cv2.waitKey(5) & 0xFF == 27: - break + if args.show: + # press ESC to exit + if cv2.waitKey(5) & 0xFF == 27: + break - time.sleep(args.show_interval) + time.sleep(args.show_interval) if video_writer: video_writer.release() diff --git a/demo/topdown_demo_with_mmdet.py b/demo/topdown_demo_with_mmdet.py index 38f4e92e4e..53b25eaf73 100644 --- a/demo/topdown_demo_with_mmdet.py +++ b/demo/topdown_demo_with_mmdet.py @@ -261,11 +261,12 @@ def main(): video_writer.write(mmcv.rgb2bgr(frame_vis)) - # press ESC to exit - if cv2.waitKey(5) & 0xFF == 27: - break + if args.show: + # press ESC to exit + if cv2.waitKey(5) & 0xFF == 27: + break - time.sleep(args.show_interval) + time.sleep(args.show_interval) if video_writer: video_writer.release() diff --git a/projects/just_dance/README.md b/projects/just_dance/README.md index 1255996766..70390215f9 100644 --- a/projects/just_dance/README.md +++ b/projects/just_dance/README.md @@ -17,7 +17,7 @@ We provide a Jupyter Notebook [`just_dance_demo.ipynb`](./just_dance_demo.ipynb) Users can simply run the following command to generate the comparison video: ```shell -python process_video ${TEACHER_VIDEO} ${STUDENT_VIDEO} +python process_video.py ${TEACHER_VIDEO} ${STUDENT_VIDEO} ``` ### Gradio From 4f35db073952fc18e021c74f85677aee9e7a9d9c Mon 
Sep 17 00:00:00 2001 From: Peng Lu Date: Mon, 17 Jul 2023 11:43:50 +0800 Subject: [PATCH 08/37] [Fix] Fix Pose3dInferencer keypoint shape bug (#2543) --- mmpose/apis/inferencers/pose3d_inferencer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mmpose/apis/inferencers/pose3d_inferencer.py b/mmpose/apis/inferencers/pose3d_inferencer.py index 819273af66..bbb28ae27b 100644 --- a/mmpose/apis/inferencers/pose3d_inferencer.py +++ b/mmpose/apis/inferencers/pose3d_inferencer.py @@ -272,6 +272,7 @@ def preprocess_single(self, ), dtype=np.float32) data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32) + data_info['factor'] = np.zeros((T, ), dtype=np.float32) data_info['lifting_target_visible'] = np.ones((1, K, 1), dtype=np.float32) data_info['camera_param'] = dict(w=width, h=height) @@ -299,7 +300,6 @@ def forward(self, list: A list of data samples, each containing the model's output results. """ - pose_lift_results = self.model.test_step(inputs) # Post-processing of pose estimation results @@ -309,8 +309,16 @@ def forward(self, pose_lift_res.track_id = pose_est_results_converted[idx].get( 'track_id', 1e4) - # Invert x and z values of the keypoints + # align the shape of output keypoints coordinates and scores keypoints = pose_lift_res.pred_instances.keypoints + keypoint_scores = pose_lift_res.pred_instances.keypoint_scores + if keypoint_scores.ndim == 3: + pose_lift_results[idx].pred_instances.keypoint_scores = \ + np.squeeze(keypoint_scores, axis=1) + if keypoints.ndim == 4: + keypoints = np.squeeze(keypoints, axis=1) + + # Invert x and z values of the keypoints keypoints = keypoints[..., [0, 2, 1]] keypoints[..., 0] = -keypoints[..., 0] keypoints[..., 2] = -keypoints[..., 2] From 5e862ccc9642d8f92c0d5877c7fb6b0506316e47 Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Mon, 17 Jul 2023 18:43:00 +0800 Subject: [PATCH 09/37] [Enhance] Add notifications when saving visualization results (#2545) --- demo/body3d_pose_lifter_demo.py | 9 +++++++++ demo/bottomup_demo.py | 9 +++++++++ demo/image_demo.py | 8 ++++++++ demo/topdown_demo_with_mmdet.py | 9 +++++++++ mmpose/apis/inferencers/base_mmpose_inferencer.py | 12 ++++++++++++ mmpose/apis/inferencers/pose3d_inferencer.py | 7 +++++++ 6 files changed, 54 insertions(+) diff --git a/demo/body3d_pose_lifter_demo.py b/demo/body3d_pose_lifter_demo.py index 9802a0a5c8..256894fb3c 100644 --- a/demo/body3d_pose_lifter_demo.py +++ b/demo/body3d_pose_lifter_demo.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging import mimetypes import os import time @@ -10,6 +11,7 @@ import mmcv import mmengine import numpy as np +from mmengine.logging import print_log from mmengine.structures import InstanceData from mmpose.apis import (_track_by_iou, _track_by_oks, collect_multi_frames, @@ -477,6 +479,13 @@ def main(): indent='\t') print(f'predictions have been saved at {args.pred_save_path}') + if save_output: + input_type = input_type.replace('webcam', 'video') + print_log( + f'the output {input_type} has been saved at {output_file}', + logger='current', + level=logging.INFO) + if __name__ == '__main__': main() diff --git a/demo/bottomup_demo.py b/demo/bottomup_demo.py index c4b13eff55..b493e4c4a1 100644 --- a/demo/bottomup_demo.py +++ b/demo/bottomup_demo.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import logging import mimetypes import os import time @@ -9,6 +10,7 @@ import mmcv import mmengine import numpy as np +from mmengine.logging import print_log from mmpose.apis import inference_bottomup, init_model from mmpose.registry import VISUALIZERS @@ -223,6 +225,13 @@ def main(): indent='\t') print(f'predictions have been saved at {args.pred_save_path}') + if output_file: + input_type = input_type.replace('webcam', 'video') + print_log( + f'the output {input_type} has been saved at {output_file}', + logger='current', + level=logging.INFO) + if __name__ == '__main__': main() diff --git a/demo/image_demo.py b/demo/image_demo.py index bfbc808b1e..6a408d1760 100644 --- a/demo/image_demo.py +++ b/demo/image_demo.py @@ -1,7 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging from argparse import ArgumentParser from mmcv.image import imread +from mmengine.logging import print_log from mmpose.apis import inference_topdown, init_model from mmpose.registry import VISUALIZERS @@ -100,6 +102,12 @@ def main(): show=args.show, out_file=args.out_file) + if args.out_file is not None: + print_log( + f'the output image has been saved at {args.out_file}', + logger='current', + level=logging.INFO) + if __name__ == '__main__': main() diff --git a/demo/topdown_demo_with_mmdet.py b/demo/topdown_demo_with_mmdet.py index 53b25eaf73..4e39c36207 100644 --- a/demo/topdown_demo_with_mmdet.py +++ b/demo/topdown_demo_with_mmdet.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging import mimetypes import os import time @@ -9,6 +10,7 @@ import mmcv import mmengine import numpy as np +from mmengine.logging import print_log from mmpose.apis import inference_topdown from mmpose.apis import init_model as init_pose_estimator @@ -288,6 +290,13 @@ def main(): indent='\t') print(f'predictions have been saved at {args.pred_save_path}') + if output_file: + input_type = input_type.replace('webcam', 'video') + print_log( + f'the output {input_type} has been saved at {output_file}', + logger='current', + level=logging.INFO) + if __name__ == '__main__': main() diff --git a/mmpose/apis/inferencers/base_mmpose_inferencer.py b/mmpose/apis/inferencers/base_mmpose_inferencer.py index bed28b90d7..518a5104fa 100644 --- a/mmpose/apis/inferencers/base_mmpose_inferencer.py +++ b/mmpose/apis/inferencers/base_mmpose_inferencer.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import logging import mimetypes import os import warnings @@ -16,6 +17,7 @@ from mmengine.fileio import (get_file_backend, isdir, join_path, list_dir_or_file) from mmengine.infer.infer import BaseInferencer +from mmengine.logging import print_log from mmengine.registry import init_default_scope from mmengine.runner.checkpoint import _load_checkpoint_to_model from mmengine.structures import InstanceData @@ -358,6 +360,7 @@ def visualize(self, file_name = os.path.basename( self.video_info['name']) out_file = join_path(dir_name, file_name) + self.video_info['output_file'] = out_file self.video_info['writer'] = cv2.VideoWriter( out_file, fourcc, self.video_info['fps'], (visualization.shape[1], visualization.shape[0])) @@ -367,6 +370,10 @@ def visualize(self, file_name = file_name if file_name else img_name out_file = join_path(dir_name, file_name) mmcv.imwrite(out_img, out_file) + print_log( + f'the output image has been saved at {out_file}', + logger='current', + level=logging.INFO) if return_vis: return results @@ -454,6 +461,11 @@ def _finalize_video_processing( # Release the video writer if it exists if self.video_info['writer'] is not None: + out_file = self.video_info['output_file'] + print_log( + f'the output video has been saved at {out_file}', + logger='current', + level=logging.INFO) self.video_info['writer'].release() # Save predictions diff --git a/mmpose/apis/inferencers/pose3d_inferencer.py b/mmpose/apis/inferencers/pose3d_inferencer.py index bbb28ae27b..6afc70f62d 100644 --- a/mmpose/apis/inferencers/pose3d_inferencer.py +++ b/mmpose/apis/inferencers/pose3d_inferencer.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import logging import os import warnings from collections import defaultdict @@ -12,6 +13,7 @@ from mmengine.config import Config, ConfigDict from mmengine.fileio import join_path from mmengine.infer.infer import ModelType +from mmengine.logging import print_log from mmengine.model import revert_sync_batchnorm from mmengine.registry import init_default_scope from mmengine.structures import InstanceData @@ -509,6 +511,7 @@ def visualize(self, file_name = os.path.basename( self.video_info['name']) out_file = join_path(dir_name, file_name) + self.video_info['output_file'] = out_file self.video_info['writer'] = cv2.VideoWriter( out_file, fourcc, self.video_info['fps'], (visualization.shape[1], visualization.shape[0])) @@ -519,6 +522,10 @@ def visualize(self, file_name = file_name if file_name else img_name out_file = join_path(dir_name, file_name) mmcv.imwrite(out_img, out_file) + print_log( + f'the output image has been saved at {out_file}', + logger='current', + level=logging.INFO) if return_vis: return results From f0311df83ba19ce2ac34c77f92b696985f7642f5 Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Thu, 20 Jul 2023 14:52:15 +0800 Subject: [PATCH 10/37] [Fix] MotionBERT training and flip-test (#2548) --- .../pose_lift/h36m/motionbert_h36m.md | 12 +- .../pose_lift/h36m/motionbert_h36m.yml | 10 +- ...ionbert-243frm_8xb32-120e_h36m-original.py | 137 +++++++++++++++++ ...-lift_motionbert-243frm_8xb32-120e_h36m.py | 16 +- ...nbert-ft-243frm_8xb32-60e_h36m-original.py | 142 ++++++++++++++++++ ...ift_motionbert-ft-243frm_8xb32-60e_h36m.py | 141 +++++++++++++++++ mmpose/codecs/motionbert_label.py | 24 +-- .../datasets/transforms/pose3d_transforms.py | 74 +++++---- .../motion_regression_head.py | 20 ++- mmpose/models/losses/__init__.py | 6 +- mmpose/models/pose_estimators/pose_lifter.py | 19 ++- tests/test_codecs/test_motionbert_label.py | 4 - 
.../test_transforms/test_pose3d_transforms.py | 46 ++++-- 13 files changed, 571 insertions(+), 80 deletions(-) create mode 100644 configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py create mode 100644 configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m-original.py create mode 100644 configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m.py diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md index d830d65c18..93cd29eddd 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md @@ -40,14 +40,16 @@ Testing results on Human3.6M dataset with ground truth 2D detections | Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | | :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | -| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 35.3 | 27.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | -| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 27.4 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 34.5 | 34.6 | 27.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 26.9 | 26.8 | 21.0 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | -Testing results on Human3.6M dataset from the [official repo](https://github.com/Walter0807/MotionBERT) with ground truth 2D detections +Testing results on Human3.6M dataset converted from the [official repo](https://github.com/Walter0807/MotionBERT)1 with ground truth 2D detections | Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | | :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | -| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 40.5 | 39.9 | 34.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | -| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 38.2 | 37.7 | 32.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| 
[MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | + +1 To test with the dataset from official repo, please download the [test annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_test_original.npz), [train annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_train_original.npz) and [factors](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_factors.npy) under `$MMPOSE/data/h36m/annotation_body3d/fps50`. *Models with * are converted from the [official repo](https://github.com/Walter0807/MotionBERT). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml index 7257fea5a6..11ab4bb382 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml @@ -15,11 +15,11 @@ Models: Results: - Dataset: Human3.6M Metrics: - MPJPE: 35.3 - P-MPJPE: 27.7 + MPJPE: 34.5 + P-MPJPE: 27.1 Task: Body 3D Keypoint Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth -- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert_8xb32-120e_h36m.py +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft_8xb32-120e_h36m.py In Collection: MotionBERT Metadata: Architecture: *id001 @@ -28,7 +28,7 @@ Models: Results: - Dataset: Human3.6M Metrics: - MPJPE: 27.5 - P-MPJPE: 21.6 + MPJPE: 26.9 + P-MPJPE: 21.0 Task: Body 3D Keypoint Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py new file mode 100644 index 0000000000..032188f389 --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py @@ -0,0 +1,137 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=120, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + 
mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train_original.npz', + seq_len=1, + multiple_target=243, + multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test_original.npz', + factor_file='annotation_body3d/fps50/h36m_factors.npy', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py index 88f6c3897d..25b9d216a2 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py @@ -32,11 +32,7 @@ # codec settings train_codec = dict( - type='MotionBERTLabel', - num_keypoints=17, - concat_vis=True, - rootrel=True, - factor_label=False) + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') val_codec = dict( type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) @@ -61,7 +57,7 @@ loss=dict(type='MPJPEVelocityJointLoss'), decoder=val_codec, ), -) + test_cfg=dict(flip_test=True)) # base dataset settings dataset_type = 'Human36mDataset' @@ -69,12 +65,12 @@ # pipelines train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), dict( type='RandomFlipAroundRoot', - keypoints_flip_cfg={}, - target_flip_cfg={}, - flip_image=True), - dict(type='GenerateTarget', encoder=train_codec), + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + 
target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), dict( type='PackPoseInputs', meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m-original.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m-original.py new file mode 100644 index 0000000000..9c2aa3697a --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m-original.py @@ -0,0 +1,142 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=60, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/' + 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'), +) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train_original.npz', + seq_len=1, + multiple_target=243, + multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
ann_file='annotation_body3d/fps50/h36m_test_original.npz', + factor_file='annotation_body3d/fps50/h36m_factors.npy', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m.py new file mode 100644 index 0000000000..5c42e62a60 --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m.py @@ -0,0 +1,141 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=60, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/' + 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'), +) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + multiple_target=243, + 
multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/mmpose/codecs/motionbert_label.py b/mmpose/codecs/motionbert_label.py index d0c8cd0d40..08ff4ccd1a 100644 --- a/mmpose/codecs/motionbert_label.py +++ b/mmpose/codecs/motionbert_label.py @@ -34,8 +34,8 @@ class MotionBERTLabel(BaseKeypointCodec): Default: ``False``. rootrel (bool): If true, the root keypoint will be set to the coordinate origin. Default: ``False``. - factor_label (bool): If true, the label will be multiplied by a factor. - Default: ``True``. + mode (str): Indicating whether the current mode is 'train' or 'test'. + Default: ``'test'``. """ auxiliary_encode_keys = { @@ -49,7 +49,7 @@ def __init__(self, save_index: bool = False, concat_vis: bool = False, rootrel: bool = False, - factor_label: bool = True): + mode: str = 'test'): super().__init__() self.num_keypoints = num_keypoints @@ -58,7 +58,8 @@ def __init__(self, self.save_index = save_index self.concat_vis = concat_vis self.rootrel = rootrel - self.factor_label = factor_label + assert mode.lower() in {'train', 'test'} + self.mode = mode.lower() def encode(self, keypoints: np.ndarray, @@ -92,8 +93,6 @@ def encode(self, shape (K, C) or (K-1, C). - lifting_target_weights (np.ndarray): The target weights in shape (K, ) or (K-1, ). - - trajectory_weights (np.ndarray): The trajectory weights in - shape (K, ). - factor (np.ndarray): The factor mapping camera and image coordinate in shape (T, 1). 
""" @@ -104,16 +103,13 @@ def encode(self, lifting_target = [keypoints[..., 0, :, :]] # set initial value for `lifting_target_weights` - # and `trajectory_weights` if lifting_target_visible is None: lifting_target_visible = np.ones( lifting_target.shape[:-1], dtype=np.float32) lifting_target_weights = lifting_target_visible - trajectory_weights = (1 / lifting_target[:, 2]) else: valid = lifting_target_visible > 0.5 lifting_target_weights = np.where(valid, 1., 0.).astype(np.float32) - trajectory_weights = lifting_target_weights if camera_param is None: camera_param = dict() @@ -140,6 +136,13 @@ def encode(self, if 'f' in _camera_param and 'c' in _camera_param: lifting_target_label, factor_ = camera_to_image_coord( self.root_index, lifting_target_label, _camera_param) + if self.mode == 'train': + w, h = w / 1000, h / 1000 + lifting_target_label[ + ..., :2] = lifting_target_label[..., :2] / w * 2 - [ + 0.001, h / w + ] + lifting_target_label[..., 2] = lifting_target_label[..., 2] / w * 2 lifting_target_label[..., :, :] = lifting_target_label[ ..., :, :] - lifting_target_label[..., self.root_index:self.root_index + @@ -148,7 +151,7 @@ def encode(self, factor = factor_ if factor.ndim == 1: factor = factor[:, None] - if self.factor_label: + if self.mode == 'test': lifting_target_label *= factor[..., None] if self.concat_vis: @@ -164,7 +167,6 @@ def encode(self, encoded['lifting_target_weights'] = lifting_target_weights encoded['lifting_target'] = lifting_target_label encoded['lifting_target_visible'] = lifting_target_visible - encoded['trajectory_weights'] = trajectory_weights encoded['factor'] = factor return encoded diff --git a/mmpose/datasets/transforms/pose3d_transforms.py b/mmpose/datasets/transforms/pose3d_transforms.py index 2149d7cb30..5831692000 100644 --- a/mmpose/datasets/transforms/pose3d_transforms.py +++ b/mmpose/datasets/transforms/pose3d_transforms.py @@ -25,16 +25,22 @@ class RandomFlipAroundRoot(BaseTransform): flip_prob (float): Probability of flip. Default: 0.5. flip_camera (bool): Whether to flip horizontal distortion coefficients. Default: ``False``. - flip_image (bool): Whether to flip keypoints horizontally according - to image size. Default: ``False``. + flip_label (bool): Whether to flip labels instead of data. + Default: ``False``. Required keys: - keypoints - lifting_target + - keypoints or keypoint_labels + - lifting_target or lifting_target_label + - keypoints_visible or keypoint_labels_visible (optional) + - lifting_target_visible (optional) + - flip_indices (optional) Modified keys: - (keypoints, keypoints_visible, lifting_target, lifting_target_visible, - camera_param) + - keypoints or keypoint_labels (optional) + - keypoints_visible or keypoint_labels_visible (optional) + - lifting_target or lifting_target_label (optional) + - lifting_target_visible (optional) + - camera_param (optional) """ def __init__(self, @@ -42,12 +48,12 @@ def __init__(self, target_flip_cfg, flip_prob=0.5, flip_camera=False, - flip_image=False): + flip_label=False): self.keypoints_flip_cfg = keypoints_flip_cfg self.target_flip_cfg = target_flip_cfg self.flip_prob = flip_prob self.flip_camera = flip_camera - self.flip_image = flip_image + self.flip_label = flip_label def transform(self, results: Dict) -> dict: """The transform function of :class:`RandomFlipAroundRoot`. @@ -61,19 +67,34 @@ def transform(self, results: Dict) -> dict: dict: The result dict. 
""" - keypoints = results['keypoints'] - if 'keypoints_visible' in results: - keypoints_visible = results['keypoints_visible'] - else: - keypoints_visible = np.ones(keypoints.shape[:-1], dtype=np.float32) - lifting_target = results['lifting_target'] - if 'lifting_target_visible' in results: - lifting_target_visible = results['lifting_target_visible'] - else: - lifting_target_visible = np.ones( - lifting_target.shape[:-1], dtype=np.float32) - if np.random.rand() <= self.flip_prob: + if self.flip_label: + assert 'keypoint_labels' in results + assert 'lifting_target_label' in results + keypoints_key = 'keypoint_labels' + keypoints_visible_key = 'keypoint_labels_visible' + target_key = 'lifting_target_label' + else: + assert 'keypoints' in results + assert 'lifting_target' in results + keypoints_key = 'keypoints' + keypoints_visible_key = 'keypoints_visible' + target_key = 'lifting_target' + + keypoints = results[keypoints_key] + if keypoints_visible_key in results: + keypoints_visible = results[keypoints_visible_key] + else: + keypoints_visible = np.ones( + keypoints.shape[:-1], dtype=np.float32) + + lifting_target = results[target_key] + if 'lifting_target_visible' in results: + lifting_target_visible = results['lifting_target_visible'] + else: + lifting_target_visible = np.ones( + lifting_target.shape[:-1], dtype=np.float32) + if 'flip_indices' not in results: flip_indices = list(range(self.num_keypoints)) else: @@ -81,13 +102,6 @@ def transform(self, results: Dict) -> dict: # flip joint coordinates _camera_param = deepcopy(results['camera_param']) - if self.flip_image: - assert 'camera_param' in results, \ - 'Camera parameters are missing.' - assert 'w' in _camera_param - w = _camera_param['w'] / 2 - self.keypoints_flip_cfg['center_x'] = w - self.target_flip_cfg['center_x'] = w keypoints, keypoints_visible = flip_keypoints_custom_center( keypoints, keypoints_visible, flip_indices, @@ -96,9 +110,9 @@ def transform(self, results: Dict) -> dict: lifting_target, lifting_target_visible, flip_indices, **self.target_flip_cfg) - results['keypoints'] = keypoints - results['keypoints_visible'] = keypoints_visible - results['lifting_target'] = lifting_target + results[keypoints_key] = keypoints + results[keypoints_visible_key] = keypoints_visible + results[target_key] = lifting_target results['lifting_target_visible'] = lifting_target_visible # flip horizontal distortion coefficients diff --git a/mmpose/models/heads/regression_heads/motion_regression_head.py b/mmpose/models/heads/regression_heads/motion_regression_head.py index a0037180c7..3870e3c59e 100644 --- a/mmpose/models/heads/regression_heads/motion_regression_head.py +++ b/mmpose/models/heads/regression_heads/motion_regression_head.py @@ -7,6 +7,7 @@ from torch import Tensor, nn from mmpose.evaluation.functional import keypoint_mpjpe +from mmpose.models.utils.tta import flip_coordinates from mmpose.registry import KEYPOINT_CODECS, MODELS from mmpose.utils.tensor_utils import to_numpy from mmpose.utils.typing import (ConfigType, OptConfigType, OptSampleList, @@ -95,7 +96,24 @@ def predict(self, (B, N, K). 
""" - batch_coords = self.forward(feats) # (B, K, D) + if test_cfg.get('flip_test', False): + # TTA: flip test -> feats = [orig, flipped] + assert isinstance(feats, list) and len(feats) == 2 + flip_indices = batch_data_samples[0].metainfo['flip_indices'] + _feats, _feats_flip = feats + _batch_coords = self.forward(_feats) + _batch_coords_flip = torch.stack([ + flip_coordinates( + _batch_coord_flip, + flip_indices=flip_indices, + shift_coords=test_cfg.get('shift_coords', True), + input_size=(1, 1)) + for _batch_coord_flip in self.forward(_feats_flip) + ], + dim=0) + batch_coords = (_batch_coords + _batch_coords_flip) * 0.5 + else: + batch_coords = self.forward(feats) # Restore global position with camera_param and factor camera_param = batch_data_samples[0].metainfo.get('camera_param', None) diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py index f21071e156..523e4df133 100644 --- a/mmpose/models/losses/__init__.py +++ b/mmpose/models/losses/__init__.py @@ -4,7 +4,8 @@ from .heatmap_loss import (AdaptiveWingLoss, KeypointMSELoss, KeypointOHKMMSELoss) from .loss_wrappers import CombinedLoss, MultipleLossWrapper -from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, RLELoss, +from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, + MPJPEVelocityJointLoss, MSELoss, RLELoss, SemiSupervisionLoss, SmoothL1Loss, SoftWeightSmoothL1Loss, SoftWingLoss, WingLoss) @@ -13,5 +14,6 @@ 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss', 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss', 'KLDiscretLoss', 'MultipleLossWrapper', 'JSDiscretLoss', 'CombinedLoss', - 'AssociativeEmbeddingLoss', 'SoftWeightSmoothL1Loss' + 'AssociativeEmbeddingLoss', 'SoftWeightSmoothL1Loss', + 'MPJPEVelocityJointLoss' ] diff --git a/mmpose/models/pose_estimators/pose_lifter.py b/mmpose/models/pose_estimators/pose_lifter.py index 5bad3dde3c..ec8401d1a2 100644 --- a/mmpose/models/pose_estimators/pose_lifter.py +++ b/mmpose/models/pose_estimators/pose_lifter.py @@ -2,9 +2,11 @@ from itertools import zip_longest from typing import Tuple, Union +import torch from torch import Tensor from mmpose.models.utils import check_and_update_config +from mmpose.models.utils.tta import flip_coordinates from mmpose.registry import MODELS from mmpose.utils.typing import (ConfigType, InstanceList, OptConfigType, Optional, OptMultiConfig, OptSampleList, @@ -244,7 +246,22 @@ def predict(self, inputs: Tensor, data_samples: SampleList) -> SampleList: assert self.with_head, ( 'The model must have head to perform prediction.') - feats = self.extract_feat(inputs) + if self.test_cfg.get('flip_test', False): + flip_indices = data_samples[0].metainfo['flip_indices'] + _feats = self.extract_feat(inputs) + _feats_flip = self.extract_feat( + torch.stack([ + flip_coordinates( + _input, + flip_indices=flip_indices, + shift_coords=self.test_cfg.get('shift_coords', True), + input_size=(1, 1)) for _input in inputs + ], + dim=0)) + + feats = [_feats, _feats_flip] + else: + feats = self.extract_feat(inputs) pose_preds, batch_pred_instances, batch_pred_fields = None, None, None traj_preds, batch_traj_instances, batch_traj_fields = None, None, None diff --git a/tests/test_codecs/test_motionbert_label.py b/tests/test_codecs/test_motionbert_label.py index 01c9c654a2..a42b3d0793 100644 --- a/tests/test_codecs/test_motionbert_label.py +++ b/tests/test_codecs/test_motionbert_label.py @@ -73,10 +73,6 @@ def test_encode(self): 1, 17, )) - self.assertEqual(encoded['trajectory_weights'].shape, ( - 1, - 17, - 
)) # test concatenating visibility codec = self.build_pose_lifting_label(concat_vis=True) diff --git a/tests/test_datasets/test_transforms/test_pose3d_transforms.py b/tests/test_datasets/test_transforms/test_pose3d_transforms.py index b87931bb74..c057dba4e7 100644 --- a/tests/test_datasets/test_transforms/test_pose3d_transforms.py +++ b/tests/test_datasets/test_transforms/test_pose3d_transforms.py @@ -154,22 +154,46 @@ def test_transform(self): camera2['p'][0], atol=4.)) - # test flipping w.r.t. image - transform = RandomFlipAroundRoot({}, {}, flip_prob=1, flip_image=True) - results = deepcopy(self.data_info) - results = transform(results) - kpts2 = results['keypoints'] - tar2 = results['lifting_target'] + # test label flipping + self.data_info['keypoint_labels'] = kpts1 + self.data_info['keypoint_labels_visible'] = kpts_vis1 + self.data_info['lifting_target_label'] = tar1 + + transform = RandomFlipAroundRoot( + self.keypoints_flip_cfg, + self.target_flip_cfg, + flip_prob=1, + flip_label=True) + results = transform(deepcopy(self.data_info)) + + kpts2 = results['keypoint_labels'] + kpts_vis2 = results['keypoint_labels_visible'] + tar2 = results['lifting_target_label'] + tar_vis2 = results['lifting_target_visible'] - camera_param = results['camera_param'] + self.assertEqual(kpts_vis2.shape, (1, 17)) + self.assertEqual(tar_vis2.shape, ( + 1, + 17, + )) + self.assertEqual(kpts2.shape, (1, 17, 2)) + self.assertEqual(tar2.shape, (1, 17, 3)) + + flip_indices = [ + 0, 4, 5, 6, 1, 2, 3, 7, 8, 9, 10, 14, 15, 16, 11, 12, 13 + ] for left, right in enumerate(flip_indices): self.assertTrue( - np.allclose( - camera_param['w'] - kpts1[0][left][:1], - kpts2[0][right][:1], - atol=4.)) + np.allclose(-kpts1[0][left][:1], kpts2[0][right][:1], atol=4.)) self.assertTrue( np.allclose(kpts1[0][left][1:], kpts2[0][right][1:], atol=4.)) self.assertTrue( np.allclose( tar1[..., left, 1:], tar2[..., right, 1:], atol=4.)) + + self.assertTrue( + np.allclose( + kpts_vis1[..., left], kpts_vis2[..., right], atol=4.)) + self.assertTrue( + np.allclose( + tar_vis1[..., left], tar_vis2[..., right], atol=4.)) From 5a9487d6878eb9f7e8220094752e5cf276e8b730 Mon Sep 17 00:00:00 2001 From: Tau Date: Thu, 20 Jul 2023 14:53:00 +0800 Subject: [PATCH 11/37] [Docs] Enhance docs (#2555) --- docs/en/installation.md | 18 ++ docs/en/user_guides/configs.md | 34 +- docs/en/user_guides/train_and_test.md | 27 +- docs/zh_cn/installation.md | 18 ++ docs/zh_cn/user_guides/configs.md | 69 +++-- docs/zh_cn/user_guides/train_and_test.md | 377 ++++++++++++++++++++++- 6 files changed, 499 insertions(+), 44 deletions(-) diff --git a/docs/en/installation.md b/docs/en/installation.md index 47db25bb5f..d5f1cb0a74 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -68,6 +68,15 @@ Note that some of the demo scripts in MMPose require [MMDetection](https://githu mim install "mmdet>=3.1.0" ``` +```{note} +Here are the version correspondences between mmdet, mmpose and mmcv: + +- mmdet 2.x <=> mmpose 0.x <=> mmcv 1.x +- mmdet 3.x <=> mmpose 1.x <=> mmcv 2.x + +If you encounter version incompatibility issues, please check the correspondence using `pip list | grep mm` and upgrade or downgrade the dependencies accordingly. +``` + ## Best Practices ### Build MMPose from source @@ -141,6 +150,15 @@ The `demo.jpg` can be downloaded from [Github](https://raw.githubusercontent.com The inference results will be a list of `PoseDataSample`, and the predictions are in the `pred_instances`, indicating the detected keypoint locations and scores. 
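As a quick sanity check, you can inspect the structure of the predictions (a minimal sketch, assuming `results` comes from the `inference_topdown` call in the verification snippet above):

```python
# `results` is a list with one PoseDataSample per input image
pred_instances = results[0].pred_instances
print(pred_instances.keypoints.shape)        # (num_instances, num_keypoints, 2)
print(pred_instances.keypoint_scores.shape)  # (num_instances, num_keypoints)
```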
+```{note} +MMCV version should match PyTorch version strictly. If you encounter the following issues: + +- No module named 'mmcv.ops' +- No module named 'mmcv._ext' + +It means that the current PyTorch version does not match the CUDA version. You can check the CUDA version using `nvidia-smi`, and it should match the `+cu1xx` in PyTorch version in `pip list | grep torch`. Otherwise, you need to uninstall PyTorch and reinstall it, then reinstall MMCV (the installation order **CAN NOT** be swapped). +``` + ## Customize Installation ### CUDA versions diff --git a/docs/en/user_guides/configs.md b/docs/en/user_guides/configs.md index 9d2c44f7ff..bc4d9110a3 100644 --- a/docs/en/user_guides/configs.md +++ b/docs/en/user_guides/configs.md @@ -114,11 +114,22 @@ Here is the description of General configuration: # General default_scope = 'mmpose' default_hooks = dict( - timer=dict(type='IterTimerHook'), # time the data processing and model inference - logger=dict(type='LoggerHook', interval=50), # interval to print logs - param_scheduler=dict(type='ParamSchedulerHook'), # update lr + # time the data processing and model inference + timer=dict(type='IterTimerHook'), + # interval to print logs,50 iters by default + logger=dict(type='LoggerHook', interval=50), + # update lr according to the lr scheduler + param_scheduler=dict(type='ParamSchedulerHook'), checkpoint=dict( - type='CheckpointHook', interval=1, save_best='coco/AP', # interval to save ckpt + # interval to save ckpt + # e.g. + # save_best='coco/AP' means save the best ckpt according to coco/AP of CocoMetric + # save_best='PCK' means save the best ckpt according to PCK of PCKAccuracy + type='CheckpointHook', interval=1, save_best='coco/AP', + + # rule to judge the metric + # 'greater' means the larger the better + # 'less' means the smaller the better rule='greater'), # rule to judge the metric sampler_seed=dict(type='DistSamplerSeedHook')) # set the distributed seed env_cfg = dict( @@ -135,23 +146,16 @@ log_processor = dict( # Format, interval to log log_level = 'INFO' # The level of logging ``` +```{note} +We now support two visualizer backends: LocalVisBackend and TensorboardVisBackend, the former is for local visualization and the latter is for Tensorboard visualization. You can choose according to your needs. See [Train and Test](./train_and_test.md) for details. +``` + General configuration is stored alone in the `$MMPOSE/configs/_base_`, and inherited by doing: ```Python _base_ = ['../../../_base_/default_runtime.py'] # take the config file as the starting point of the relative path ``` -```{note} -CheckpointHook: - -- save_best: `'coco/AP'` for `CocoMetric`, `'PCK'` for `PCKAccuracy` -- max_keep_ckpts: the maximum checkpoints to keep. Defaults to -1, which means unlimited. - -Example: - -`default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1))` -``` - ### Data Data configuration refers to the data processing related settings, mainly including: diff --git a/docs/en/user_guides/train_and_test.md b/docs/en/user_guides/train_and_test.md index 6bcc88fc3b..6f7d7c63b1 100644 --- a/docs/en/user_guides/train_and_test.md +++ b/docs/en/user_guides/train_and_test.md @@ -14,7 +14,6 @@ python tools/train.py ${CONFIG_FILE} [ARGS] ```{note} By default, MMPose prefers GPU to CPU. If you want to train a model on CPU, please empty `CUDA_VISIBLE_DEVICES` or set it to -1 to make GPU invisible to the program. 
- ``` ```shell @@ -214,6 +213,31 @@ python ./tools/train.py \ - `randomness.deterministic=True`, set the deterministic option for `cuDNN` backend, i.e., set `torch.backends.cudnn.deterministic` to `True` and `torch.backends.cudnn.benchmark` to `False`. Defaults to `False`. See [Pytorch Randomness](https://pytorch.org/docs/stable/notes/randomness.html) for more details. +## Training Log + +During training, the training log will be printed in the console as follows: + +```shell +07/14 08:26:50 - mmengine - INFO - Epoch(train) [38][ 6/38] base_lr: 5.148343e-04 lr: 5.148343e-04 eta: 0:15:34 time: 0.540754 data_time: 0.394292 memory: 3141 loss: 0.006220 loss_kpt: 0.006220 acc_pose: 1.000000 +``` + +The training log contains the following information: + +- `07/14 08:26:50`: The current time. +- `mmengine`: The name of the program. +- `INFO` or `WARNING`: The log level. +- `Epoch(train)`: The current training stage. `train` means the training stage, `val` means the validation stage. +- `[38][ 6/38]`: The current epoch and the current iteration. +- `base_lr`: The base learning rate. +- `lr`: The current (real) learning rate. +- `eta`: The estimated time of arrival. +- `time`: The elapsed time (minutes) of the current iteration. +- `data_time`: The elapsed time (minutes) of data processing (i/o and transforms). +- `memory`: The GPU memory (MB) allocated by the program. +- `loss`: The total loss value of the current iteration. +- `loss_kpt`: The loss value you passed in head module. +- `acc_pose`: The accuracy value you passed in head module. + ## Visualize training process Monitoring the training process is essential for understanding the performance of your model and making necessary adjustments. In this section, we will introduce two methods to visualize the training process of your MMPose model: TensorBoard and the MMEngine Visualizer. @@ -261,7 +285,6 @@ python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] ```{note} By default, MMPose prefers GPU to CPU. If you want to test a model on CPU, please empty `CUDA_VISIBLE_DEVICES` or set it to -1 to make GPU invisible to the program. 
- ``` ```shell diff --git a/docs/zh_cn/installation.md b/docs/zh_cn/installation.md index ef515c8030..47faa0b5f7 100644 --- a/docs/zh_cn/installation.md +++ b/docs/zh_cn/installation.md @@ -66,6 +66,15 @@ mim install "mmcv>=2.0.1" mim install "mmdet>=3.1.0" ``` +```{note} +新旧版本 mmpose、mmdet、mmcv 的对应关系为: + +- mmdet 2.x <=> mmpose 0.x <=> mmcv 1.x +- mmdet 3.x <=> mmpose 1.x <=> mmcv 2.x + +如果遇到版本不兼容的问题,请使用 `pip list | grep mm` 检查对应关系后,升级或降级相关依赖。 +``` + ## 最佳实践 根据具体需求,我们支持两种安装模式: 从源码安装(推荐)和作为 Python 包安装 @@ -141,6 +150,15 @@ results = inference_topdown(model, 'demo.jpg') 示例图片 `demo.jpg` 可以从 [Github](https://raw.githubusercontent.com/open-mmlab/mmpose/main/tests/data/coco/000000000785.jpg) 下载。 推理结果是一个 `PoseDataSample` 列表,预测结果将会保存在 `pred_instances` 中,包括检测到的关键点位置和置信度。 +```{note} +MMCV 版本与 PyTorch 版本需要严格对应,如果遇到如下问题: + +- No module named 'mmcv.ops' +- No module named 'mmcv._ext' + +说明当前环境中的 PyTorch 版本与 CUDA 版本不匹配。你可以通过 `nvidia-smi` 查看 CUDA 版本,需要与 `pip list | grep torch` 中 PyTorch 的 `+cu1xx` 对应,否则,你需要先卸载 PyTorch 并重新安装,然后重新安装 MMCV(这里的安装顺序**不可以**交换)。 +``` + ## 自定义安装 ### CUDA 版本 diff --git a/docs/zh_cn/user_guides/configs.md b/docs/zh_cn/user_guides/configs.md index 0bcb7aa1a8..808b1916bd 100644 --- a/docs/zh_cn/user_guides/configs.md +++ b/docs/zh_cn/user_guides/configs.md @@ -1,4 +1,4 @@ -# 配置文件 +# 如何看懂配置文件 MMPose 使用 Python 文件作为配置文件,将模块化设计和继承设计结合到配置系统中,便于进行各种实验。 @@ -119,42 +119,61 @@ python tools/analysis/print_config.py /PATH/TO/CONFIG # 通用配置 default_scope = 'mmpose' default_hooks = dict( - timer=dict(type='IterTimerHook'), # 迭代时间统计,包括数据耗时和模型耗时 - logger=dict(type='LoggerHook', interval=50), # 日志打印间隔 - param_scheduler=dict(type='ParamSchedulerHook'), # 用于调度学习率更新 + # 迭代时间统计,包括数据耗时和模型耗时 + timer=dict(type='IterTimerHook'), + + # 日志打印间隔,默认每 50 iters 打印一次 + logger=dict(type='LoggerHook', interval=50), + + # 用于调度学习率更新的 Hook + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( - type='CheckpointHook', interval=1, save_best='coco/AP', # ckpt保存间隔,最优ckpt参考指标 - rule='greater'), # 最优ckpt指标评价规则 - sampler_seed=dict(type='DistSamplerSeedHook')) # 分布式随机种子设置 + # ckpt 保存间隔,最优 ckpt 参考指标。 + # 例如: + # save_best='coco/AP' 代表以 coco/AP 作为最优指标,对应 CocoMetric 评测器的 AP 指标 + # save_best='PCK' 代表以 PCK 作为最优指标,对应 PCKAccuracy 评测器的 PCK 指标 + # 更多指标请前往 mmpose/evaluation/metrics/ + type='CheckpointHook', interval=1, save_best='coco/AP', + + # 最优 ckpt 保留规则,greater 代表越大越好,less 代表越小越好 + rule='greater'), + + # 分布式随机种子设置 Hook + sampler_seed=dict(type='DistSamplerSeedHook')) env_cfg = dict( - cudnn_benchmark=False, # cudnn benchmark开关 - mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # opencv多线程配置 - dist_cfg=dict(backend='nccl')) # 分布式训练后端设置 -vis_backends = [dict(type='LocalVisBackend')] # 可视化器后端设置 -visualizer = dict( # 可视化器设置 + # cudnn benchmark 开关,用于加速训练,但会增加显存占用 + cudnn_benchmark=False, + + # opencv 多线程配置,用于加速数据加载,但会增加显存占用 + # 默认为 0,代表使用单线程 + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + + # 分布式训练后端设置,支持 nccl 和 gloo + dist_cfg=dict(backend='nccl')) + +# 可视化器后端设置,默认为本地可视化 +vis_backends = [dict(type='LocalVisBackend')] + +# 可视化器设置 +visualizer = dict( type='PoseLocalVisualizer', vis_backends=[dict(type='LocalVisBackend')], name='visualizer') log_processor = dict( # 训练日志格式、间隔 type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) -log_level = 'INFO' # 日志记录等级 -``` - -通用配置一般单独存放到`$MMPOSE/configs/_base_`目录下,通过如下方式进行继承: - -```Python -_base_ = ['../../../_base_/default_runtime.py'] # 以运行时的config文件位置为相对路径起点 +# 日志记录等级,INFO 代表记录训练日志,WARNING 代表只记录警告信息,ERROR 代表只记录错误信息 +log_level = 'INFO' ``` 
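下面给出一个覆盖 checkpoint 保存策略的示意写法(仅作示例,假设所用评测器为 PCKAccuracy;`max_keep_ckpts` 用于限制最多保留的 ckpt 数量):

```Python
# 示例:以 PCK 作为最优指标(越大越好),且最多只保留 1 个 ckpt
default_hooks = dict(
    checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1))
```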
```{note} -CheckpointHook: - -- save_best: `'coco/AP'` 用于 `CocoMetric`, `'PCK'` 用于 `PCKAccuracy` -- max_keep_ckpts: 最大保留ckpt数量,默认为-1,代表不限制 +可视化器后端设置支持 LocalVisBackend 和 TensorboardVisBackend,前者用于本地可视化,后者用于 Tensorboard 可视化,你可以根据需要进行选择。详情见 [训练与测试](./train_and_test.md) 的 【可视化训练进程】。 +``` -样例: +通用配置一般单独存放到 `$MMPOSE/configs/_base_` 目录下,通过如下方式进行继承: -`default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1))` +```Python +_base_ = ['../../../_base_/default_runtime.py'] # 以运行时的config文件位置为相对路径起点 ``` ### 数据配置 diff --git a/docs/zh_cn/user_guides/train_and_test.md b/docs/zh_cn/user_guides/train_and_test.md index 452eddc928..7c30a856cf 100644 --- a/docs/zh_cn/user_guides/train_and_test.md +++ b/docs/zh_cn/user_guides/train_and_test.md @@ -1,5 +1,378 @@ # 训练与测试 -中文内容建设中,暂时请查阅[英文版文档](../../en/user_guides/train_and_test.md) +## 启动训练 -如果您愿意参与中文文档的翻译与维护,我们团队将十分感谢您的贡献!欢迎加入我们的社区群与我们取得联系,或直接按照 [如何给 MMPose 贡献代码](../contribution_guide.md) 在 GitHub 上提交 Pull Request。 +### 本地训练 + +你可以使用 `tools/train.py` 在单机上使用 CPU 或单个 GPU 训练模型。 + +```shell +python tools/train.py ${CONFIG_FILE} [ARGS] +``` + +```{note} +默认情况下,MMPose 会优先使用 GPU 而不是 CPU。如果你想在 CPU 上训练模型,请清空 `CUDA_VISIBLE_DEVICES` 或将其设置为 -1,使 GPU 对程序不可见。 +``` + +```shell +CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [ARGS] +``` + +| ARGS | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | 配置文件路径 | +| `--work-dir WORK_DIR` | 训练日志与 checkpoint 存放目录,默认使用配置文件名作为目录存放在 `./work_dirs` 下 | +| `--resume [RESUME]` | 恢复训练,可以从指定 checkpoint 进行重启,不指定则会使用最近一次的 checkpoint | +| `--amp` | 开启混合精度训练 | +| `--no-validate` | **不建议新手开启**。 训练中不进行评测 | +| `--auto-scale-lr` | 自动根据当前设置的实际 batch size 和配置文件中的标准 batch size 进行学习率缩放 | +| `--cfg-options CFG_OPTIONS` | 对当前配置文件中的一些设置进行临时覆盖,字典 key-value 格式为 xxx=yyy。如果需要覆盖的值是一个数组,格式应当为 `key="[a,b]"` 或 `key=a,b`。也允许使用元组,如 `key="[(a,b),(c,d)]"`。注意双引号是**必须的**,且**不允许**使用空格。 | +| `--show-dir SHOW_DIR` | 验证阶段生成的可视化图片存放路径 | +| `--show` | 使用窗口显示预测的可视化结果 | +| `--interval INTERVAL` | 进行可视化的间隔(每隔多少张图可视化一张) | +| `--wait-time WAIT_TIME` | 可视化显示时每张图片的持续时间(单位:秒),默认为 1 | +| `--launcher {none,pytorch,slurm,mpi}` | 可选的启动器 | + +### 多卡训练 + +我们提供了一个脚本来使用 `torch.distributed.launch` 启动多卡训练。 + +```shell +bash ./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [PY_ARGS] +``` + +| ARGS | Description | +| ------------- | -------------------------------------------------- | +| `CONFIG_FILE` | 配置文件路径 | +| `GPU_NUM` | 使用 GPU 数量 | +| `[PYARGS]` | 其他配置项 `tools/train.py`, 见 [这里](#本地训练). 
| + +你也可以通过环境变量来指定启动器的额外参数。例如,通过以下命令将启动器的通信端口改为 29666: + +```shell +PORT=29666 bash ./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [PY_ARGS] +``` + +如果你想同时启动多个训练任务并使用不同的 GPU,你可以通过指定不同的端口和可见设备来启动它们。 + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash ./tools/dist_train.sh ${CONFIG_FILE1} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=29501 bash ./tools/dist_train.sh ${CONFIG_FILE2} 4 [PY_ARGS] +``` + +### 分布式训练 + +#### 局域网多机训练 + +如果你使用以太网连接的多台机器启动训练任务,你可以运行以下命令: + +在第一台机器上: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +相比于单机多卡,你需要指定一些额外的环境变量: + +| 环境变量 | 描述 | +| ------------- | -------------------------- | +| `NNODES` | 机器总数 | +| `NODE_RANK` | 当前机器序号 | +| `PORT` | 通信端口,所有机器必须相同 | +| `MASTER_ADDR` | 主机地址,所有机器必须相同 | + +通常情况下,如果你没有像 InfiniBand 这样的高速网络,那么训练速度会很慢。 + +#### Slurm 多机训练 + +如果你在一个使用 [slurm](https://slurm.schedmd.com/) 管理的集群上运行 MMPose,你可以使用 `slurm_train.sh` 脚本。 + +```shell +[ENV_VARS] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} [PY_ARGS] +``` + +脚本参数说明: + +| 参数 | 描述 | +| ------------- | -------------------------------------------------- | +| `PARTITION` | 指定集群分区 | +| `JOB_NAME` | 任务名,可以任取 | +| `CONFIG_FILE` | 配置文件路径 | +| `WORK_DIR` | 训练日志存储路径 | +| `[PYARGS]` | 其他配置项 `tools/train.py`, 见 [这里](#本地训练). | + +以下是可以用来配置 slurm 任务的环境变量: + +| 环境变量 | 描述 | +| --------------- | ------------------------------------------------------------------------ | +| `GPUS` | GPU 总数,默认为 8 | +| `GPUS_PER_NODE` | 每台机器使用的 GPU 总数,默认为 8 | +| `CPUS_PER_TASK` | 每个任务分配的 CPU 总数(通常为 1 张 GPU 对应 1 个任务进程),默认为 5 | +| `SRUN_ARGS` | `srun` 的其他参数,可选项见 [这里](https://slurm.schedmd.com/srun.html). | + +## 恢复训练 + +恢复训练意味着从之前的训练中保存的状态继续训练,其中状态包括模型权重、优化器状态和优化器参数调整策略的状态。 + +### 自动恢复 + +用户可以在训练命令的末尾添加 `--resume` 来恢复训练。程序会自动从 `work_dirs` 中加载最新的权重文件来恢复训练。如果 `work_dirs` 中有最新的 `checkpoint`(例如在之前的训练中中断了训练),则会从 `checkpoint` 处恢复训练。否则(例如之前的训练没有及时保存 `checkpoint` 或者启动了一个新的训练任务),则会重新开始训练。 + +以下是一个恢复训练的例子: + +```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py --resume +``` + +### 指定 checkpoint 恢复 + +你可以在 `load_from` 中指定 `checkpoint` 的路径,MMPose 会自动读取 `checkpoint` 并从中恢复训练。命令如下: + +```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py \ + --resume work_dirs/td-hm_res50_8xb64-210e_coco-256x192/latest.pth +``` + +如果你希望在配置文件中手动指定 `checkpoint` 路径,除了设置 `resume=True`,还需要设置 `load_from`。 + +需要注意的是,如果只设置了 `load_from` 而没有设置 `resume=True`,那么只会加载 `checkpoint` 中的权重,而不会从之前的状态继续训练。 + +以下的例子与上面指定 `--resume` 参数的例子等价: + +```Python +resume = True +load_from = 'work_dirs/td-hm_res50_8xb64-210e_coco-256x192/latest.pth' +# model settings +model = dict( + ## omitted ## + ) +``` + +## 在训练中冻结部分参数 + +在某些场景下,我们可能希望在训练过程中冻结模型的某些参数,以便微调特定部分或防止过拟合。在 MMPose 中,你可以通过在 `paramwise_cfg` 中设置 `custom_keys` 来为模型中的任何模块设置不同的超参数。这样可以让你控制模型特定部分的学习率和衰减系数。 + +例如,如果你想冻结 `backbone.layer0` 和 `backbone.layer1` 的所有参数,你可以在配置文件中添加以下内容: + +```Python +optim_wrapper = dict( + optimizer=dict(...), + paramwise_cfg=dict( + custom_keys={ + 'backbone.layer0': dict(lr_mult=0, decay_mult=0), + 'backbone.layer0': dict(lr_mult=0, decay_mult=0), + })) +``` + +以上配置将会通过将学习率和衰减系数设置为 0 来冻结 `backbone.layer0` 和 `backbone.layer1` 中的参数。通过这种方式,你可以有效地控制训练过程,并根据需要微调模型的特定部分。 + +## 自动混合精度训练(AMP) + +混合精度训练可以减少训练时间和存储需求,而不改变模型或降低模型训练精度,从而支持更大的 batch size、更大的模型和更大的输入尺寸。 + +要启用自动混合精度(AMP)训练,请在训练命令的末尾添加 `--amp`,如下所示: + +```shell +python tools/train.py ${CONFIG_FILE} --amp +``` + +具体例子如下: + 
+```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py --amp +``` + +## 设置随机种子 + +如果你想指定随机种子,你可以通过如下命令: + +```shell +python ./tools/train.py \ + ${CONFIG} \ # 配置文件 + --cfg-options randomness.seed=2023 \ # 设置 random seed = 2023 + [randomness.diff_rank_seed=True] \ # 不同 rank 的进程使用不同的随机种子 + [randomness.deterministic=True] # 设置 cudnn.deterministic=True +# `[]` 表示可选的参数,你不需要输入 `[]` +``` + +`randomness` 还有三个参数可以设置,具体含义如下。 + +- `randomness.seed=2023`,将随机种子设置为 `2023`。 + +- `randomness.diff_rank_seed=True`,根据全局 `rank` 设置不同的随机种子。默认为 `False`。 + +- `randomness.deterministic=True`,设置 `cuDNN` 后端的确定性选项,即将 `torch.backends.cudnn.deterministic` 设置为 `True`,将 `torch.backends.cudnn.benchmark` 设置为 `False`。默认为 `False`。更多细节请参考 [Pytorch Randomness](https://pytorch.org/docs/stable/notes/randomness.html)。 + +## 训练日志说明 + +在训练中,命令行会实时打印训练日志如下: + +```shell +07/14 08:26:50 - mmengine - INFO - Epoch(train) [38][ 6/38] base_lr: 5.148343e-04 lr: 5.148343e-04 eta: 0:15:34 time: 0.540754 data_time: 0.394292 memory: 3141 loss: 0.006220 loss_kpt: 0.006220 acc_pose: 1.000000 +``` + +以上训练日志包括如下内容: + +- `07/14 08:26:50`:当前时间 +- `mmengine`:日志前缀,表示日志来自 MMEngine +- `INFO` or `WARNING`:日志级别,表示该日志为普通信息 +- `Epoch(train)`:当前处于训练阶段,如果处于验证阶段,则为 `Epoch(val)` +- `[38][ 6/38]`:当前处于第 38 个 epoch,当前 batch 为第 6 个 batch,总共有 38 个 batch +- `base_lr`:基础学习率 +- `lr`:当前实际使用的学习率 +- `eta`:预计训练剩余时间 +- `time`:当前 batch 的训练时间(单位:分钟) +- `data_time`:当前 batch 的数据加载(i/o,数据增强)时间(单位:分钟) +- `memory`:当前进程占用的显存(单位:MB) +- `loss`:当前 batch 的总 loss +- `loss_kpt`:当前 batch 的关键点 loss +- `acc_pose`:当前 batch 的姿态准确率 + +## 可视化训练进程 + +监视训练过程对于了解模型的性能并进行必要的调整至关重要。在本节中,我们将介绍两种可视化训练过程的方法:TensorBoard 和 MMEngine Visualizer。 + +### TensorBoard + +TensorBoard 是一个强大的工具,可以让你可视化训练过程中的 loss 变化。要启用 TensorBoard 可视化,你可能需要: + +1. 安装 TensorBoard + + ```shell + pip install tensorboard + ``` + +2. 在配置文件中开启 TensorBoard 作为可视化后端: + + ```python + visualizer = dict(vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) + ``` + +Tensorboard 生成的 event 文件会保存在实验日志文件夹 `${WORK_DIR}` 下,该文件夹默认为 `work_dir/${CONFIG}`,你也可以通过 `--work-dir` 参数指定。要可视化训练过程,请使用以下命令: + +```shell +tensorboard --logdir ${WORK_DIR}/${TIMESTAMP}/vis_data +``` + +### MMEngine Visualizer + +MMPose 还支持在验证过程中可视化模型的推理结果。要启用此功能,请在启动训练时使用 `--show` 选项或设置 `--show-dir`。这个功能提供了一种有效的方法来分析模型在特定示例上的性能并进行必要的调整。 + +## 测试 + +### 本地测试 + +你可以使用 `tools/test.py` 在单机上使用 CPU 或单个 GPU 测试模型。 + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` + +```{note} +默认情况下,MMPose 会优先使用 GPU 而不是 CPU。如果你想在 CPU 上测试模型,请清空 `CUDA_VISIBLE_DEVICES` 或将其设置为 -1,使 GPU 对程序不可见。 +``` + +```shell +CUDA_VISIBLE_DEVICES=-1 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` + +| ARGS | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | 配置文件路径file. | +| `CHECKPOINT_FILE` | checkpoint 文件路径,可以是本地文件,也可以是网络链接。 [这里](https://MMPose.readthedocs.io/en/latest/model_zoo.html) 是 MMPose 提供的 checkpoint 列表. 
| +| `--work-dir WORK_DIR` | 评测结果存储目录 | +| `--out OUT` | 评测结果存放文件 | +| `--dump DUMP` | 导出评测时的模型输出,用于用户自行离线评测 | +| `--cfg-options CFG_OPTIONS` | 对当前配置文件中的一些设置进行临时覆盖,字典 key-value 格式为 xxx=yyy。如果需要覆盖的值是一个数组,格式应当为 `key="[a,b]"` 或 `key=a,b`。也允许使用元组,如 `key="[(a,b),(c,d)]"`。注意双引号是**必须的**,且**不允许**使用空格。 | +| `--show-dir SHOW_DIR` | T验证阶段生成的可视化图片存放路径 | +| `--show` | 使用窗口显示预测的可视化结果 | +| `--interval INTERVAL` | 进行可视化的间隔(每隔多少张图可视化一张) | +| `--wait-time WAIT_TIME` | 可视化显示时每张图片的持续时间(单位:秒),默认为 1 | +| `--launcher {none,pytorch,slurm,mpi}` | 可选的启动器 | + +### 多卡测试 + +我们提供了一个脚本来使用 `torch.distributed.launch` 启动多卡测试。 + +```shell +bash ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [PY_ARGS] +``` + +| ARGS | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | 配置文件路径 | +| `CHECKPOINT_FILE` | checkpoint 文件路径,可以是本地文件,也可以是网络链接。 [这里](https://MMPose.readthedocs.io/en/latest/model_zoo.html) 是 MMPose 提供的 checkpoint 列表 | +| `GPU_NUM` | 使用 GPU 数量 | +| `[PYARGS]` | 其他配置项 `tools/test.py`, 见 [这里](#本地测试) | + +你也可以通过环境变量来指定启动器的额外参数。例如,通过以下命令将启动器的通信端口改为 29666: + +```shell +PORT=29666 bash ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [PY_ARGS] +``` + +如果你想同时启动多个测试任务并使用不同的 GPU,你可以通过指定不同的端口和可见设备来启动它们。 + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash ./tools/dist_test.sh ${CONFIG_FILE1} ${CHECKPOINT_FILE} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash ./tools/dist_test.sh ${CONFIG_FILE2} ${CHECKPOINT_FILE} 4 [PY_ARGS] +``` + +### 分布式测试 + +#### 局域网多机测试 + +如果你使用以太网连接的多台机器启动测试任务,你可以运行以下命令: + +在第一台机器上: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT_FILE $GPUS +``` + +在第二台机器上: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT_FILE $GPUS +``` + +相比于单机多卡,你需要指定一些额外的环境变量: + +| 环境变量 | 描述 | +| ------------- | -------------------------- | +| `NNODES` | 机器总数 | +| `NODE_RANK` | 当前机器序号 | +| `PORT` | 通信端口,所有机器必须相同 | +| `MASTER_ADDR` | 主机地址,所有机器必须相同 | + +通常情况下,如果你没有像 InfiniBand 这样的高速网络,那么测试速度会很慢。 + +#### Slurm 多机测试 + +如果你在一个使用 [slurm](https://slurm.schedmd.com/) 管理的集群上运行 MMPose,你可以使用 `slurm_test.sh` 脚本。 + +```shell +[ENV_VARS] ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${CHECKPOINT_FILE} [PY_ARGS] +``` + +脚本参数说明: + +| 参数 | 描述 | +| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `PARTITION` | 指定集群分区 | +| `JOB_NAME` | 任务名,可以任取 | +| `CONFIG_FILE` | 配置文件路径 | +| `CHECKPOINT_FILE` | checkpoint 文件路径,可以是本地文件,也可以是网络链接。 [这里](https://MMPose.readthedocs.io/en/latest/model_zoo.html) 是 MMPose 提供的 checkpoint 列表 | +| `[PYARGS]` | 其他配置项 `tools/test.py`, 见 [这里](#本地测试) | + +以下是可以用来配置 slurm 任务的环境变量: + +| 环境变量 | 描述 | +| --------------- | ------------------------------------------------------------------------ | +| `GPUS` | GPU 总数,默认为 8 | +| `GPUS_PER_NODE` | 每台机器使用的 GPU 总数,默认为 8 | +| `CPUS_PER_TASK` | 每个任务分配的 CPU 总数(通常为 1 张 GPU 对应 1 个任务进程),默认为 5 | +| `SRUN_ARGS` | `srun` 的其他参数,可选项见 [这里](https://slurm.schedmd.com/srun.html). 
| From b225a773d168fc2afd48cde5f76c0202d1ba2f52 Mon Sep 17 00:00:00 2001 From: Tau Date: Thu, 20 Jul 2023 19:08:31 +0800 Subject: [PATCH 12/37] [Docs] Fix links in doc (#2557) --- docs/en/user_guides/configs.md | 7 +- docs/zh_cn/user_guides/advanced_training.md | 104 -------------------- docs/zh_cn/user_guides/configs.md | 8 +- docs/zh_cn/user_guides/useful_tools.md | 5 - docs/zh_cn/user_guides/visualization.md | 5 - 5 files changed, 6 insertions(+), 123 deletions(-) delete mode 100644 docs/zh_cn/user_guides/advanced_training.md delete mode 100644 docs/zh_cn/user_guides/useful_tools.md delete mode 100644 docs/zh_cn/user_guides/visualization.md diff --git a/docs/en/user_guides/configs.md b/docs/en/user_guides/configs.md index bc4d9110a3..b5861ae029 100644 --- a/docs/en/user_guides/configs.md +++ b/docs/en/user_guides/configs.md @@ -234,10 +234,9 @@ test_dataloader = val_dataloader # use val as test by default ```{note} Common Usages: -- [Resume training](../common_usages/resume_training.md) -- [Automatic mixed precision (AMP) training](../common_usages/amp_training.md) -- [Set the random seed](../common_usages/set_random_seed.md) - +- [Resume training](https://mmpose.readthedocs.io/en/dev-1.x/user_guides/train_and_test.html#resume-training) +- [Automatic mixed precision (AMP) training](https://mmpose.readthedocs.io/en/dev-1.x/user_guides/train_and_test.html#automatic-mixed-precision-amp-training) +- [Set the random seed](https://mmpose.readthedocs.io/en/dev-1.x/user_guides/train_and_test.html#set-the-random-seed) ``` ### Training diff --git a/docs/zh_cn/user_guides/advanced_training.md b/docs/zh_cn/user_guides/advanced_training.md deleted file mode 100644 index dd02a7661f..0000000000 --- a/docs/zh_cn/user_guides/advanced_training.md +++ /dev/null @@ -1,104 +0,0 @@ -# 高级训练设置 - -## 恢复训练 - -恢复训练是指从之前某次训练保存下来的状态开始继续训练,这里的状态包括模型的权重、优化器和优化器参数调整策略的状态。 - -### 自动恢复训练 - -用户可以在训练命令最后加上 `--resume` 恢复训练,程序会自动从 `work_dirs` 中加载最新的权重文件恢复训练。如果 `work_dir` 中有最新的 `checkpoint`(例如该训练在上一次训练时被中断),则会从该 `checkpoint` 恢复训练,否则(例如上一次训练还没来得及保存 `checkpoint` 或者启动了新的训练任务)会重新开始训练。 - -下面是一个恢复训练的示例: - -```shell -python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py --resume -``` - -### 指定 Checkpoint 恢复训练 - -你也可以对 `--resume` 指定 `checkpoint` 路径,MMPose 会自动读取该 `checkpoint` 并从中恢复训练,命令如下: - -```shell -python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py \ - --resume work_dirs/td-hm_res50_8xb64-210e_coco-256x192/latest.pth -``` - -如果你希望手动在配置文件中指定 `checkpoint` 路径,除了设置 `resume=True`,还需要设置 `load_from` 参数。需要注意的是,如果只设置了 `load_from` 而没有设置 `resume=True`,则只会加载 `checkpoint` 中的权重并重新开始训练,而不是接着之前的状态继续训练。 - -下面的例子与上面指定 `--resume` 参数的例子等价: - -```python -resume = True -load_from = 'work_dirs/td-hm_res50_8xb64-210e_coco-256x192/latest.pth' -# model settings -model = dict( - ## 内容省略 ## - ) -``` - -## 自动混合精度(AMP)训练 - -混合精度训练在不改变模型、不降低模型训练精度的前提下,可以缩短训练时间,降低存储需求,因而能支持更大的 batch size、更大模型和尺寸更大的输入的训练。 - -如果要开启自动混合精度(AMP)训练,在训练命令最后加上 --amp 即可, 命令如下: - -```shell -python tools/train.py ${CONFIG_FILE} --amp -``` - -具体例子如下: - -```shell -python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py --amp -``` - -## 设置随机种子 - -如果想要在训练时指定随机种子,可以使用以下命令: - -```shell -python ./tools/train.py \ - ${CONFIG} \ # 配置文件路径 - --cfg-options randomness.seed=2023 \ # 设置随机种子为 2023 - [randomness.diff_rank_seed=True] \ # 根据 rank 来设置不同的种子。 - [randomness.deterministic=True] # 把 cuDNN 后端确定性选项设置为 True -# [] 代表可选参数,实际输入命令行时,不用输入 [] -``` 
- -randomness 有三个参数可设置,具体含义如下: - -- `randomness.seed=2023` ,设置随机种子为 `2023`。 - -- `randomness.diff_rank_seed=True`,根据 `rank` 来设置不同的种子,`diff_rank_seed` 默认为 `False`。 - -- `randomness.deterministic=True`,把 `cuDNN` 后端确定性选项设置为 `True`,即把 `torch.backends.cudnn.deterministic` 设为 `True`,把 `torch.backends.cudnn.benchmark` 设为 `False`。`deterministic` 默认为 `False`。更多细节见 [Pytorch Randomness](https://pytorch.org/docs/stable/notes/randomness.html)。 - -如果你希望手动在配置文件中指定随机种子,可以在配置文件中设置 `random_seed` 参数,具体如下: - -```python -randomness = dict(seed=2023) -# model settings -model = dict( - ## 内容省略 ## - ) -``` - -## 使用 Tensorboard 可视化训练过程 - -安装 Tensorboard 环境 - -```shell -pip install tensorboard -``` - -在 config 文件中添加 tensorboard 配置 - -```python -visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')]) -``` - -运行训练命令后,tensorboard 文件会生成在可视化文件夹 `work_dir/${CONFIG}/${TIMESTAMP}/vis_data` 下,运行下面的命令就可以在网页链接使用 tensorboard 查看 loss、学习率和精度等信息。 - -```shell -tensorboard --logdir work_dir/${CONFIG}/${TIMESTAMP}/vis_data -``` diff --git a/docs/zh_cn/user_guides/configs.md b/docs/zh_cn/user_guides/configs.md index 808b1916bd..c24d8c7b3e 100644 --- a/docs/zh_cn/user_guides/configs.md +++ b/docs/zh_cn/user_guides/configs.md @@ -253,12 +253,10 @@ test_dataloader = val_dataloader # 默认情况下不区分验证集和测试集 ``` ```{note} - 常用功能可以参考以下教程: -- [恢复训练](../common_usages/resume_training.md) -- [自动混合精度训练](../common_usages/amp_training.md) -- [设置随机种子](../common_usages/set_random_seed.md) - +- [恢复训练](https://mmpose.readthedocs.io/zh_CN/dev-1.x/user_guides/train_and_test.html#id7) +- [自动混合精度训练](https://mmpose.readthedocs.io/zh_CN/dev-1.x/user_guides/train_and_test.html#amp) +- [设置随机种子](https://mmpose.readthedocs.io/zh_CN/dev-1.x/user_guides/train_and_test.html#id10) ``` ### 训练配置 diff --git a/docs/zh_cn/user_guides/useful_tools.md b/docs/zh_cn/user_guides/useful_tools.md deleted file mode 100644 index f2ceb771b7..0000000000 --- a/docs/zh_cn/user_guides/useful_tools.md +++ /dev/null @@ -1,5 +0,0 @@ -# 常用工具 - -中文内容建设中,暂时请查阅[英文版文档](../../en/user_guides/useful_tools.md) - -如果您愿意参与中文文档的翻译与维护,我们团队将十分感谢您的贡献!欢迎加入我们的社区群与我们取得联系,或直接按照 [如何给 MMPose 贡献代码](../contribution_guide.md) 在 GitHub 上提交 Pull Request。 diff --git a/docs/zh_cn/user_guides/visualization.md b/docs/zh_cn/user_guides/visualization.md deleted file mode 100644 index a584eb450e..0000000000 --- a/docs/zh_cn/user_guides/visualization.md +++ /dev/null @@ -1,5 +0,0 @@ -# 可视化 - -中文内容建设中,暂时请查阅[英文版文档](../../en/user_guides/visualization.md) - -如果您愿意参与中文文档的翻译与维护,我们团队将十分感谢您的贡献!欢迎加入我们的社区群与我们取得联系,或直接按照 [如何给 MMPose 贡献代码](../contribution_guide.md) 在 GitHub 上提交 Pull Request。 From 947f013c435f0aa0697e01210cc25aae0c8ddc10 Mon Sep 17 00:00:00 2001 From: Tau Date: Fri, 21 Jul 2023 01:56:34 +0800 Subject: [PATCH 13/37] [Docs] add details (#2558) --- docs/en/advanced_guides/customize_datasets.md | 4 +- docs/en/guide_to_framework.md | 62 +++++++++++++--- docs/en/user_guides/configs.md | 19 +++++ .../advanced_guides/customize_datasets.md | 4 +- docs/zh_cn/guide_to_framework.md | 70 ++++++++++++++----- docs/zh_cn/overview.md | 10 --- docs/zh_cn/user_guides/configs.md | 19 +++++ 7 files changed, 149 insertions(+), 39 deletions(-) diff --git a/docs/en/advanced_guides/customize_datasets.md b/docs/en/advanced_guides/customize_datasets.md index 1aac418812..68efb1b0c2 100644 --- a/docs/en/advanced_guides/customize_datasets.md +++ b/docs/en/advanced_guides/customize_datasets.md @@ -77,8 +77,8 @@ An example of the dataset config is as follows. 1. `name`: the keypoint name. 
The keypoint name must be unique. 2. `id`: the keypoint id. 3. `color`: (\[B, G, R\]) is used for keypoint visualization. -4. `type`: 'upper' or 'lower', will be used in data augmentation. -5. `swap`: indicates the 'swap pair' (also known as 'flip pair'). When applying image horizontal flip, the left part will become the right part. We need to flip the keypoints accordingly. +4. `type`: 'upper' or 'lower', will be used in data augmentation [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/b225a773d168fc2afd48cde5f76c0202d1ba2f52/mmpose/datasets/transforms/common_transforms.py#L263). +5. `swap`: indicates the 'swap pair' (also known as 'flip pair'). When applying image horizontal flip, the left part will become the right part, used in data augmentation [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94). We need to flip the keypoints accordingly. `skeleton_info` contains information about the keypoint connectivity, which is used for visualization. diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md index fceb37a142..1de2e68678 100644 --- a/docs/en/guide_to_framework.md +++ b/docs/en/guide_to_framework.md @@ -17,6 +17,7 @@ This tutorial covers what developers will concern when using MMPose 1.0: The content of this tutorial is organized as follows: - [A 20 Minute Guide to MMPose Framework](#a-20-minute-guide-to-mmpose-framework) + - [Structure](#structure) - [Overview](#overview) - [Step1: Configs](#step1-configs) - [Step2: Data](#step2-data) @@ -33,6 +34,47 @@ The content of this tutorial is organized as follows: - [Neck](#neck) - [Head](#head) +## Structure + +The file structure of MMPose 1.0 is as follows: + +```shell +mmpose +|----apis +|----structures +|----datasets + |----transforms +|----codecs +|----models + |----pose_estimators + |----data_preprocessors + |----backbones + |----necks + |----heads + |----losses +|----engine + |----hooks +|----evaluation +|----visualization +``` + +- **apis** provides high-level APIs for model inference +- **structures** provides data structures like bbox, keypoint and PoseDataSample +- **datasets** supports various datasets for pose estimation + - **transforms** contains a lot of useful data augmentation transforms +- **codecs** provides pose encoders and decoders: an encoder encodes poses (mostly keypoints) into learning targets (e.g. 
heatmaps), and a decoder decodes model outputs into pose predictions +- **models** provides all components of pose estimation models in a modular structure + - **pose_estimators** defines all pose estimation model classes + - **data_preprocessors** is for preprocessing the input data of the model + - **backbones** provides a collection of backbone networks + - **necks** contains various neck modules + - **heads** contains various prediction heads that perform pose estimation + - **losses** contains various loss functions +- **engine** provides runtime components related to pose estimation + - **hooks** provides various hooks of the runner +- **evaluation** provides metrics for evaluating model performance +- **visualization** is for visualizing skeletons, heatmaps and other information + ## Overview ![overall-en](https://user-images.githubusercontent.com/13503330/187372008-2a94bad5-5252-4155-9ae3-3da1c426f569.png) @@ -62,9 +104,7 @@ Note that all new modules need to be registered using `Registry` and imported in The organization of data in MMPose contains: - Dataset Meta Information - - Dataset - - Pipeline ### Dataset Meta Information @@ -264,6 +304,10 @@ When supporting MPII dataset, since we need to use `head_size` to calculate `PCK To support a dataset that is beyond the scope of [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py), you may need to subclass from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to the [documents](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. +```{note} +If you wish to customize a new dataset, you can refer to [Customize Datasets](./advanced_guides/customize_datasets.md) for more details. +``` + ### Pipeline Data augmentations and transformations during pre-processing are organized as a pipeline. Here is an example of typical pipelines: @@ -306,21 +350,21 @@ In MMPose, the modules used for data transformation are under `[$MMPOSE/mmpose/d #### i. Augmentation -Commonly used transforms are defined in [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py), such as `RandomFlip`, `RandomHalfBody`, etc. +Commonly used transforms are defined in [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py), such as [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94), [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L263), etc. -For top-down methods, `Shift`, `Rotate`and `Resize` are implemented by `RandomBBoxTransform`**.** For bottom-up methods, `BottomupRandomAffine` is used. +For top-down methods, `Shift`, `Rotate`and `Resize` are implemented by [RandomBBoxTransform](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L433). For bottom-up methods, [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) is used. ```{note} -Most data transforms depend on `bbox_center` and `bbox_scale`, which can be obtained by `GetBBoxCenterScale`. 
+Most data transforms depend on `bbox_center` and `bbox_scale`, which can be obtained by [GetBBoxCenterScale](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L31). ``` #### ii. Transformation -Affine transformation is used to convert images and annotations from the original image space to the input space. This is done by `TopdownAffine` for top-down methods and `BottomupRandomAffine` for bottom-up methods. +Affine transformation is used to convert images and annotations from the original image space to the input space. This is done by [TopdownAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/topdown_transforms.py#L14) for top-down methods and [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) for bottom-up methods. #### iii. Encoding -In training phase, after the data is transformed from the original image space into the input space, it is necessary to use `GenerateTarget` to obtain the training target(e.g. Gaussian Heatmaps). We name this process **Encoding**. Conversely, the process of getting the corresponding coordinates from Gaussian Heatmaps is called **Decoding**. +In training phase, after the data is transformed from the original image space into the input space, it is necessary to use [GenerateTarget](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L873) to obtain the training target(e.g. Gaussian Heatmaps). We name this process **Encoding**. Conversely, the process of getting the corresponding coordinates from Gaussian Heatmaps is called **Decoding**. In MMPose, we collect Encoding and Decoding processes into a **Codec**, in which `encode()` and `decode()` are implemented. @@ -360,7 +404,7 @@ If you wish to customize a new codec, you can refer to [Codec](./user_guides/cod After the data is transformed, you need to pack it using [PackPoseInputs](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/formatting.py). -This method converts the data stored in the dictionary `results` into standard data structures in MMPose, such as `InstanceData`, `PixelData`, `PoseDataSample`, etc. +This method converts the data stored in the dictionary `results` into standard data structures in MMPose, such as `InstanceData`, `PixelData`, [PoseDataSample](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/structures/pose_data_sample.py), etc. Specifically, we divide the data into `gt` (ground-truth) and `pred` (prediction), each of which has the following types: @@ -368,7 +412,7 @@ Specifically, we divide the data into `gt` (ground-truth) and `pred` (prediction - **instance_labels**(torch.tensor): instance-level training labels (e.g. normalized coordinates, keypoint visibility) in the output scale space - **fields**(torch.tensor): pixel-level training labels or predictions (e.g. 
Gaussian Heatmaps) in the output scale space -The following is an example of the implementation of `PoseDataSample` under the hood: +The following is an example of the implementation of [PoseDataSample](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/structures/pose_data_sample.py) under the hood: ```Python def get_pose_data_sample(self): diff --git a/docs/en/user_guides/configs.md b/docs/en/user_guides/configs.md index b5861ae029..6ad9357cd7 100644 --- a/docs/en/user_guides/configs.md +++ b/docs/en/user_guides/configs.md @@ -2,6 +2,25 @@ We use python files as configs and incorporate modular and inheritance design into our config system, which is convenient to conduct various experiments. +## Structure + +The file structure of configs is as follows: + +```shell +configs +|----_base_ + |----datasets + |----default_runtime.py +|----animal_2d_keypoint +|----body_2d_keypoint +|----body_3d_keypoint +|----face_2d_keypoint +|----fashion_2d_keypoint +|----hand_2d_keypoint +|----hand_3d_keypoint +|----wholebody_2d_keypoint +``` + ## Introduction MMPose is equipped with a powerful config system. Cooperating with Registry, a config file can organize all the configurations in the form of python dictionaries and create instances of the corresponding modules. diff --git a/docs/zh_cn/advanced_guides/customize_datasets.md b/docs/zh_cn/advanced_guides/customize_datasets.md index 61b58dc929..2ff16bf9d0 100644 --- a/docs/zh_cn/advanced_guides/customize_datasets.md +++ b/docs/zh_cn/advanced_guides/customize_datasets.md @@ -88,8 +88,8 @@ config/_base_/datasets/custom.py 1. `name`: 关键点名称,必须是唯一的,例如 `nose`、`left_eye` 等。 2. `id`: 关键点 ID,必须是唯一的,从 0 开始。 3. `color`: 关键点可视化时的颜色,以 (\[B, G, R\]) 格式组织起来,用于可视化。 - 4. `type`: 关键点类型,可以是 `upper`、`lower` 或 \`\`,用于数据增强。 - 5. `swap`: 关键点交换关系,用于水平翻转数据增强。 + 4. `type`: 关键点类型,可以是 `upper`、`lower` 或 `''`,用于数据增强 [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/b225a773d168fc2afd48cde5f76c0202d1ba2f52/mmpose/datasets/transforms/common_transforms.py#L263)。 + 5. 
`swap`: 关键点交换关系,用于水平翻转数据增强 [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94)。 - `skeleton_info`:骨架连接关系,用于可视化。 - `joint_weights`:每个关键点的权重,用于损失函数计算。 - `sigma`:标准差,用于计算 OKS 分数,详细信息请参考 [keypoints-eval](https://cocodataset.org/#keypoints-eval)。 diff --git a/docs/zh_cn/guide_to_framework.md b/docs/zh_cn/guide_to_framework.md index b4c44192a5..a440b4871d 100644 --- a/docs/zh_cn/guide_to_framework.md +++ b/docs/zh_cn/guide_to_framework.md @@ -19,6 +19,7 @@ MMPose 1.0 采用了全新的模块结构设计以精简代码,提升运行效 以下是这篇教程的目录: - [20 分钟了解 MMPose 架构设计](#20-分钟了解-mmpose-架构设计) + - [文件结构](#文件结构) - [总览](#总览) - [Step1:配置文件](#step1配置文件) - [Step2:数据](#step2数据) @@ -35,6 +36,47 @@ MMPose 1.0 采用了全新的模块结构设计以精简代码,提升运行效 - [颈部模块(Neck)](#颈部模块neck) - [预测头(Head)](#预测头head) +## 文件结构 + +MMPose 1.0 的文件结构如下所示: + +```shell +mmpose +|----apis +|----structures +|----datasets + |----transforms +|----codecs +|----models + |----pose_estimators + |----data_preprocessors + |----backbones + |----necks + |----heads + |----losses +|----engine + |----hooks +|----evaluation +|----visualization +``` + +- **apis** 提供用于模型推理的高级 API +- **structures** 提供 bbox、keypoint 和 PoseDataSample 等数据结构 +- **datasets** 支持用于姿态估计的各种数据集 + - **transforms** 包含各种数据增强变换 +- **codecs** 提供姿态编解码器:编码器用于将姿态信息(通常为关键点坐标)编码为模型学习目标(如热力图),解码器则用于将模型输出解码为姿态估计结果 +- **models** 以模块化结构提供了姿态估计模型的各类组件 + - **pose_estimators** 定义了所有姿态估计模型类 + - **data_preprocessors** 用于预处理模型的输入数据 + - **backbones** 包含各种骨干网络 + - **necks** 包含各种模型颈部组件 + - **heads** 包含各种模型头部 + - **losses** 包含各种损失函数 +- **engine** 包含与姿态估计任务相关的运行时组件 + - **hooks** 提供运行时的各种钩子 +- **evaluation** 提供各种评估模型性能的指标 +- **visualization** 用于可视化关键点骨架和热力图等信息 + ## 总览 ![overall-cn](https://user-images.githubusercontent.com/13503330/187830967-f2d7bf40-6261-42f3-91a5-ae045fa0dc0c.png) @@ -262,6 +304,10 @@ class MpiiDataset(BaseCocoStyleDataset): 如果自定义数据集无法被 [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) 支持,你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html)。 +```{note} +如果你想自定义数据集,请参考 [自定义数据集](./advanced_guides/customize_datasets.md)。 +``` + ### 数据流水线 一个典型的数据流水线配置如下: @@ -304,46 +350,38 @@ test_pipeline = [ #### i. 
数据增强 -数据增强中常用的变换存放在 [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py) 中,如 `RandomFlip`、`RandomHalfBody` 等。 +数据增强中常用的变换存放在 [$MMPOSE/mmpose/datasets/transforms/common_transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py) 中,如 [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94)、[RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L263) 等。 -对于 top-down 方法,`Shift`、`Rotate`、`Resize` 操作由 `RandomBBoxTransform`来实现;对于 bottom-up 方法,这些则是由 `BottomupRandomAffine` 实现。 +对于 top-down 方法,`Shift`、`Rotate`、`Resize` 操作由 [RandomBBoxTransform](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L433) 来实现;对于 bottom-up 方法,这些则是由 [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) 实现。 ```{note} -值得注意的是,大部分数据变换都依赖于 `bbox_center` 和 `bbox_scale`,它们可以通过 `GetBBoxCenterScale` 来得到。 +值得注意的是,大部分数据变换都依赖于 `bbox_center` 和 `bbox_scale`,它们可以通过 [GetBBoxCenterScale](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L31) 来得到。 ``` #### ii. 数据变换 -我们使用仿射变换,将图像和坐标标注从原始图片空间变换到输入图片空间。这一操作在 top-down 方法中由 `TopdownAffine` 完成,在 bottom-up 方法中则由 `BottomupRandomAffine` 完成。 +我们使用仿射变换,将图像和坐标标注从原始图片空间变换到输入图片空间。这一操作在 top-down 方法中由 [TopdownAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/topdown_transforms.py#L14) 完成,在 bottom-up 方法中则由 [BottomupRandomAffine](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/bottomup_transforms.py#L134) 完成。 #### iii. 数据编码 -在模型训练时,数据从原始空间变换到输入图片空间后,需要使用 `GenerateTarget` 来生成训练所需的监督目标(比如用坐标值生成高斯热图),我们将这一过程称为编码(Encode),反之,通过高斯热图得到对应坐标值的过程称为解码(Decode)。 +在模型训练时,数据从原始空间变换到输入图片空间后,需要使用 [GenerateTarget](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L873) 来生成训练所需的监督目标(比如用坐标值生成高斯热图),我们将这一过程称为编码(Encode),反之,通过高斯热图得到对应坐标值的过程称为解码(Decode)。 在 MMPose 中,我们将编码和解码过程集合成一个编解码器(Codec),在其中实现 `encode()` 和 `decode()`。 目前 MMPose 支持生成以下类型的监督目标: - `heatmap`: 高斯热图 - - `keypoint_label`: 关键点标签(如归一化的坐标值) - - `keypoint_xy_label`: 单个坐标轴关键点标签 - - `heatmap+keypoint_label`: 同时生成高斯热图和关键点标签 - - `multiscale_heatmap`: 多尺度高斯热图 生成的监督目标会按以下关键字进行封装: - `heatmaps`:高斯热图 - - `keypoint_labels`:关键点标签(如归一化的坐标值) - - `keypoint_x_labels`:x 轴关键点标签 - - `keypoint_y_labels`:y 轴关键点标签 - - `keypoint_weights`:关键点权重 ```Python @@ -374,9 +412,9 @@ class GenerateTarget(BaseTransform): #### iv. 
数据打包 -数据经过前处理变换后,最终需要通过 `PackPoseInputs` 打包成数据样本。该操作定义在 [$MMPOSE/mmpose/datasets/transforms/formatting.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/formatting.py) 中。 +数据经过前处理变换后,最终需要通过 [PackPoseInputs](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/formatting.py) 打包成数据样本。 -打包过程会将数据流水线中用字典 `results` 存储的数据转换成用 MMPose 所需的标准数据结构, 如 `InstanceData`,`PixelData`,`PoseDataSample` 等。 +打包过程会将数据流水线中用字典 `results` 存储的数据转换成用 MMPose 所需的标准数据结构, 如 `InstanceData`,`PixelData`,[PoseDataSample](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/structures/pose_data_sample.py) 等。 具体而言,我们将数据样本内容分为 `gt`(标注真值) 和 `pred`(模型预测)两部分,它们都包含以下数据项: @@ -386,7 +424,7 @@ class GenerateTarget(BaseTransform): - **fields**(torch.tensor):像素级别的训练标签(如高斯热图)或预测结果,属于输出尺度空间 -下面是 `PoseDataSample` 底层实现的例子: +下面是 [PoseDataSample](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/structures/pose_data_sample.py) 底层实现的例子: ```Python def get_pose_data_sample(self): diff --git a/docs/zh_cn/overview.md b/docs/zh_cn/overview.md index a790cd3be2..7ecd429862 100644 --- a/docs/zh_cn/overview.md +++ b/docs/zh_cn/overview.md @@ -11,30 +11,20 @@ MMPose 是一款基于 Pytorch 的姿态估计开源工具箱,是 OpenMMLab MMPose 由 **8** 个主要部分组成,apis、structures、datasets、codecs、models、engine、evaluation 和 visualization。 - **apis** 提供用于模型推理的高级 API - - **structures** 提供 bbox、keypoint 和 PoseDataSample 等数据结构 - - **datasets** 支持用于姿态估计的各种数据集 - - **transforms** 包含各种数据增强变换 - - **codecs** 提供姿态编解码器:编码器用于将姿态信息(通常为关键点坐标)编码为模型学习目标(如热力图),解码器则用于将模型输出解码为姿态估计结果 - - **models** 以模块化结构提供了姿态估计模型的各类组件 - - **pose_estimators** 定义了所有姿态估计模型类 - **data_preprocessors** 用于预处理模型的输入数据 - **backbones** 包含各种骨干网络 - **necks** 包含各种模型颈部组件 - **heads** 包含各种模型头部 - **losses** 包含各种损失函数 - - **engine** 包含与姿态估计任务相关的运行时组件 - - **hooks** 提供运行时的各种钩子 - - **evaluation** 提供各种评估模型性能的指标 - - **visualization** 用于可视化关键点骨架和热力图等信息 ## 如何使用本指南 diff --git a/docs/zh_cn/user_guides/configs.md b/docs/zh_cn/user_guides/configs.md index c24d8c7b3e..bac64c8869 100644 --- a/docs/zh_cn/user_guides/configs.md +++ b/docs/zh_cn/user_guides/configs.md @@ -2,6 +2,25 @@ MMPose 使用 Python 文件作为配置文件,将模块化设计和继承设计结合到配置系统中,便于进行各种实验。 +## 目录结构 + +MMPose 的配置文件目录结构如下: + +```shell +configs +|----_base_ + |----datasets + |----default_runtime.py +|----animal_2d_keypoint +|----body_2d_keypoint +|----body_3d_keypoint +|----face_2d_keypoint +|----fashion_2d_keypoint +|----hand_2d_keypoint +|----hand_3d_keypoint +|----wholebody_2d_keypoint +``` + ## 简介 MMPose 拥有一套强大的配置系统,在注册器的配合下,用户可以通过一个配置文件来定义整个项目需要用到的所有内容,以 Python 字典形式组织配置信息,传递给注册器完成对应模块的实例化。 From 482e24b4ff891a52ce608df36161d3a042f5ca50 Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Fri, 21 Jul 2023 13:03:54 +0800 Subject: [PATCH 14/37] [Refactor] 3d human pose demo (#2554) --- demo/body3d_pose_lifter_demo.py | 248 ++++++++++++++++------------- demo/docs/en/3d_human_pose_demo.md | 28 ++-- 2 files changed, 152 insertions(+), 124 deletions(-) diff --git a/demo/body3d_pose_lifter_demo.py b/demo/body3d_pose_lifter_demo.py index 256894fb3c..72e7b93958 100644 --- a/demo/body3d_pose_lifter_demo.py +++ b/demo/body3d_pose_lifter_demo.py @@ -12,7 +12,6 @@ import mmengine import numpy as np from mmengine.logging import print_log -from mmengine.structures import InstanceData from mmpose.apis import (_track_by_iou, _track_by_oks, collect_multi_frames, convert_keypoint_definition, extract_pose_sequence, @@ -59,12 +58,13 @@ def parse_args(): default=False, help='Whether to show visualizations') parser.add_argument( - '--rebase-keypoint-height', + 
'--disable-rebase-keypoint', action='store_true', - help='Rebase the predicted 3D pose so its lowest keypoint has a ' - 'height of 0 (landing on the ground). This is useful for ' - 'visualization when the model do not predict the global position ' - 'of the 3D pose.') + default=False, + help='Whether to disable rebasing the predicted 3D pose so its ' + 'lowest keypoint has a height of 0 (landing on the ground). Rebase ' + 'is useful for visualization when the model do not predict the ' + 'global position of the 3D pose.') parser.add_argument( '--norm-pose-2d', action='store_true', @@ -75,7 +75,7 @@ def parse_args(): parser.add_argument( '--num-instances', type=int, - default=-1, + default=1, help='The number of 3D poses to be visualized in every frame. If ' 'less than 0, it will be set to the number of pose results in the ' 'first frame.') @@ -130,16 +130,74 @@ def parse_args(): return args -def get_area(results): - for i, data_sample in enumerate(results): - pred_instance = data_sample.pred_instances.cpu().numpy() - if 'bboxes' in pred_instance: - bboxes = pred_instance.bboxes - results[i].pred_instances.set_field( - np.array([(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) - for bbox in bboxes]), 'areas') +def process_one_image(args, detector, frame, frame_idx, pose_estimator, + pose_est_frame, pose_est_results_last, + pose_est_results_list, next_id, pose_lifter, + visualize_frame, visualizer): + """Visualize detected and predicted keypoints of one image. + + Args: + args (Argument): Custom command-line arguments. + detector (mmdet.BaseDetector): The mmdet detector. + frame (np.ndarray): The image frame read from input image or video. + frame_idx (int): The index of current frame. + pose_estimator (TopdownPoseEstimator): The pose estimator for 2d pose. + pose_est_frame (np.ndarray | list(np.ndarray)): The frames for pose + estimation. + pose_est_results_last (list(PoseDataSample)): The results of pose + estimation from the last frame for tracking instances. + pose_est_results_list (list(list(PoseDataSample))): The list of all + pose estimation results converted by + ``convert_keypoint_definition`` from previous frames. In + pose-lifting stage it is used to obtain the 2d estimation sequence. + next_id (int): The next track id to be used. + pose_lifter (PoseLifter): The pose-lifter for estimating 3d pose. + visualize_frame (np.ndarray): The image for drawing the results on. + visualizer (Visualizer): The visualizer for visualizing the 2d and 3d + pose estimation results. + + Returns: + pose_est_results (list(PoseDataSample)): The pose estimation result of + the current frame. + pose_est_results_list (list(list(PoseDataSample))): The list of all + converted pose estimation results until the current frame. + pred_3d_instances (InstanceData): The result of pose-lifting. + Specifically, the predicted keypoints and scores are saved at + ``pred_3d_instances.keypoints`` and + ``pred_3d_instances.keypoint_scores``. + next_id (int): The next track id to be used. 
+ """ + pose_lift_dataset = pose_lifter.cfg.test_dataloader.dataset + + det_result = inference_detector(detector, frame) + pred_instance = det_result.pred_instances.cpu().numpy() + + # First stage: 2D pose detection + bboxes = pred_instance.bboxes + bboxes = bboxes[np.logical_and(pred_instance.labels == args.det_cat_id, + pred_instance.scores > args.bbox_thr)] + + # estimate pose results for current image + pose_est_results = inference_topdown(pose_estimator, pose_est_frame, + bboxes) + + if args.use_oks_tracking: + _track = partial(_track_by_oks) + else: + _track = _track_by_iou + + pose_det_dataset = pose_estimator.cfg.test_dataloader.dataset + pose_est_results_converted = [] + + for i, data_sample in enumerate(pose_est_results): + pred_instances = data_sample.pred_instances.cpu().numpy() + keypoints = pred_instances.keypoints + # calculate area and bbox + if 'bboxes' in pred_instances: + areas = np.array([(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + for bbox in pred_instances.bboxes]) + pose_est_results[i].pred_instances.set_field(areas, 'areas') else: - keypoints = pred_instance.keypoints areas, bboxes = [], [] for keypoint in keypoints: xmin = np.min(keypoint[:, 0][keypoint[:, 0] > 0], initial=1e10) @@ -148,32 +206,16 @@ def get_area(results): ymax = np.max(keypoint[:, 1]) areas.append((xmax - xmin) * (ymax - ymin)) bboxes.append([xmin, ymin, xmax, ymax]) - results[i].pred_instances.areas = np.array(areas) - results[i].pred_instances.bboxes = np.array(bboxes) - return results + pose_est_results[i].pred_instances.areas = np.array(areas) + pose_est_results[i].pred_instances.bboxes = np.array(bboxes) - -def get_pose_est_results(args, pose_estimator, frame, bboxes, - pose_est_results_last, next_id, pose_lift_dataset): - pose_det_dataset = pose_estimator.cfg.test_dataloader.dataset - - # make person results for current image - pose_est_results = inference_topdown(pose_estimator, frame, bboxes) - - pose_est_results = get_area(pose_est_results) - if args.use_oks_tracking: - _track = partial(_track_by_oks) - else: - _track = _track_by_iou - - for i, result in enumerate(pose_est_results): - track_id, pose_est_results_last, match_result = _track( - result, pose_est_results_last, args.tracking_thr) + # track id + track_id, pose_est_results_last, _ = _track(data_sample, + pose_est_results_last, + args.tracking_thr) if track_id == -1: - pred_instances = result.pred_instances.cpu().numpy() - keypoints = pred_instances.keypoints if np.count_nonzero(keypoints[:, :, 1]) >= 3: - pose_est_results[i].set_field(next_id, 'track_id') + track_id = next_id next_id += 1 else: # If the number of keypoints detected is small, @@ -181,39 +223,30 @@ def get_pose_est_results(args, pose_estimator, frame, bboxes, keypoints[:, :, 1] = -10 pose_est_results[i].pred_instances.set_field( keypoints, 'keypoints') - bboxes = pred_instances.bboxes * 0 - pose_est_results[i].pred_instances.set_field(bboxes, 'bboxes') - pose_est_results[i].set_field(-1, 'track_id') + pose_est_results[i].pred_instances.set_field( + pred_instances.bboxes * 0, 'bboxes') pose_est_results[i].set_field(pred_instances, 'pred_instances') - else: - pose_est_results[i].set_field(track_id, 'track_id') + track_id = -1 + pose_est_results[i].set_field(track_id, 'track_id') - del match_result - - pose_est_results_converted = [] - for pose_est_result in pose_est_results: + # convert keypoints for pose-lifting pose_est_result_converted = PoseDataSample() - gt_instances = InstanceData() - pred_instances = InstanceData() - for k in 
pose_est_result.gt_instances.keys(): - gt_instances.set_field(pose_est_result.gt_instances[k], k) - for k in pose_est_result.pred_instances.keys(): - pred_instances.set_field(pose_est_result.pred_instances[k], k) - pose_est_result_converted.gt_instances = gt_instances - pose_est_result_converted.pred_instances = pred_instances - pose_est_result_converted.track_id = pose_est_result.track_id - - keypoints = convert_keypoint_definition(pred_instances.keypoints, + pose_est_result_converted.set_field( + pose_est_results[i].pred_instances.clone(), 'pred_instances') + pose_est_result_converted.set_field( + pose_est_results[i].gt_instances.clone(), 'gt_instances') + keypoints = convert_keypoint_definition(keypoints, pose_det_dataset['type'], pose_lift_dataset['type']) - pose_est_result_converted.pred_instances.keypoints = keypoints + pose_est_result_converted.pred_instances.set_field( + keypoints, 'keypoints') + pose_est_result_converted.set_field(pose_est_results[i].track_id, + 'track_id') pose_est_results_converted.append(pose_est_result_converted) - return pose_est_results, pose_est_results_converted, next_id + pose_est_results_list.append(pose_est_results_converted.copy()) -def get_pose_lift_results(args, visualizer, pose_lifter, pose_est_results_list, - frame, frame_idx, pose_est_results): - pose_lift_dataset = pose_lifter.cfg.test_dataloader.dataset + # Second stage: Pose lifting # extract and pad input pose2d sequence pose_seq_2d = extract_pose_sequence( pose_est_results_list, @@ -223,18 +256,17 @@ def get_pose_lift_results(args, visualizer, pose_lifter, pose_est_results_list, step=pose_lift_dataset.get('seq_step', 1)) # 2D-to-3D pose lifting - width, height = frame.shape[:2] pose_lift_results = inference_pose_lifter_model( pose_lifter, pose_seq_2d, - image_size=(width, height), + image_size=visualize_frame.shape[:2], norm_pose_2d=args.norm_pose_2d) - # Pose processing - for idx, pose_lift_res in enumerate(pose_lift_results): - pose_lift_res.track_id = pose_est_results[idx].get('track_id', 1e4) + # post-processing + for idx, pose_lift_result in enumerate(pose_lift_results): + pose_lift_result.track_id = pose_est_results[idx].get('track_id', 1e4) - pred_instances = pose_lift_res.pred_instances + pred_instances = pose_lift_result.pred_instances keypoints = pred_instances.keypoints keypoint_scores = pred_instances.keypoint_scores if keypoint_scores.ndim == 3: @@ -249,7 +281,7 @@ def get_pose_lift_results(args, visualizer, pose_lifter, pose_est_results_list, keypoints[..., 2] = -keypoints[..., 2] # rebase height (z-axis) - if args.rebase_keypoint_height: + if not args.disable_rebase_keypoint: keypoints[..., 2] -= np.min( keypoints[..., 2], axis=-1, keepdims=True) @@ -260,6 +292,7 @@ def get_pose_lift_results(args, visualizer, pose_lifter, pose_est_results_list, pred_3d_data_samples = merge_data_samples(pose_lift_results) det_data_sample = merge_data_samples(pose_est_results) + pred_3d_instances = pred_3d_data_samples.get('pred_instances', None) if args.num_instances < 0: args.num_instances = len(pose_lift_results) @@ -268,7 +301,7 @@ def get_pose_lift_results(args, visualizer, pose_lifter, pose_est_results_list, if visualizer is not None: visualizer.add_datasample( 'result', - frame, + visualize_frame, data_sample=pred_3d_data_samples, det_data_sample=det_data_sample, draw_gt=False, @@ -278,17 +311,7 @@ def get_pose_lift_results(args, visualizer, pose_lifter, pose_est_results_list, num_instances=args.num_instances, wait_time=args.show_interval) - return 
pred_3d_data_samples.get('pred_instances', None) - - -def get_bbox(args, detector, frame): - det_result = inference_detector(detector, frame) - pred_instance = det_result.pred_instances.cpu().numpy() - - bboxes = pred_instance.bboxes - bboxes = bboxes[np.logical_and(pred_instance.labels == args.det_cat_id, - pred_instance.scores > args.bbox_thr)] - return bboxes + return pose_est_results, pose_est_results_list, pred_3d_instances, next_id def main(): @@ -333,7 +356,6 @@ def main(): assert isinstance(pose_lifter, PoseLifter), \ 'Only "PoseLifter" model is supported for the 2nd stage ' \ '(2D-to-3D lifting)' - pose_lift_dataset = pose_lifter.cfg.test_dataloader.dataset pose_lifter.cfg.visualizer.radius = args.radius pose_lifter.cfg.visualizer.line_width = args.thickness @@ -372,19 +394,23 @@ def main(): pred_instances_list = [] if input_type == 'image': frame = mmcv.imread(args.input, channel_order='rgb') - - # First stage: 2D pose detection - bboxes = get_bbox(args, detector, frame) - pose_est_results, pose_est_results_converted, _ = get_pose_est_results( - args, pose_estimator, frame, bboxes, [], 0, pose_lift_dataset) - pose_est_results_list.append(pose_est_results_converted.copy()) - pred_3d_pred = get_pose_lift_results(args, visualizer, pose_lifter, - pose_est_results_list, frame, 0, - pose_est_results) + _, _, pred_3d_instances, _ = process_one_image( + args=args, + detector=detector, + frame=frame, + frame_idx=0, + pose_estimator=pose_estimator, + pose_est_frame=frame, + pose_est_results_last=[], + pose_est_results_list=pose_est_results_list, + next_id=0, + pose_lifter=pose_lifter, + visualize_frame=frame, + visualizer=visualizer) if args.save_predictions: # save prediction results - pred_instances_list = split_instances(pred_3d_pred) + pred_instances_list = split_instances(pred_3d_instances) if save_output: frame_vis = visualizer.get_image() @@ -392,7 +418,7 @@ def main(): elif input_type in ['webcam', 'video']: next_id = 0 - pose_est_results_converted = [] + pose_est_results = [] if args.input == 'webcam': video = cv2.VideoCapture(0) @@ -415,33 +441,37 @@ def main(): if not success: break - pose_est_results_last = pose_est_results_converted + pose_est_results_last = pose_est_results # First stage: 2D pose detection + pose_est_frame = frame if args.use_multi_frames: frames = collect_multi_frames(video, frame_idx, indices, args.online) + pose_est_frame = frames # make person results for current image - bboxes = get_bbox(args, detector, frame) - pose_est_results, pose_est_results_converted, next_id = get_pose_est_results( # noqa: E501 - args, pose_estimator, - frames if args.use_multi_frames else frame, bboxes, - pose_est_results_last, next_id, pose_lift_dataset) - pose_est_results_list.append(pose_est_results_converted.copy()) - - # Second stage: Pose lifting - pred_3d_pred = get_pose_lift_results(args, visualizer, pose_lifter, - pose_est_results_list, - mmcv.bgr2rgb(frame), - frame_idx, pose_est_results) + (pose_est_results, pose_est_results_list, pred_3d_instances, + next_id) = process_one_image( + args=args, + detector=detector, + frame=frame, + frame_idx=frame_idx, + pose_estimator=pose_estimator, + pose_est_frame=pose_est_frame, + pose_est_results_last=pose_est_results_last, + pose_est_results_list=pose_est_results_list, + next_id=next_id, + pose_lifter=pose_lifter, + visualize_frame=mmcv.bgr2rgb(frame), + visualizer=visualizer) if args.save_predictions: # save prediction results pred_instances_list.append( dict( frame_id=frame_idx, - instances=split_instances(pred_3d_pred))) + 
instances=split_instances(pred_3d_instances))) if save_output: frame_vis = visualizer.get_image() diff --git a/demo/docs/en/3d_human_pose_demo.md b/demo/docs/en/3d_human_pose_demo.md index 367d98c403..b46c740de6 100644 --- a/demo/docs/en/3d_human_pose_demo.md +++ b/demo/docs/en/3d_human_pose_demo.md @@ -18,22 +18,22 @@ ${MMPOSE_CONFIG_FILE_3D} \ ${MMPOSE_CHECKPOINT_FILE_3D} \ --input ${VIDEO_PATH or IMAGE_PATH or 'webcam'} \ [--show] \ -[--rebase-keypoint-height] \ +[--disable-rebase-keypoint] \ [--norm-pose-2d] \ -[--num-instances] \ +[--num-instances ${NUM_INSTANCES}] \ [--output-root ${OUT_VIDEO_ROOT}] \ -[--save-predictions] [--save-predictions] \ [--device ${GPU_ID or CPU}] \ -[--det-cat-id DET_CAT_ID] \ -[--bbox-thr BBOX_THR] \ -[--kpt-thr KPT_THR] \ +[--det-cat-id ${DET_CAT_ID}] \ +[--bbox-thr ${BBOX_THR}] \ +[--kpt-thr ${KPT_THR}] \ [--use-oks-tracking] \ -[--tracking-thr TRACKING_THR] \ -[--show-interval INTERVAL] \ -[--thickness THICKNESS] \ -[--radius RADIUS] \ -[--use-multi-frames] [--online] +[--tracking-thr ${TRACKING_THR}] \ +[--show-interval ${INTERVAL}] \ +[--thickness ${THICKNESS}] \ +[--radius ${RADIUS}] \ +[--use-multi-frames] \ +[--online] ``` Note that @@ -58,7 +58,7 @@ configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth \ --input https://user-images.githubusercontent.com/87690686/164970135-b14e424c-765a-4180-9bc8-fa8d6abc5510.mp4 \ --output-root vis_results \ ---rebase-keypoint-height --save-predictions +--save-predictions ``` During 2D pose detection, for multi-frame inference that rely on extra frames to get the final results of the current frame, try this: @@ -73,7 +73,6 @@ configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth \ --input https://user-images.githubusercontent.com/87690686/164970135-b14e424c-765a-4180-9bc8-fa8d6abc5510.mp4 \ --output-root vis_results \ ---rebase-keypoint-height \ --use-multi-frames --online ``` @@ -83,8 +82,7 @@ The Inferencer provides a convenient interface for inference, allowing customiza ```shell python demo/inferencer_demo.py tests/data/coco/000000000785.jpg \ - --pose3d human3d --vis-out-dir vis_results/human3d \ - --rebase-keypoint-height + --pose3d human3d --vis-out-dir vis_results/human3d ``` This command infers the image and saves the visualization results in the `vis_results/human3d` directory. 
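Since this patch inverts the default behaviour (the predicted 3D poses are now always rebased unless `--disable-rebase-keypoint` is passed, which is why `--rebase-keypoint-height` disappears from the example commands above), a minimal sketch of the rebase step may help make the flag's effect concrete. The array shape and random values below are illustrative assumptions; only the subtraction line mirrors the logic in `demo/body3d_pose_lifter_demo.py`.

```python
import numpy as np

# toy 3D predictions with an assumed (num_instances, num_keypoints, xyz) layout
keypoints = np.random.rand(2, 17, 3).astype(np.float32)

disable_rebase_keypoint = False  # default in the refactored demo
if not disable_rebase_keypoint:
    # shift each pose along the z-axis so its lowest keypoint lands at height 0
    keypoints[..., 2] -= np.min(keypoints[..., 2], axis=-1, keepdims=True)
```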
From 58d2691f79c857ffebb69a47b46f50c713520d5a Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Fri, 21 Jul 2023 13:45:18 +0800 Subject: [PATCH 15/37] [Docs] Update MotionBERT docs (#2559) --- configs/body_3d_keypoint/pose_lift/README.md | 11 ++++++++++- .../pose_lift/h36m/motionbert_h36m.md | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/configs/body_3d_keypoint/pose_lift/README.md b/configs/body_3d_keypoint/pose_lift/README.md index e3e6ff7176..66cd0548ae 100644 --- a/configs/body_3d_keypoint/pose_lift/README.md +++ b/configs/body_3d_keypoint/pose_lift/README.md @@ -26,7 +26,16 @@ For single-person 3D pose estimation from a monocular camera, existing works can | [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | | [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | | [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 27.7 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | -| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 21.6 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m.py) | 27.5 | 21.6 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | + +#### Human3.6m Dataset from official repo 1 + +| Arch | MPJPE | Average MPJPE | P-MPJPE | ckpt | log | Details and Download | +| :------------------------------------------------------------- | :---: | :-----------: | :-----: | :-------------------------------------------------------------: | :-: | :---------------------------------------------: | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | 
[motionbert_h36m.md](./h36m/motionbert_h36m.md) | + +1 Please refer to the [doc](./h36m/motionbert_h36m.md) for more details. *Models with * are converted from the official repo. The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md index 93cd29eddd..f23aa13a2e 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md @@ -50,6 +50,6 @@ Testing results on Human3.6M dataset converted from the [official repo](https:// | [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | | [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | -1 To test with the dataset from official repo, please download the [test annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_test_original.npz), [train annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_train_original.npz) and [factors](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_factors.npy) under `$MMPOSE/data/h36m/annotation_body3d/fps50`. +1 By default, we test models with [Human 3.6m dataset](/docs/en/dataset_zoo/3d_body_keypoint.md#human3-6m) processed by MMPose. The official repo's dataset includes more data and applies a different pre-processing technique. To achieve the same result with the official repo, please download the [test annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_test_original.npz), [train annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_train_original.npz) and [factors](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_factors.npy) under `$MMPOSE/data/h36m/annotation_body3d/fps50` and test with the configs we provided. *Models with * are converted from the [official repo](https://github.com/Walter0807/MotionBERT). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* From cbbea686372be161a963a28ca7a489ba3699aed0 Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Fri, 21 Jul 2023 15:14:08 +0800 Subject: [PATCH 16/37] [Refactor] Update the arguments of 3d inferencer to align with the demo script (#2561) --- demo/inferencer_demo.py | 18 ++++++--- docs/en/user_guides/inference.md | 39 ++++++++++---------- docs/zh_cn/user_guides/inference.md | 39 ++++++++++---------- mmpose/apis/inferencers/mmpose_inferencer.py | 4 +- mmpose/apis/inferencers/pose3d_inferencer.py | 18 ++++++--- 5 files changed, 67 insertions(+), 51 deletions(-) diff --git a/demo/inferencer_demo.py b/demo/inferencer_demo.py index b91e91f74b..7053768e69 100644 --- a/demo/inferencer_demo.py +++ b/demo/inferencer_demo.py @@ -104,12 +104,20 @@ def parse_args(): 'the average bbox center of the dataset. 
This is useful when bbox ' 'is small, especially in multi-person scenarios.') parser.add_argument( - '--rebase-keypoint-height', + '--disable-rebase-keypoint', action='store_true', - help='Rebase the predicted 3D pose so its lowest keypoint has a ' - 'height of 0 (landing on the ground). This is useful for ' - 'visualization when the model do not predict the global position ' - 'of the 3D pose.') + default=False, + help='Whether to disable rebasing the predicted 3D pose so its ' + 'lowest keypoint has a height of 0 (landing on the ground). Rebase ' + 'is useful for visualization when the model do not predict the ' + 'global position of the 3D pose.') + parser.add_argument( + '--num-instances', + type=int, + default=1, + help='The number of 3D poses to be visualized in every frame. If ' + 'less than 0, it will be set to the number of pose results in the ' + 'first frame.') parser.add_argument( '--radius', type=int, diff --git a/docs/en/user_guides/inference.md b/docs/en/user_guides/inference.md index fa51aa20fa..518b2e89d3 100644 --- a/docs/en/user_guides/inference.md +++ b/docs/en/user_guides/inference.md @@ -235,25 +235,26 @@ The `MMPoseInferencer` offers a variety of arguments for customizing pose estima The inferencer is designed for both visualization and saving predictions. The table below presents the list of arguments available when using the `MMPoseInferencer` for inference, along with their compatibility with 2D and 3D inferencing: -| Argument | Description | 2D | 3D | -| ------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | --- | -| `show` | Controls the display of the image or video in a pop-up window. | ✔️ | ✔️ | -| `radius` | Sets the visualization keypoint radius. | ✔️ | ✔️ | -| `thickness` | Determines the link thickness for visualization. | ✔️ | ✔️ | -| `kpt_thr` | Sets the keypoint score threshold. Keypoints with scores exceeding this threshold will be displayed. | ✔️ | ✔️ | -| `draw_bbox` | Decides whether to display the bounding boxes of instances. | ✔️ | ✔️ | -| `draw_heatmap` | Decides if the predicted heatmaps should be drawn. | ✔️ | ❌ | -| `black_background` | Decides whether the estimated poses should be displayed on a black background. | ✔️ | ❌ | -| `skeleton_style` | Sets the skeleton style. Options include 'mmpose' (default) and 'openpose'. | ✔️ | ❌ | -| `use_oks_tracking` | Decides whether to use OKS as a similarity measure in tracking. | ❌ | ✔️ | -| `tracking_thr` | Sets the similarity threshold for tracking. | ❌ | ✔️ | -| `norm_pose_2d` | Decides whether to scale the bounding box to the dataset's average bounding box scale and relocate the bounding box to the dataset's average bounding box center. | ❌ | ✔️ | -| `rebase_keypoint_height` | Decides whether to set the lowest keypoint with height 0. | ❌ | ✔️ | -| `return_vis` | Decides whether to include visualization images in the results. | ✔️ | ✔️ | -| `vis_out_dir` | Defines the folder path to save the visualization images. If unset, the visualization images will not be saved. | ✔️ | ✔️ | -| `return_datasample` | Determines if the prediction should be returned in the `PoseDataSample` format. | ✔️ | ✔️ | -| `pred_out_dir` | Specifies the folder path to save the predictions. If unset, the predictions will not be saved. 
| ✔️ | ✔️ | -| `out_dir` | If `vis_out_dir` or `pred_out_dir` is unset, these will be set to `f'{out_dir}/visualization'` or `f'{out_dir}/predictions'`, respectively. | ✔️ | ✔️ | +| Argument | Description | 2D | 3D | +| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | --- | +| `show` | Controls the display of the image or video in a pop-up window. | ✔️ | ✔️ | +| `radius` | Sets the visualization keypoint radius. | ✔️ | ✔️ | +| `thickness` | Determines the link thickness for visualization. | ✔️ | ✔️ | +| `kpt_thr` | Sets the keypoint score threshold. Keypoints with scores exceeding this threshold will be displayed. | ✔️ | ✔️ | +| `draw_bbox` | Decides whether to display the bounding boxes of instances. | ✔️ | ✔️ | +| `draw_heatmap` | Decides if the predicted heatmaps should be drawn. | ✔️ | ❌ | +| `black_background` | Decides whether the estimated poses should be displayed on a black background. | ✔️ | ❌ | +| `skeleton_style` | Sets the skeleton style. Options include 'mmpose' (default) and 'openpose'. | ✔️ | ❌ | +| `use_oks_tracking` | Decides whether to use OKS as a similarity measure in tracking. | ❌ | ✔️ | +| `tracking_thr` | Sets the similarity threshold for tracking. | ❌ | ✔️ | +| `norm_pose_2d` | Decides whether to scale the bounding box to the dataset's average bounding box scale and relocate the bounding box to the dataset's average bounding box center. | ❌ | ✔️ | +| `disable_rebase_keypoint` | Decides whether to set the lowest keypoint with height 0. | ❌ | ✔️ | +| `num_instances` | Sets the number of instances to visualize in the results. If set to a negative number, all detected instances will be visualized. | ❌ | ✔️ | +| `return_vis` | Decides whether to include visualization images in the results. | ✔️ | ✔️ | +| `vis_out_dir` | Defines the folder path to save the visualization images. If unset, the visualization images will not be saved. | ✔️ | ✔️ | +| `return_datasample` | Determines if the prediction should be returned in the `PoseDataSample` format. | ✔️ | ✔️ | +| `pred_out_dir` | Specifies the folder path to save the predictions. If unset, the predictions will not be saved. | ✔️ | ✔️ | +| `out_dir` | If `vis_out_dir` or `pred_out_dir` is unset, these will be set to `f'{out_dir}/visualization'` or `f'{out_dir}/predictions'`, respectively. 
| ✔️ | ✔️ | ### Model Alias diff --git a/docs/zh_cn/user_guides/inference.md b/docs/zh_cn/user_guides/inference.md index 0844bc611f..4017b32d84 100644 --- a/docs/zh_cn/user_guides/inference.md +++ b/docs/zh_cn/user_guides/inference.md @@ -223,25 +223,26 @@ result = next(result_generator) 推理器被设计用于可视化和保存预测。以下表格列出了在使用 `MMPoseInferencer` 进行推断时可用的参数列表,以及它们与 2D 和 3D 推理器的兼容性: -| 参数 | 描述 | 2D | 3D | -| ------------------------ | -------------------------------------------------------------------------------------------------------------------------- | --- | --- | -| `show` | 控制是否在弹出窗口中显示图像或视频。 | ✔️ | ✔️ | -| `radius` | 设置可视化关键点的半径。 | ✔️ | ✔️ | -| `thickness` | 确定可视化链接的厚度。 | ✔️ | ✔️ | -| `kpt_thr` | 设置关键点分数阈值。分数超过此阈值的关键点将被显示。 | ✔️ | ✔️ | -| `draw_bbox` | 决定是否显示实例的边界框。 | ✔️ | ✔️ | -| `draw_heatmap` | 决定是否绘制预测的热图。 | ✔️ | ❌ | -| `black_background` | 决定是否在黑色背景上显示预估的姿势。 | ✔️ | ❌ | -| `skeleton_style` | 设置骨架样式。可选项包括 'mmpose'(默认)和 'openpose'。 | ✔️ | ❌ | -| `use_oks_tracking` | 决定是否在追踪中使用OKS作为相似度测量。 | ❌ | ✔️ | -| `tracking_thr` | 设置追踪的相似度阈值。 | ❌ | ✔️ | -| `norm_pose_2d` | 决定是否将边界框缩放至数据集的平均边界框尺寸,并将边界框移至数据集的平均边界框中心。 | ❌ | ✔️ | -| `rebase_keypoint_height` | 决定是否将最低关键点的高度置为 0。 | ❌ | ✔️ | -| `return_vis` | 决定是否在结果中包含可视化图像。 | ✔️ | ✔️ | -| `vis_out_dir` | 定义保存可视化图像的文件夹路径。如果未设置,将不保存可视化图像。 | ✔️ | ✔️ | -| `return_datasample` | 决定是否以 `PoseDataSample` 格式返回预测。 | ✔️ | ✔️ | -| `pred_out_dir` | 指定保存预测的文件夹路径。如果未设置,将不保存预测。 | ✔️ | ✔️ | -| `out_dir` | 如果 `vis_out_dir` 或 `pred_out_dir` 未设置,它们将分别设置为 `f'{out_dir}/visualization'` 或 `f'{out_dir}/predictions'`。 | ✔️ | ✔️ | +| 参数 | 描述 | 2D | 3D | +| ------------------------- | -------------------------------------------------------------------------------------------------------------------------- | --- | --- | +| `show` | 控制是否在弹出窗口中显示图像或视频。 | ✔️ | ✔️ | +| `radius` | 设置可视化关键点的半径。 | ✔️ | ✔️ | +| `thickness` | 确定可视化链接的厚度。 | ✔️ | ✔️ | +| `kpt_thr` | 设置关键点分数阈值。分数超过此阈值的关键点将被显示。 | ✔️ | ✔️ | +| `draw_bbox` | 决定是否显示实例的边界框。 | ✔️ | ✔️ | +| `draw_heatmap` | 决定是否绘制预测的热图。 | ✔️ | ❌ | +| `black_background` | 决定是否在黑色背景上显示预估的姿势。 | ✔️ | ❌ | +| `skeleton_style` | 设置骨架样式。可选项包括 'mmpose'(默认)和 'openpose'。 | ✔️ | ❌ | +| `use_oks_tracking` | 决定是否在追踪中使用OKS作为相似度测量。 | ❌ | ✔️ | +| `tracking_thr` | 设置追踪的相似度阈值。 | ❌ | ✔️ | +| `norm_pose_2d` | 决定是否将边界框缩放至数据集的平均边界框尺寸,并将边界框移至数据集的平均边界框中心。 | ❌ | ✔️ | +| `disable_rebase_keypoint` | 决定是否将最低关键点的高度置为 0。 | ❌ | ✔️ | +| `num_instances` | 设置可视化结果中显示的实例数量。如果设置为负数,则所有实例的结果都会可视化。 | ❌ | ✔️ | +| `return_vis` | 决定是否在结果中包含可视化图像。 | ✔️ | ✔️ | +| `vis_out_dir` | 定义保存可视化图像的文件夹路径。如果未设置,将不保存可视化图像。 | ✔️ | ✔️ | +| `return_datasample` | 决定是否以 `PoseDataSample` 格式返回预测。 | ✔️ | ✔️ | +| `pred_out_dir` | 指定保存预测的文件夹路径。如果未设置,将不保存预测。 | ✔️ | ✔️ | +| `out_dir` | 如果 `vis_out_dir` 或 `pred_out_dir` 未设置,它们将分别设置为 `f'{out_dir}/visualization'` 或 `f'{out_dir}/predictions'`。 | ✔️ | ✔️ | ### 模型别名 diff --git a/mmpose/apis/inferencers/mmpose_inferencer.py b/mmpose/apis/inferencers/mmpose_inferencer.py index b44361bba8..3ec958223f 100644 --- a/mmpose/apis/inferencers/mmpose_inferencer.py +++ b/mmpose/apis/inferencers/mmpose_inferencer.py @@ -58,11 +58,11 @@ class MMPoseInferencer(BaseMMPoseInferencer): 'bbox_thr', 'nms_thr', 'bboxes', 'use_oks_tracking', 'tracking_thr', 'norm_pose_2d' } - forward_kwargs: set = {'rebase_keypoint_height'} + forward_kwargs: set = {'disable_rebase_keypoint'} visualize_kwargs: set = { 'return_vis', 'show', 'wait_time', 'draw_bbox', 'radius', 'thickness', 'kpt_thr', 'vis_out_dir', 'skeleton_style', 'draw_heatmap', - 'black_background' + 'black_background', 'num_instances' } 
postprocess_kwargs: set = {'pred_out_dir'} diff --git a/mmpose/apis/inferencers/pose3d_inferencer.py b/mmpose/apis/inferencers/pose3d_inferencer.py index 6afc70f62d..0ab7d2e64e 100644 --- a/mmpose/apis/inferencers/pose3d_inferencer.py +++ b/mmpose/apis/inferencers/pose3d_inferencer.py @@ -71,7 +71,7 @@ class Pose3DInferencer(BaseMMPoseInferencer): 'bbox_thr', 'nms_thr', 'bboxes', 'use_oks_tracking', 'tracking_thr', 'norm_pose_2d' } - forward_kwargs: set = {'rebase_keypoint_height'} + forward_kwargs: set = {'disable_rebase_keypoint'} visualize_kwargs: set = { 'return_vis', 'show', @@ -79,6 +79,7 @@ class Pose3DInferencer(BaseMMPoseInferencer): 'draw_bbox', 'radius', 'thickness', + 'num_instances', 'kpt_thr', 'vis_out_dir', } @@ -290,13 +291,13 @@ def preprocess_single(self, @torch.no_grad() def forward(self, inputs: Union[dict, tuple], - rebase_keypoint_height: bool = False): + disable_rebase_keypoint: bool = False): """Perform forward pass through the model and process the results. Args: inputs (Union[dict, tuple]): The inputs for the model. - rebase_keypoint_height (bool, optional): Flag to rebase the - height of the keypoints (z-axis). Defaults to False. + disable_rebase_keypoint (bool, optional): Flag to disable rebasing + the height of the keypoints. Defaults to False. Returns: list: A list of data samples, each containing the model's output @@ -326,7 +327,7 @@ def forward(self, keypoints[..., 2] = -keypoints[..., 2] # If rebase_keypoint_height is True, adjust z-axis values - if rebase_keypoint_height: + if not disable_rebase_keypoint: keypoints[..., 2] -= np.min( keypoints[..., 2], axis=-1, keepdims=True) @@ -420,6 +421,7 @@ def visualize(self, radius: int = 3, thickness: int = 1, kpt_thr: float = 0.3, + num_instances: int = 1, vis_out_dir: str = '', window_name: str = '', window_close_event_handler: Optional[Callable] = None @@ -480,6 +482,9 @@ def visualize(self, # thereby eliminating the issue of inference getting stuck. wait_time = 1e-5 if self._video_input else wait_time + if num_instances < 0: + num_instances = len(pred.pred_instances) + visualization = self.visualizer.add_datasample( window_name, img, @@ -489,7 +494,8 @@ def visualize(self, draw_bbox=draw_bbox, show=show, wait_time=wait_time, - kpt_thr=kpt_thr) + kpt_thr=kpt_thr, + num_instances=num_instances) results.append(visualization) if vis_out_dir: From 4233a611beba28b68f5dd28f764fc6afe4d8269a Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Fri, 21 Jul 2023 16:48:58 +0800 Subject: [PATCH 17/37] [Enhance] Combined dataset supports custom sampling ratio (#2562) --- docs/en/user_guides/mixed_datasets.md | 5 ++++ docs/zh_cn/user_guides/mixed_datasets.md | 3 +++ mmpose/datasets/dataset_wrappers.py | 25 ++++++++++++++++++- .../test_combined_dataset.py | 23 +++++++++++++++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/docs/en/user_guides/mixed_datasets.md b/docs/en/user_guides/mixed_datasets.md index f9bcc93e15..61d4cb3e34 100644 --- a/docs/en/user_guides/mixed_datasets.md +++ b/docs/en/user_guides/mixed_datasets.md @@ -84,6 +84,11 @@ dataset = dict( # The pipeline includes typical transforms, such as loading the # image and data augmentation pipeline=train_pipeline, + # The sample_ratio_factor controls the sampling ratio of + # each dataset in the combined dataset. The length of sample_ratio_factor + # should match the number of datasets. Each factor indicates the sampling + # ratio of the corresponding dataset relative to its original length. 
+ sample_ratio_factor=[1.0, 0.5] ) ``` diff --git a/docs/zh_cn/user_guides/mixed_datasets.md b/docs/zh_cn/user_guides/mixed_datasets.md index fac38e3338..d2d213f06a 100644 --- a/docs/zh_cn/user_guides/mixed_datasets.md +++ b/docs/zh_cn/user_guides/mixed_datasets.md @@ -84,6 +84,9 @@ dataset = dict( # `train_pipeline` 包含了常用的数据预处理, # 比如图片读取、数据增广等 pipeline=train_pipeline, + # sample_ratio_factor 参数是用来调节每个子数据集 + # 在组合数据集中的样本数量比例的 + sample_ratio_factor=[1.0, 0.5] ) ``` diff --git a/mmpose/datasets/dataset_wrappers.py b/mmpose/datasets/dataset_wrappers.py index 28eeac9945..553191fd43 100644 --- a/mmpose/datasets/dataset_wrappers.py +++ b/mmpose/datasets/dataset_wrappers.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from copy import deepcopy -from typing import Any, Callable, List, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union +import numpy as np from mmengine.dataset import BaseDataset from mmengine.registry import build_from_cfg @@ -18,21 +19,37 @@ class CombinedDataset(BaseDataset): metainfo (dict): The meta information of combined dataset. datasets (list): The configs of datasets to be combined. pipeline (list, optional): Processing pipeline. Defaults to []. + sample_ratio_factor (list, optional): A list of sampling ratio + factors for each dataset. Defaults to None """ def __init__(self, metainfo: dict, datasets: list, pipeline: List[Union[dict, Callable]] = [], + sample_ratio_factor: Optional[List[float]] = None, **kwargs): self.datasets = [] + self.resample = sample_ratio_factor is not None for cfg in datasets: dataset = build_from_cfg(cfg, DATASETS) self.datasets.append(dataset) self._lens = [len(dataset) for dataset in self.datasets] + if self.resample: + assert len(sample_ratio_factor) == len(datasets), f'the length ' \ + f'of `sample_ratio_factor` {len(sample_ratio_factor)} does ' \ + f'not match the length of `datasets` {len(datasets)}' + assert min(sample_ratio_factor) >= 0.0, 'the ratio values in ' \ + '`sample_ratio_factor` should not be negative.' 
+ self._lens_ori = self._lens + self._lens = [ + round(l * sample_ratio_factor[i]) + for i, l in enumerate(self._lens_ori) + ] + self._len = sum(self._lens) super(CombinedDataset, self).__init__(pipeline=pipeline, **kwargs) @@ -71,6 +88,12 @@ def _get_subset_index(self, index: int) -> Tuple[int, int]: while index >= self._lens[subset_index]: index -= self._lens[subset_index] subset_index += 1 + + if self.resample: + gap = (self._lens_ori[subset_index] - + 1e-4) / self._lens[subset_index] + index = round(gap * index + np.random.rand() * gap - 0.5) + return subset_index, index def prepare_data(self, idx: int) -> Any: diff --git a/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py b/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py index 698f1f060d..ff2e8aaec2 100644 --- a/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py +++ b/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py @@ -81,6 +81,29 @@ def test_get_subset_index(self): self.assertEqual(subset_idx, 0) self.assertEqual(sample_idx, lens[0] - 1) + # combiend dataset with resampling ratio + dataset = self.build_combined_dataset(sample_ratio_factor=[1, 0.3]) + self.assertEqual( + len(dataset), + len(dataset.datasets[0]) + round(0.3 * len(dataset.datasets[1]))) + lens = dataset._lens + + index = lens[0] + subset_idx, sample_idx = dataset._get_subset_index(index) + self.assertEqual(subset_idx, 1) + self.assertIn(sample_idx, (0, 1, 2)) + + index = -lens[1] - 1 + subset_idx, sample_idx = dataset._get_subset_index(index) + self.assertEqual(subset_idx, 0) + self.assertEqual(sample_idx, lens[0] - 1) + + with self.assertRaises(AssertionError): + _ = self.build_combined_dataset(sample_ratio_factor=[1, 0.3, 0.1]) + + with self.assertRaises(AssertionError): + _ = self.build_combined_dataset(sample_ratio_factor=[1, -0.3]) + def test_prepare_data(self): dataset = self.build_combined_dataset() lens = dataset._lens From 08bd9d7defa26683821e8e13230388d8253dfd50 Mon Sep 17 00:00:00 2001 From: Tau Date: Fri, 21 Jul 2023 18:57:23 +0800 Subject: [PATCH 18/37] [Docs] Add MultiSourceSampler docs (#2563) --- docs/en/user_guides/mixed_datasets.md | 50 ++++++++++++++++++++++++ docs/zh_cn/user_guides/mixed_datasets.md | 50 ++++++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/docs/en/user_guides/mixed_datasets.md b/docs/en/user_guides/mixed_datasets.md index 61d4cb3e34..9478ddfd1e 100644 --- a/docs/en/user_guides/mixed_datasets.md +++ b/docs/en/user_guides/mixed_datasets.md @@ -162,3 +162,53 @@ dataset = dict( ``` Additionally, the output channel number of the model should be adjusted as the number of keypoints changes. If the users aim to evaluate the model on the COCO dataset, a subset of model outputs must be chosen. This subset can be customized using the `output_keypoint_indices` argument in `test_cfg`. Users can refer to the [config file](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py), which combines the COCO and AIC dataset, for more details and use it as a template to create their custom dataset. + +## Sampling Strategy for Mixed Datasets + +When training with mixed datasets, users often encounter the problem of inconsistent data distributions between different datasets. To address this issue, we provide two different sampling strategies: + +1. Adjust the sampling ratio of each sub dataset +2. 
Adjust the ratio of each sub dataset in each batch + +### Adjust the sampling ratio of each sub dataset + +In `CombinedDataset`, we provide the `sample_ratio_factor` argument to adjust the sampling ratio of each sub dataset. + +For example: + +- If `sample_ratio_factor` is `[1.0, 0.5]`, then all data from the first sub dataset will be included in the training, and the second sub dataset will be sampled at a ratio of 0.5. +- If `sample_ratio_factor` is `[1.0, 2.0]`, then all data from the first sub dataset will be included in the training, and the second sub dataset will be sampled at a ratio of 2 times its total number. + +### Adjust the ratio of each sub dataset in each batch + +In [$MMPOSE/datasets/samplers.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/samplers.py) we provide [MultiSourceSampler](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/samplers.py#L15) to adjust the ratio of each sub dataset in each batch. + +For example: + +- If `sample_ratio_factor` is `[1.0, 0.5]`, then the data volume of the first sub dataset in each batch will be `1.0 / (1.0 + 0.5) = 66.7%`, and the data volume of the second sub dataset will be `0.5 / (1.0 + 0.5) = 33.3%`. That is, the first sub dataset will be twice as large as the second sub dataset in each batch. + +Users can set the `sampler` argument in the configuration file: + +```python +# data loaders +train_bs = 256 +train_dataloader = dict( + batch_size=train_bs, + num_workers=4, + persistent_workers=True, + sampler=dict( + type='MultiSourceSampler', + batch_size=train_bs, + # ratio of sub datasets in each batch + source_ratio=[1.0, 0.5], + shuffle=True, + round_up=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + # set sub datasets + datasets=[sub_dataset1, sub_dataset2], + pipeline=train_pipeline, + test_mode=False, + )) +``` diff --git a/docs/zh_cn/user_guides/mixed_datasets.md b/docs/zh_cn/user_guides/mixed_datasets.md index d2d213f06a..d62ace5f5d 100644 --- a/docs/zh_cn/user_guides/mixed_datasets.md +++ b/docs/zh_cn/user_guides/mixed_datasets.md @@ -160,3 +160,53 @@ dataset = dict( ``` 此外,在使用混合数据集时,由于关键点数量的变化,模型的输出通道数也要做相应调整。如果用户用混合数据集训练了模型,但是要在 COCO 数据集上评估模型,就需要从模型输出的关键点中取出一个子集来匹配 COCO 中的关键点格式。可以通过 `test_cfg` 中的 `output_keypoint_indices` 参数自定义此子集。这个 [配置文件](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) 展示了如何用 AIC 和 COCO 合并后的数据集训练模型并在 COCO 数据集上进行测试。用户可以查阅这个文件以获取更多细节,或者参考这个文件来构建新的混合数据集。 + +## 调整混合数据集采样策略 + +在混合数据集训练中,常常面临着不同数据集的数据分布不统一问题,对此我们提供了两种不同的采样策略: + +1. 调整每个子数据集的采样比例 +2. 
调整每个 batch 中每个子数据集的比例 + +### 调整每个子数据集的采样比例 + +在 `CombinedDataset` 中,我们提供了 `sample_ratio_factor` 参数来调整每个子数据集的采样比例。 + +例如: + +- 如果 `sample_ratio_factor` 为 `[1.0, 0.5]`,则第一个子数据集全部数据加入训练,第二个子数据集抽样出 0.5 加入训练。 +- 如果 `sample_ratio_factor` 为 `[1.0, 2.0]`,则第一个子数据集全部数据加入训练,第二个子数据集抽样出其总数的 2 倍加入训练。 + +### 调整每个 batch 中每个子数据集的比例 + +在 [$MMPOSE/datasets/samplers.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/samplers.py) 中,我们提供了 [MultiSourceSampler](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/samplers.py#L15) 来调整每个 batch 中每个子数据集的比例。 + +例如: + +- 如果 `sample_ratio_factor` 为 `[1.0, 0.5]`,则每个 batch 中第一个子数据集的数据量为 `1.0 / (1.0 + 0.5) = 66.7%`,第二个子数据集的数据量为 `0.5 / (1.0 + 0.5) = 33.3%`。即,第一个子数据集在 batch 中的占比为第二个子数据集的 2 倍。 + +用户可以在配置文件中通过 `sampler` 参数来进行设置: + +```python +# data loaders +train_bs = 256 +train_dataloader = dict( + batch_size=train_bs, + num_workers=4, + persistent_workers=True, + sampler=dict( + type='MultiSourceSampler', + batch_size=train_bs, + # 设置子数据集比例 + source_ratio=[1.0, 0.5], + shuffle=True, + round_up=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + # 子数据集 + datasets=[sub_dataset1, sub_dataset2], + pipeline=train_pipeline, + test_mode=False, + )) +``` From c94434b821707b08a437d45f77ce7afcc0a7e837 Mon Sep 17 00:00:00 2001 From: Tau Date: Mon, 24 Jul 2023 11:39:16 +0800 Subject: [PATCH 19/37] [Doc] Refine docs (#2564) --- docs/en/advanced_guides/customize_datasets.md | 22 +- docs/en/guide_to_framework.md | 190 ++++++++--------- docs/zh_cn/guide_to_framework.md | 199 ++++++++---------- 3 files changed, 181 insertions(+), 230 deletions(-) diff --git a/docs/en/advanced_guides/customize_datasets.md b/docs/en/advanced_guides/customize_datasets.md index 68efb1b0c2..202d23c13c 100644 --- a/docs/en/advanced_guides/customize_datasets.md +++ b/docs/en/advanced_guides/customize_datasets.md @@ -72,19 +72,15 @@ configs/_base_/datasets/custom.py An example of the dataset config is as follows. -`keypoint_info` contains the information about each keypoint. - -1. `name`: the keypoint name. The keypoint name must be unique. -2. `id`: the keypoint id. -3. `color`: (\[B, G, R\]) is used for keypoint visualization. -4. `type`: 'upper' or 'lower', will be used in data augmentation [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/b225a773d168fc2afd48cde5f76c0202d1ba2f52/mmpose/datasets/transforms/common_transforms.py#L263). -5. `swap`: indicates the 'swap pair' (also known as 'flip pair'). When applying image horizontal flip, the left part will become the right part, used in data augmentation [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94). We need to flip the keypoints accordingly. - -`skeleton_info` contains information about the keypoint connectivity, which is used for visualization. - -`joint_weights` assigns different loss weights to different keypoints. - -`sigmas` is used to calculate the OKS score. You can read [keypoints-eval](https://cocodataset.org/#keypoints-eval) to learn more about it. +- `keypoint_info` contains the information about each keypoint. + 1. `name`: the keypoint name. The keypoint name must be unique. + 2. `id`: the keypoint id. + 3. `color`: (\[B, G, R\]) is used for keypoint visualization. + 4. `type`: 'upper' or 'lower', will be used in data augmentation [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py#L263). + 5. 
`swap`: indicates the 'swap pair' (also known as 'flip pair'). When applying image horizontal flip, the left part will become the right part, used in data augmentation [RandomFlip](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py#L94). We need to flip the keypoints accordingly. +- `skeleton_info` contains information about the keypoint connectivity, which is used for visualization. +- `joint_weights` assigns different loss weights to different keypoints. +- `sigmas` is used to calculate the OKS score. You can read [keypoints-eval](https://cocodataset.org/#keypoints-eval) to learn more about it. Here is an simplified example of dataset_info config file ([full text](/configs/_base_/datasets/coco.py)). diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md index 1de2e68678..5d3b5513a6 100644 --- a/docs/en/guide_to_framework.md +++ b/docs/en/guide_to_framework.md @@ -109,9 +109,9 @@ The organization of data in MMPose contains: ### Dataset Meta Information -The meta information of a pose dataset usually includes the definition of keypoints and skeleton, symmetrical characteristic, and keypoint properties (e.g. belonging to upper or lower body, weights and sigmas). These information is important in data preprocessing, model training and evaluation. In MMpose, the dataset meta information is stored in configs files under [$MMPOSE/configs/_base_/datasets](https://github.com/open-mmlab/mmpose/tree/main/configs/_base_/datasets). +The meta information of a pose dataset usually includes the definition of keypoints and skeleton, symmetrical characteristic, and keypoint properties (e.g. belonging to upper or lower body, weights and sigmas). These information is important in data preprocessing, model training and evaluation. In MMpose, the dataset meta information is stored in configs files under [$MMPOSE/configs/\_base\_/datasets](https://github.com/open-mmlab/mmpose/tree/main/configs/_base_/datasets). -To use a custom dataset in MMPose, you need to add a new config file of the dataset meta information. Take the MPII dataset ([$MMPOSE/configs/_base_/datasets/mpii.py](https://github.com/open-mmlab/mmpose/blob/main/configs/_base_/datasets/mpii.py)) as an example. Here is its dataset information: +To use a custom dataset in MMPose, you need to add a new config file of the dataset meta information. Take the MPII dataset ([$MMPOSE/configs/\_base\_/datasets/mpii.py](https://github.com/open-mmlab/mmpose/blob/main/configs/_base_/datasets/mpii.py)) as an example. Here is its dataset information: ```Python dataset_info = dict( @@ -151,6 +151,16 @@ dataset_info = dict( ]) ``` +- `keypoint_info` contains the information about each keypoint. + 1. `name`: the keypoint name. The keypoint name must be unique. + 2. `id`: the keypoint id. + 3. `color`: (\[B, G, R\]) is used for keypoint visualization. + 4. `type`: 'upper' or 'lower', will be used in data augmentation [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py#L263). + 5. `swap`: indicates the 'swap pair' (also known as 'flip pair'). When applying image horizontal flip, the left part will become the right part, used in data augmentation [RandomFlip](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/transforms/common_transforms.py#L94). We need to flip the keypoints accordingly. +- `skeleton_info` contains information about the keypoint connectivity, which is used for visualization. 
+- `joint_weights` assigns different loss weights to different keypoints. +- `sigmas` is used to calculate the OKS score. You can read [keypoints-eval](https://cocodataset.org/#keypoints-eval) to learn more about it. + In the model config, the user needs to specify the metainfo path of the custom dataset (e.g. `$MMPOSE/configs/_base_/datasets/custom.py`) as follows: ```python @@ -196,116 +206,82 @@ Please refer to [COCO](./dataset_zoo/2d_body_keypoint.md) for more details about The bbox format in MMPose is in `xyxy` instead of `xywh`, which is consistent with the format used in other OpenMMLab projects like [MMDetection](https://github.com/open-mmlab/mmdetection). We provide useful utils for bbox format conversion, such as `bbox_xyxy2xywh`, `bbox_xywh2xyxy`, `bbox_xyxy2cs`, etc., which are defined in [$MMPOSE/mmpose/structures/bbox/transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/structures/bbox/transforms.py). -Let's take the implementation of the MPII dataset ([$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/body/mpii_dataset.py)) as an example. +Let's take the implementation of the CrowPose dataset ([$MMPOSE/mmpose/datasets/datasets/body/crowdpose_dataset.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/body/crowdpose_dataset.py)) in COCO format as an example. ```Python @DATASETS.register_module() -class MpiiDataset(BaseCocoStyleDataset): - METAINFO: dict = dict(from_file='configs/_base_/datasets/mpii.py') - - def __init__(self, - ## omitted - headbox_file: Optional[str] = None, - ## omitted - ): - - if headbox_file: - if data_mode != 'topdown': - raise ValueError( - f'{self.__class__.__name__} is set to {data_mode}: ' - 'mode, while "headbox_file" is only ' - 'supported in topdown mode.') - - if not test_mode: - raise ValueError( - f'{self.__class__.__name__} has `test_mode==False` ' - 'while "headbox_file" is only ' - 'supported when `test_mode==True`.') - - headbox_file_type = headbox_file[-3:] - allow_headbox_file_type = ['mat'] - if headbox_file_type not in allow_headbox_file_type: - raise KeyError( - f'The head boxes file type {headbox_file_type} is not ' - f'supported. Should be `mat` but got {headbox_file_type}.') - self.headbox_file = headbox_file - - super().__init__( - ## omitted - ) - - def _load_annotations(self) -> List[dict]: - """Load data from annotations in MPII format.""" - check_file_exist(self.ann_file) - with open(self.ann_file) as anno_file: - anns = json.load(anno_file) - - if self.headbox_file: - check_file_exist(self.headbox_file) - headbox_dict = loadmat(self.headbox_file) - headboxes_src = np.transpose(headbox_dict['headboxes_src'], - [2, 0, 1]) - SC_BIAS = 0.6 - - data_list = [] - ann_id = 0 - - # mpii bbox scales are normalized with factor 200. - pixel_std = 200. - - for idx, ann in enumerate(anns): - center = np.array(ann['center'], dtype=np.float32) - scale = np.array([ann['scale'], ann['scale']], - dtype=np.float32) * pixel_std - - # Adjust center/scale slightly to avoid cropping limbs - if center[0] != -1: - center[1] = center[1] + 15. 
/ pixel_std * scale[1] - - # MPII uses matlab format, index is 1-based, - # we should first convert to 0-based index - center = center - 1 - - # unify shape with coco datasets - center = center.reshape(1, -1) - scale = scale.reshape(1, -1) - bbox = bbox_cs2xyxy(center, scale) - - # load keypoints in shape [1, K, 2] and keypoints_visible in [1, K] - keypoints = np.array(ann['joints']).reshape(1, -1, 2) - keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) - - data_info = { - 'id': ann_id, - 'img_id': int(ann['image'].split('.')[0]), - 'img_path': osp.join(self.data_prefix['img'], ann['image']), - 'bbox_center': center, - 'bbox_scale': scale, - 'bbox': bbox, - 'bbox_score': np.ones(1, dtype=np.float32), - 'keypoints': keypoints, - 'keypoints_visible': keypoints_visible, - } - - if self.headbox_file: - # calculate the diagonal length of head box as norm_factor - headbox = headboxes_src[idx] - head_size = np.linalg.norm(headbox[1] - headbox[0], axis=0) - head_size *= SC_BIAS - data_info['head_size'] = head_size.reshape(1, -1) - - data_list.append(data_info) - ann_id = ann_id + 1 - - return data_list +class CrowdPoseDataset(BaseCocoStyleDataset): + """CrowdPose dataset for pose estimation. + + "CrowdPose: Efficient Crowded Scenes Pose Estimation and + A New Benchmark", CVPR'2019. + More details can be found in the `paper + `__. + + CrowdPose keypoints:: + + 0: 'left_shoulder', + 1: 'right_shoulder', + 2: 'left_elbow', + 3: 'right_elbow', + 4: 'left_wrist', + 5: 'right_wrist', + 6: 'left_hip', + 7: 'right_hip', + 8: 'left_knee', + 9: 'right_knee', + 10: 'left_ankle', + 11: 'right_ankle', + 12: 'top_head', + 13: 'neck' + + Args: + ann_file (str): Annotation file path. Default: ''. + bbox_file (str, optional): Detection result file path. If + ``bbox_file`` is set, detected bboxes loaded from this file will + be used instead of ground-truth bboxes. This setting is only for + evaluation, i.e., ignored when ``test_mode`` is ``False``. + Default: ``None``. + data_mode (str): Specifies the mode of data samples: ``'topdown'`` or + ``'bottomup'``. In ``'topdown'`` mode, each data sample contains + one instance; while in ``'bottomup'`` mode, each data sample + contains all instances in a image. Default: ``'topdown'`` + metainfo (dict, optional): Meta information for dataset, such as class + information. Default: ``None``. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Default: ``None``. + data_prefix (dict, optional): Prefix for training data. Default: + ``dict(img=None, ann=None)``. + filter_cfg (dict, optional): Config for filter data. Default: `None`. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Default: ``None`` which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. + Default: ``True``. + pipeline (list, optional): Processing pipeline. Default: []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Default: ``False``. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=False``. Default: ``False``. 
+ max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Default: 1000. + """ + + METAINFO: dict = dict(from_file='configs/_base_/datasets/crowdpose.py') ``` -When supporting MPII dataset, since we need to use `head_size` to calculate `PCKh`, we add `headbox_file` to `__init__()` and override`_load_annotations()`. +For COCO-style datasets, we only need to inherit from [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) and specify `METAINFO`, then the dataset class is ready to use. -To support a dataset that is beyond the scope of [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py), you may need to subclass from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to the [documents](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. +More details about using custom datasets can be found in [Customize Datasets](./advanced_guides/customize_datasets.md). ```{note} -If you wish to customize a new dataset, you can refer to [Customize Datasets](./advanced_guides/customize_datasets.md) for more details. +If you wish to inherit from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to this [documents](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. ``` ### Pipeline @@ -344,7 +320,7 @@ Here is a diagram to show the workflow of data transformation among the three sc ![tour_en](https://github.com/open-mmlab/mmpose/assets/13503330/e82710e6-4181-4eb0-8185-7075b43dbec3) -In MMPose, the modules used for data transformation are under `[$MMPOSE/mmpose/datasets/transforms](https://github.com/open-mmlab/mmpose/tree/main/mmpose/datasets/transforms)`, and their workflow is shown as follows: +In MMPose, the modules used for data transformation are under [$MMPOSE/mmpose/datasets/transforms](https://github.com/open-mmlab/mmpose/tree/main/mmpose/datasets/transforms), and their workflow is shown as follows: ![transforms-en](https://user-images.githubusercontent.com/13503330/187190352-a7662346-b8da-4256-9192-c7a84b15cbb5.png) @@ -467,7 +443,7 @@ In MMPose 1.0, the model consists of the following components: - **Head**: used to implement the core algorithm and loss function -We define a base class `BasePoseEstimator` for the model in [$MMPOSE/models/pose_estimators/base.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/base.py). All models, e.g. `TopdownPoseEstimator`, should inherit from this base class and override the corresponding methods. +We define a base class [BasePoseEstimator](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/pose_estimators/base.py) for the model in [$MMPOSE/models/pose_estimators/base.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/base.py). All models, e.g. [TopdownPoseEstimator](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/pose_estimators/topdown.py), should inherit from this base class and override the corresponding methods. 
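As a quick illustration of this pattern, below is a minimal, hypothetical sketch (the class name and method bodies are placeholders, and the head calls simply mirror what `TopdownPoseEstimator` does) showing how a custom estimator could be registered and override the mode-specific methods:

```python
# Hypothetical example for illustration only; not part of MMPose.
from mmpose.models.pose_estimators.base import BasePoseEstimator
from mmpose.registry import MODELS


@MODELS.register_module()
class MyPoseEstimator(BasePoseEstimator):
    """Toy estimator that only demonstrates the subclassing pattern."""

    def loss(self, inputs, data_samples):
        # training: extract features and let the head compute the losses
        feats = self.extract_feat(inputs)
        return self.head.loss(feats, data_samples, train_cfg=self.train_cfg)

    def predict(self, inputs, data_samples):
        # inference: extract features and let the head decode predictions
        feats = self.extract_feat(inputs)
        return self.head.predict(feats, data_samples, test_cfg=self.test_cfg)
```

Once registered, the class can be referenced in a config via `type='MyPoseEstimator'`, just like the built-in estimators.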
Three modes are provided in `forward()` of the estimator: @@ -569,7 +545,7 @@ Neck is usually a module between Backbone and Head, which is used in some algori - Feature Map Processor (FMP) - The `FeatureMapProcessor` is a flexible PyTorch module designed to transform the feature outputs generated by backbones into a format suitable for heads. It achieves this by utilizing non-parametric operations such as selecting, concatenating, and rescaling. Below are some examples along with their corresponding configurations: + The [FeatureMapProcessor](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/necks/fmap_proc_neck.py) is a flexible PyTorch module designed to transform the feature outputs generated by backbones into a format suitable for heads. It achieves this by utilizing non-parametric operations such as selecting, concatenating, and rescaling. Below are some examples along with their corresponding configurations: - Select operation diff --git a/docs/zh_cn/guide_to_framework.md b/docs/zh_cn/guide_to_framework.md index a440b4871d..f9f85c8e61 100644 --- a/docs/zh_cn/guide_to_framework.md +++ b/docs/zh_cn/guide_to_framework.md @@ -97,25 +97,25 @@ mmpose ## Step1:配置文件 -在MMPose中,我们通常 python 格式的配置文件,用于整个项目的定义、参数管理,因此我们强烈建议第一次接触 MMPose 的开发者,查阅 [配置文件](./user_guides/configs.md) 学习配置文件的定义。 +在MMPose中,我们通常 python 格式的配置文件,用于整个项目的定义、参数管理,因此我们强烈建议第一次接触 MMPose 的开发者,查阅 [【用户教程 - 如何看懂配置文件】](./user_guides/configs.md) 学习配置文件的定义。 -需要注意的是,所有新增的模块都需要使用注册器(Registry)进行注册,并在对应目录的 `__init__.py` 中进行 `import`,以便能够使用配置文件构建其实例。 +需要注意的是,所有新增的模块都需要使用注册器进行注册,并在对应目录的 `__init__.py` 中进行 `import`,以便能够使用配置文件构建其实例。 ## Step2:数据 MMPose 数据的组织主要包含三个方面: -- 数据集元信息 +- 数据集元信息(meta info) -- 数据集 +- 数据集(dataset) -- 数据流水线 +- 数据流水线(pipeline) ### 数据集元信息 元信息指具体标注之外的数据集信息。姿态估计数据集的元信息通常包括:关键点和骨骼连接的定义、对称性、关键点性质(如关键点权重、标注标准差、所属上下半身)等。这些信息在数据在数据处理、模型训练和测试中有重要作用。在 MMPose 中,数据集的元信息使用 python 格式的配置文件保存,位于 [$MMPOSE/configs/_base_/datasets](https://github.com/open-mmlab/mmpose/tree/main/configs/_base_/datasets) 目录下。 -在 MMPose 中使用自定义数据集时,你需要增加对应的元信息配置文件。以 MPII 数据集([$MMPOSE/configs/_base_/datasets/mpii.py](https://github.com/open-mmlab/mmpose/blob/main/configs/_base_/datasets/mpii.py))为例: +在 MMPose 中使用自定义数据集时,你需要增加对应的元信息配置文件。以 MPII 数据集([$MMPOSE/configs/\_base\_/datasets/mpii.py](https://github.com/open-mmlab/mmpose/blob/main/configs/_base_/datasets/mpii.py))为例: ```Python dataset_info = dict( @@ -155,7 +155,19 @@ dataset_info = dict( ]) ``` -在模型配置文件中,你需要为自定义数据集指定对应的元信息配置文件。假如该元信息配置文件路径为 `$MMPOSE/configs/_base_/datasets/custom.py`,指定方式如下: +在这份元信息配置文件中: + +- `keypoint_info`:每个关键点的信息: + 1. `name`: 关键点名称,必须是唯一的,例如 `nose`、`left_eye` 等。 + 2. `id`: 关键点 ID,必须是唯一的,从 0 开始。 + 3. `color`: 关键点可视化时的颜色,以 (\[B, G, R\]) 格式组织起来,用于可视化。 + 4. `type`: 关键点类型,可以是 `upper`、`lower` 或 `''`,用于数据增强 [RandomHalfBody](https://github.com/open-mmlab/mmpose/blob/b225a773d168fc2afd48cde5f76c0202d1ba2f52/mmpose/datasets/transforms/common_transforms.py#L263)。 + 5. 
`swap`: 关键点交换关系,用于水平翻转数据增强 [RandomFlip](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/common_transforms.py#L94)。 +- `skeleton_info`:骨架连接关系,用于可视化。 +- `joint_weights`:每个关键点的权重,用于损失函数计算。 +- `sigma`:标准差,用于计算 OKS 分数,详细信息请参考 [keypoints-eval](https://cocodataset.org/#keypoints-eval)。 + +在模型配置文件中,你需要为自定义数据集指定对应的元信息配置文件。假如该元信息配置文件路径为 `$MMPOSE/configs/\_base\_/datasets/custom.py`,指定方式如下: ```python # dataset and dataloader settings @@ -197,115 +209,82 @@ MMPose 中的大部分 2D 关键点数据集**以 COCO 形式组织**,为此 在 MMPose 中 bbox 的数据格式采用 `xyxy`,而不是 `xywh`,这与 [MMDetection](https://github.com/open-mmlab/mmdetection) 等其他 OpenMMLab 成员保持一致。为了实现不同 bbox 格式之间的转换,我们提供了丰富的函数:`bbox_xyxy2xywh`、`bbox_xywh2xyxy`、`bbox_xyxy2cs`等。这些函数定义在 [$MMPOSE/mmpose/structures/bbox/transforms.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/structures/bbox/transforms.py)。 -下面我们以MPII数据集的实现([$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/body/mpii_dataset.py))为例: +下面我们以 COCO 格式标注的 CrowdPose 数据集的实现([$MMPOSE/mmpose/datasets/datasets/body/crowdpose_dataset.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/body/crowdpose_dataset.py))为例: ```Python @DATASETS.register_module() -class MpiiDataset(BaseCocoStyleDataset): - METAINFO: dict = dict(from_file='configs/_base_/datasets/mpii.py') +class CrowdPoseDataset(BaseCocoStyleDataset): + """CrowdPose dataset for pose estimation. + + "CrowdPose: Efficient Crowded Scenes Pose Estimation and + A New Benchmark", CVPR'2019. + More details can be found in the `paper + `__. + + CrowdPose keypoints:: + + 0: 'left_shoulder', + 1: 'right_shoulder', + 2: 'left_elbow', + 3: 'right_elbow', + 4: 'left_wrist', + 5: 'right_wrist', + 6: 'left_hip', + 7: 'right_hip', + 8: 'left_knee', + 9: 'right_knee', + 10: 'left_ankle', + 11: 'right_ankle', + 12: 'top_head', + 13: 'neck' + + Args: + ann_file (str): Annotation file path. Default: ''. + bbox_file (str, optional): Detection result file path. If + ``bbox_file`` is set, detected bboxes loaded from this file will + be used instead of ground-truth bboxes. This setting is only for + evaluation, i.e., ignored when ``test_mode`` is ``False``. + Default: ``None``. + data_mode (str): Specifies the mode of data samples: ``'topdown'`` or + ``'bottomup'``. In ``'topdown'`` mode, each data sample contains + one instance; while in ``'bottomup'`` mode, each data sample + contains all instances in a image. Default: ``'topdown'`` + metainfo (dict, optional): Meta information for dataset, such as class + information. Default: ``None``. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Default: ``None``. + data_prefix (dict, optional): Prefix for training data. Default: + ``dict(img=None, ann=None)``. + filter_cfg (dict, optional): Config for filter data. Default: `None`. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Default: ``None`` which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. + Default: ``True``. + pipeline (list, optional): Processing pipeline. Default: []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Default: ``False``. + lazy_init (bool, optional): Whether to load annotation during + instantiation. 
In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=False``. Default: ``False``. + max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Default: 1000. + """ - def __init__(self, - ## 内容省略 - headbox_file: Optional[str] = None, - ## 内容省略): - - if headbox_file: - if data_mode != 'topdown': - raise ValueError( - f'{self.__class__.__name__} is set to {data_mode}: ' - 'mode, while "headbox_file" is only ' - 'supported in topdown mode.') - - if not test_mode: - raise ValueError( - f'{self.__class__.__name__} has `test_mode==False` ' - 'while "headbox_file" is only ' - 'supported when `test_mode==True`.') - - headbox_file_type = headbox_file[-3:] - allow_headbox_file_type = ['mat'] - if headbox_file_type not in allow_headbox_file_type: - raise KeyError( - f'The head boxes file type {headbox_file_type} is not ' - f'supported. Should be `mat` but got {headbox_file_type}.') - self.headbox_file = headbox_file - - super().__init__( - ## 内容省略 - ) - - def _load_annotations(self) -> List[dict]: - """Load data from annotations in MPII format.""" - check_file_exist(self.ann_file) - with open(self.ann_file) as anno_file: - anns = json.load(anno_file) - - if self.headbox_file: - check_file_exist(self.headbox_file) - headbox_dict = loadmat(self.headbox_file) - headboxes_src = np.transpose(headbox_dict['headboxes_src'], - [2, 0, 1]) - SC_BIAS = 0.6 - - data_list = [] - ann_id = 0 - - # mpii bbox scales are normalized with factor 200. - pixel_std = 200. - - for idx, ann in enumerate(anns): - center = np.array(ann['center'], dtype=np.float32) - scale = np.array([ann['scale'], ann['scale']], - dtype=np.float32) * pixel_std - - # Adjust center/scale slightly to avoid cropping limbs - if center[0] != -1: - center[1] = center[1] + 15. 
/ pixel_std * scale[1] - - # MPII uses matlab format, index is 1-based, - # we should first convert to 0-based index - center = center - 1 - - # unify shape with coco datasets - center = center.reshape(1, -1) - scale = scale.reshape(1, -1) - bbox = bbox_cs2xyxy(center, scale) - - # load keypoints in shape [1, K, 2] and keypoints_visible in [1, K] - keypoints = np.array(ann['joints']).reshape(1, -1, 2) - keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) - - data_info = { - 'id': ann_id, - 'img_id': int(ann['image'].split('.')[0]), - 'img_path': osp.join(self.data_prefix['img'], ann['image']), - 'bbox_center': center, - 'bbox_scale': scale, - 'bbox': bbox, - 'bbox_score': np.ones(1, dtype=np.float32), - 'keypoints': keypoints, - 'keypoints_visible': keypoints_visible, - } - - if self.headbox_file: - # calculate the diagonal length of head box as norm_factor - headbox = headboxes_src[idx] - head_size = np.linalg.norm(headbox[1] - headbox[0], axis=0) - head_size *= SC_BIAS - data_info['head_size'] = head_size.reshape(1, -1) - - data_list.append(data_info) - ann_id = ann_id + 1 - - return data_list + METAINFO: dict = dict(from_file='configs/_base_/datasets/crowdpose.py') ``` -在对MPII数据集进行支持时,由于MPII需要读入 `head_size` 信息来计算 `PCKh`,因此我们在 `__init__()` 中增加了 `headbox_file`,并重载了 `_load_annotations()` 来完成数据组织。 +对于使用 COCO 格式标注的数据集,只需要继承 [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) 并指定 `METAINFO`,就可以十分轻松地集成到 MMPose 中参与训练。 -如果自定义数据集无法被 [BaseCocoStyleDataset](https://github.com/open-mmlab/mmpose/blob/main/mmpose/datasets/datasets/base/base_coco_style_dataset.py) 支持,你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html)。 +更多自定义数据集的使用方式,请前往 [【进阶教程 - 自定义数据集】](./advanced_guides/customize_datasets.md)。 ```{note} -如果你想自定义数据集,请参考 [自定义数据集](./advanced_guides/customize_datasets.md)。 +如果你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) ``` ### 数据流水线 @@ -344,7 +323,7 @@ test_pipeline = [ ![tour_cn](https://github.com/open-mmlab/mmpose/assets/13503330/4c989d86-e824-49ea-9ba8-b3978548db37) -在MMPose中,数据变换所需要的模块在 `[$MMPOSE/mmpose/datasets/transforms](https://github.com/open-mmlab/mmpose/tree/main/mmpose/datasets/transforms)` 目录下,它们的工作流程如图所示: +在MMPose中,数据变换所需要的模块在 [$MMPOSE/mmpose/datasets/transforms](https://github.com/open-mmlab/mmpose/tree/main/mmpose/datasets/transforms) 目录下,它们的工作流程如图所示: ![transforms-cn](https://user-images.githubusercontent.com/13503330/187831611-8db89e20-95c7-42bc-8b0d-700fadf60328.png) @@ -479,7 +458,7 @@ def get_pose_data_sample(self): - **预测头(Head)**:用于实现核心算法功能和损失函数定义 -我们在 [$MMPOSE/models/pose_estimators/base.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/base.py) 下为姿态估计模型定义了一个基类 `BasePoseEstimator`,所有的模型(如 `TopdownPoseEstimator`)都需要继承这个基类,并重载对应的方法。 +我们在 [$MMPOSE/mmpose/models/pose_estimators/base.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/pose_estimators/base.py) 下为姿态估计模型定义了一个基类 [BasePoseEstimator](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/pose_estimators/base.py),所有的模型(如 [TopdownPoseEstimator](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/pose_estimators/topdown.py))都需要继承这个基类,并重载对应的方法。 在模型的 `forward()` 方法中提供了三种不同的模式: @@ -581,7 +560,7 @@ MMPose 中 Neck 相关的模块定义在 
[$MMPOSE/mmpose/models/necks](https://g - Feature Map Processor (FMP) - `FeatureMapProcessor` 是一个通用的 PyTorch 模块,旨在通过选择、拼接和缩放等非参数变换将主干网络输出的特征图转换成适合预测头的格式。以下是一些操作的配置方式及效果示意图: + [FeatureMapProcessor](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/necks/fmap_proc_neck.py) 是一个通用的 PyTorch 模块,旨在通过选择、拼接和缩放等非参数变换将主干网络输出的特征图转换成适合预测头的格式。以下是一些操作的配置方式及效果示意图: - 选择操作 From 85831b8ae0a869041aca898d29b04018edc8db26 Mon Sep 17 00:00:00 2001 From: Xin Li <7219519+xin-li-67@users.noreply.github.com> Date: Mon, 24 Jul 2023 19:11:30 +0800 Subject: [PATCH 20/37] [Feature][MMSIG] Add UniFormer Pose Estimation to Projects folder (#2501) --- projects/uniformer/README.md | 138 ++++ ...hm_uniformer-b-8xb128-210e_coco-256x192.py | 135 ++++ ...-hm_uniformer-b-8xb32-210e_coco-384x288.py | 134 ++++ ...-hm_uniformer-b-8xb32-210e_coco-448x320.py | 134 ++++ ...hm_uniformer-s-8xb128-210e_coco-256x192.py | 17 + ...hm_uniformer-s-8xb128-210e_coco-384x288.py | 23 + ...-hm_uniformer-s-8xb64-210e_coco-448x320.py | 22 + projects/uniformer/models/__init__.py | 1 + projects/uniformer/models/uniformer.py | 709 ++++++++++++++++++ 9 files changed, 1313 insertions(+) create mode 100644 projects/uniformer/README.md create mode 100644 projects/uniformer/configs/td-hm_uniformer-b-8xb128-210e_coco-256x192.py create mode 100644 projects/uniformer/configs/td-hm_uniformer-b-8xb32-210e_coco-384x288.py create mode 100644 projects/uniformer/configs/td-hm_uniformer-b-8xb32-210e_coco-448x320.py create mode 100644 projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-256x192.py create mode 100644 projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-384x288.py create mode 100644 projects/uniformer/configs/td-hm_uniformer-s-8xb64-210e_coco-448x320.py create mode 100644 projects/uniformer/models/__init__.py create mode 100644 projects/uniformer/models/uniformer.py diff --git a/projects/uniformer/README.md b/projects/uniformer/README.md new file mode 100644 index 0000000000..6f166f975e --- /dev/null +++ b/projects/uniformer/README.md @@ -0,0 +1,138 @@ +# Pose Estion with UniFormer + +This project implements a topdown heatmap based human pose estimator, utilizing the approach outlined in **UniFormer: Unifying Convolution and Self-attention for Visual Recognition** (TPAMI 2023) and **UniFormer: Unified Transformer for Efficient Spatiotemporal Representation Learning** (ICLR 2022). + +
+ +
+ +## Usage + +### Preparation + +1. Setup Development Environment + +- Python 3.7 or higher +- PyTorch 1.6 or higher +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.6.0 or higher +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 or higher +- [MMDetection](https://github.com/open-mmlab/mmdetection) v3.0.0rc6 or higher +- [MMPose](https://github.com/open-mmlab/mmpose) v1.0.0rc1 or higher + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. **In `uniformer/` root directory**, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +2. Download Pretrained Weights + +To either run inferences or train on the `uniformer pose estimation` project, you have to download the original Uniformer pretrained weights on the ImageNet1k dataset and the weights trained for the downstream pose estimation task. The original ImageNet1k weights are hosted on SenseTime's [huggingface repository](https://huggingface.co/Sense-X/uniformer_image), and the downstream pose estimation task weights are hosted either on Google Drive or Baiduyun. We have uploaded them to the OpenMMLab download URLs, allowing users to use them without burden. For example, you can take a look at [`td-hm_uniformer-b-8xb128-210e_coco-256x192.py`](./configs/td-hm_uniformer-b-8xb128-210e_coco-256x192.py#62), the corresponding pretrained weight URL is already here and when the training or testing process starts, the weight will be automatically downloaded to your device. For the downstream task weights, you can get their URLs from the [benchmark result table](#results). + +### Inference + +We have provided a [inferencer_demo.py](../../demo/inferencer_demo.py) with which developers can utilize to run quick inference demos. Here is a basic demonstration: + +```shell +python demo/inferencer_demo.py $INPUTS \ + --pose2d $CONFIG --pose2d-weights $CHECKPOINT \ + [--show] [--vis-out-dir $VIS_OUT_DIR] [--pred-out-dir $PRED_OUT_DIR] +``` + +For more information on using the inferencer, please see [this document](https://mmpose.readthedocs.io/en/latest/user_guides/inference.html#out-of-the-box-inferencer). + +Here's an example code: + +```shell +python demo/inferencer_demo.py tests/data/coco/000000000785.jpg \ + --pose2d projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-256x192.py \ + --pose2d-weights https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_256x192_global_small-d4a7fdac_20230724.pth \ + --vis-out-dir vis_results +``` + +Then you will find the demo result in `vis_results` folder, and it may be similar to this: + +
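Alternatively, the same inference can be run from Python. The sketch below uses the high-level `MMPoseInferencer` API described in the inference document linked above, with the same config and checkpoint as the CLI example (treat it as a sketch rather than the canonical usage):

```python
from mmpose.apis import MMPoseInferencer

# same config and checkpoint as the CLI example above
inferencer = MMPoseInferencer(
    pose2d='projects/uniformer/configs/'
    'td-hm_uniformer-s-8xb128-210e_coco-256x192.py',
    pose2d_weights='https://download.openmmlab.com/mmpose/v1/projects/'
    'uniformer/top_down_256x192_global_small-d4a7fdac_20230724.pth')

# the inferencer returns a generator; consuming it runs the inference
result_generator = inferencer(
    'tests/data/coco/000000000785.jpg', vis_out_dir='vis_results')
result = next(result_generator)
```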
+ +### Training and Testing + +1. Data Preparation + +Prepare the COCO dataset according to the [instruction](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#coco). + +2. To Train and Test with Single GPU: + +```shell +python tools/test.py $CONFIG --auto-scale-lr +``` + +```shell +python tools/test.py $CONFIG $CHECKPOINT +``` + +3. To Train and Test with Multiple GPUs: + +```shell +bash tools/dist_train.sh $CONFIG $NUM_GPUs --amp +``` + +```shell +bash tools/dist_test.sh $CONFIG $CHECKPOINT $NUM_GPUs --amp +``` + +## Results + +Here is the testing results on COCO val2017: + +| Model | Input Size | AP | AP50 | AP75 | AR | AR50 | Download | +| :-----------------------------------------------------------------: | :--------: | :--: | :-------------: | :-------------: | :--: | :-------------: | :--------------------------------------------------------------------: | +| [UniFormer-S](./configs/td-hm_uniformer-s-8xb128-210e_coco-256x192.py) | 256x192 | 74.0 | 90.2 | 82.1 | 79.5 | 94.1 | [model](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_256x192_global_small-d4a7fdac_20230724.pth) \| [log](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_256x192_global_small-d4a7fdac_20230724.log.json) | +| [UniFormer-S](./configs/td-hm_uniformer-s-8xb128-210e_coco-384x288.py) | 384x288 | 75.9 | 90.6 | 83.0 | 81.0 | 94.3 | [model](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_384x288_global_small-7a613f78_20230724.pth) \| [log](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_384x288_global_small-7a613f78_20230724.log.json) | +| [UniFormer-S](./configs/td-hm_uniformer-s-8xb64-210e_coco-448x320.py) | 448x320 | 76.2 | 90.6 | 83.2 | 81.4 | 94.4 | [model](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_448x320_global_small-18b760de_20230724.pth) \| [log](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_448x320_global_small-18b760de_20230724.log.json) | +| [UniFormer-B](./configs/td-hm_uniformer-b-8xb128-210e_coco-256x192.py) | 256x192 | 75.0 | 90.5 | 83.0 | 80.4 | 94.2 | [model](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_256x192_global_base-1713bcd4_20230724.pth) \| [log](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_256x192_global_base-1713bcd4_20230724.log.json) | +| [UniFormer-B](./configs/td-hm_uniformer-b-8xb32-210e_coco-384x288.py) | 384x288 | 76.7 | 90.8 | 84.1 | 81.9 | 94.6 | [model](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_384x288_global_base-c650da38_20230724.pth) \| [log](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_384x288_global_base-c650da38_20230724.log.json) | +| [UniFormer-B](./configs/td-hm_uniformer-b-8xb32-210e_coco-448x320.py) | 448x320 | 77.4 | 91.0 | 84.4 | 82.5 | 94.9 | [model](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_448x320_global_base-a05c185f_20230724.pth) \| [log](https://download.openmmlab.com/mmpose/v1/projects/uniformer/top_down_448x320_global_base-a05c185f_20230724.log.json) | + +Here is the testing results on COCO val 2017 from the official UniFormer Pose Estimation repository for comparison: + +| Backbone | Input Size | AP | AP50 | AP75 | ARM | ARL | AR | Model | Log | +| :---------- | :--------- | :--- | :-------------- | :-------------- | :------------- | :------------- | :--- | :-------------------------------------------------------- | :------------------------------------------------------- | +| 
UniFormer-S | 256x192 | 74.0 | 90.3 | 82.2 | 66.8 | 76.7 | 79.5 | [google](https://drive.google.com/file/d/162R0JuTpf3gpLe1IK6oxRoQK7JSj4ylx/view?usp=sharing) | [google](https://drive.google.com/file/d/15j40u97Db6TA2gMHdn0yFEsDFb5SMBy4/view?usp=sharing) | +| UniFormer-S | 384x288 | 75.9 | 90.6 | 83.4 | 68.6 | 79.0 | 81.4 | [google](https://drive.google.com/file/d/163vuFkpcgVOthC05jCwjGzo78Nr0eikW/view?usp=sharing) | [google](https://drive.google.com/file/d/15X9M_5cq9RQMgs64Yn9YvV5k5f0zOBHo/view?usp=sharing) | +| UniFormer-S | 448x320 | 76.2 | 90.6 | 83.2 | 68.6 | 79.4 | 81.4 | [google](https://drive.google.com/file/d/165nQRsT58SXJegcttksHwDn46Fme5dGX/view?usp=sharing) | [google](https://drive.google.com/file/d/15IJjSWp4R5OybMdV2CZEUx_TwXdTMOee/view?usp=sharing) | +| UniFormer-B | 256x192 | 75.0 | 90.6 | 83.0 | 67.8 | 77.7 | 80.4 | [google](https://drive.google.com/file/d/15tzJaRyEzyWp2mQhpjDbBzuGoyCaJJ-2/view?usp=sharing) | [google](https://drive.google.com/file/d/15jJyTPcJKj_id0PNdytloqt7yjH2M8UR/view?usp=sharing) | +| UniFormer-B | 384x288 | 76.7 | 90.8 | 84.0 | 69.3 | 79.7 | 81.4 | [google](https://drive.google.com/file/d/15qtUaOR_C7-vooheJE75mhA9oJQt3gSx/view?usp=sharing) | [google](https://drive.google.com/file/d/15L1Uxo_uRSMlGnOvWzAzkJLKX6Qh_xNw/view?usp=sharing) | +| UniFormer-B | 448x320 | 77.4 | 91.1 | 84.4 | 70.2 | 80.6 | 82.5 | [google](https://drive.google.com/file/d/156iNxetiCk8JJz41aFDmFh9cQbCaMk3D/view?usp=sharing) | [google](https://drive.google.com/file/d/15aRpZc2Tie5gsn3_l-aXto1MrC9wyzMC/view?usp=sharing) | + +Note: + +1. All the original models are pretrained on ImageNet-1K without Token Labeling and Layer Scale, as mentioned in the [official README](https://github.com/Sense-X/UniFormer/tree/main/pose_estimation) . The official team has confirmed that **Token labeling can largely improve the performance of the downstream tasks**. Developers can utilize the implementation by themselves. +2. The original implementation did not include the **freeze BN in the backbone**. The official team has confirmed that this can improve the performance as well. +3. To avoid running out of memory, developers can use `torch.utils.checkpoint` in the `config.py` by setting `use_checkpoint=True` and `checkpoint_num=[0, 0, 2, 0] # index for using checkpoint in every stage` +4. We warmly welcome any contributions if you can successfully reproduce the results from the paper! 
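As a concrete sketch of note 3 above (the field names follow the configs under `projects/uniformer/configs/`; adjust the per-stage numbers to your memory budget), enabling activation checkpointing could look like this:

```python
# Sketch: trade compute for memory by checkpointing backbone blocks.
_base_ = ['./td-hm_uniformer-b-8xb128-210e_coco-256x192.py']

model = dict(
    backbone=dict(
        use_checkpoint=True,          # turn on torch.utils.checkpoint
        checkpoint_num=[0, 0, 2, 0],  # blocks to checkpoint in each stage
    ))
```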
+ +## Citation + +If this project benefits your work, please kindly consider citing the original papers: + +```bibtex +@misc{li2022uniformer, + title={UniFormer: Unifying Convolution and Self-attention for Visual Recognition}, + author={Kunchang Li and Yali Wang and Junhao Zhang and Peng Gao and Guanglu Song and Yu Liu and Hongsheng Li and Yu Qiao}, + year={2022}, + eprint={2201.09450}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +```bibtex +@misc{li2022uniformer, + title={UniFormer: Unified Transformer for Efficient Spatiotemporal Representation Learning}, + author={Kunchang Li and Yali Wang and Peng Gao and Guanglu Song and Yu Liu and Hongsheng Li and Yu Qiao}, + year={2022}, + eprint={2201.04676}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/projects/uniformer/configs/td-hm_uniformer-b-8xb128-210e_coco-256x192.py b/projects/uniformer/configs/td-hm_uniformer-b-8xb128-210e_coco-256x192.py new file mode 100644 index 0000000000..07f1377842 --- /dev/null +++ b/projects/uniformer/configs/td-hm_uniformer-b-8xb128-210e_coco-256x192.py @@ -0,0 +1,135 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +custom_imports = dict(imports='projects.uniformer.models') + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# enable DDP training when pretrained model is used +find_unused_parameters = True + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', interval=5)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='UniFormer', + embed_dims=[64, 128, 320, 512], + depths=[5, 8, 20, 7], + head_dim=64, + drop_path_rate=0.4, + use_checkpoint=False, # whether use torch.utils.checkpoint + use_window=False, # whether use window MHRA + use_hybrid=False, # whether use hybrid MHRA + init_cfg=dict( + # Set the path to pretrained backbone here + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'uniformer/uniformer_base_in1k.pth' # noqa + )), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=17, + final_layer=dict(kernel_size=1), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=256, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/uniformer/configs/td-hm_uniformer-b-8xb32-210e_coco-384x288.py b/projects/uniformer/configs/td-hm_uniformer-b-8xb32-210e_coco-384x288.py new file mode 100644 index 0000000000..d43061d0cd --- /dev/null +++ b/projects/uniformer/configs/td-hm_uniformer-b-8xb32-210e_coco-384x288.py @@ -0,0 +1,134 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +custom_imports = dict(imports='projects.uniformer.models') + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# enable DDP training when pretrained model is used +find_unused_parameters = True + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='UniFormer', + embed_dims=[64, 128, 320, 512], + depths=[5, 8, 20, 7], + head_dim=64, + drop_path_rate=0.4, + use_checkpoint=False, # whether use torch.utils.checkpoint + use_window=False, # whether use window MHRA + use_hybrid=False, # whether use hybrid MHRA + init_cfg=dict( + # Set the path to pretrained backbone here + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'uniformer/uniformer_base_in1k.pth' # noqa + )), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=17, + final_layer=dict(kernel_size=1), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = 
[ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=256, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/uniformer/configs/td-hm_uniformer-b-8xb32-210e_coco-448x320.py b/projects/uniformer/configs/td-hm_uniformer-b-8xb32-210e_coco-448x320.py new file mode 100644 index 0000000000..81554ad27e --- /dev/null +++ b/projects/uniformer/configs/td-hm_uniformer-b-8xb32-210e_coco-448x320.py @@ -0,0 +1,134 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +custom_imports = dict(imports='projects.uniformer.models') + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# enable DDP training when pretrained model is used +find_unused_parameters = True + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(320, 448), heatmap_size=(80, 112), sigma=3) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='UniFormer', + embed_dims=[64, 128, 320, 512], + depths=[5, 8, 20, 7], + head_dim=64, + drop_path_rate=0.55, + use_checkpoint=False, # whether use torch.utils.checkpoint + use_window=False, # whether use window MHRA + use_hybrid=False, # whether use hybrid MHRA + init_cfg=dict( + # Set the path to pretrained backbone here + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'uniformer/uniformer_base_in1k.pth' # noqa + )), + head=dict( + 
type='HeatmapHead', + in_channels=512, + out_channels=17, + final_layer=dict(kernel_size=1), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=256, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-256x192.py b/projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-256x192.py new file mode 100644 index 0000000000..54994893dd --- /dev/null +++ b/projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-256x192.py @@ -0,0 +1,17 @@ +_base_ = ['./td-hm_uniformer-b-8xb128-210e_coco-256x192.py'] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +model = dict( + backbone=dict( + depths=[3, 4, 8, 3], + drop_path_rate=0.2, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'uniformer/uniformer_small_in1k.pth' # noqa + ))) + +train_dataloader = dict(batch_size=32) +val_dataloader = dict(batch_size=256) diff --git a/projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-384x288.py b/projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-384x288.py new file mode 100644 index 0000000000..59f68946ef --- /dev/null +++ b/projects/uniformer/configs/td-hm_uniformer-s-8xb128-210e_coco-384x288.py @@ -0,0 +1,23 @@ +_base_ = ['./td-hm_uniformer-b-8xb32-210e_coco-384x288.py'] + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +model = dict( + backbone=dict( + depths=[3, 4, 8, 3], + drop_path_rate=0.2, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'uniformer/uniformer_small_in1k.pth' 
# noqa + ))) + +train_dataloader = dict(batch_size=128) +val_dataloader = dict(batch_size=256) diff --git a/projects/uniformer/configs/td-hm_uniformer-s-8xb64-210e_coco-448x320.py b/projects/uniformer/configs/td-hm_uniformer-s-8xb64-210e_coco-448x320.py new file mode 100644 index 0000000000..0359ac6d63 --- /dev/null +++ b/projects/uniformer/configs/td-hm_uniformer-s-8xb64-210e_coco-448x320.py @@ -0,0 +1,22 @@ +_base_ = ['./td-hm_uniformer-b-8xb32-210e_coco-448x320.py'] + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1.0e-3, +)) + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +model = dict( + backbone=dict( + depths=[3, 4, 8, 3], + drop_path_rate=0.2, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'uniformer/uniformer_small_in1k.pth'))) + +train_dataloader = dict(batch_size=64) +val_dataloader = dict(batch_size=256) diff --git a/projects/uniformer/models/__init__.py b/projects/uniformer/models/__init__.py new file mode 100644 index 0000000000..6256db6f45 --- /dev/null +++ b/projects/uniformer/models/__init__.py @@ -0,0 +1 @@ +from .uniformer import * # noqa diff --git a/projects/uniformer/models/uniformer.py b/projects/uniformer/models/uniformer.py new file mode 100644 index 0000000000..cea36f061b --- /dev/null +++ b/projects/uniformer/models/uniformer.py @@ -0,0 +1,709 @@ +from collections import OrderedDict +from functools import partial +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn.bricks.transformer import build_dropout +from mmengine.model import BaseModule +from mmengine.model.weight_init import trunc_normal_ +from mmengine.runner import checkpoint, load_checkpoint +from mmengine.utils import to_2tuple + +from mmpose.models.backbones.base_backbone import BaseBackbone +from mmpose.registry import MODELS +from mmpose.utils import get_root_logger + + +class Mlp(BaseModule): + """Multilayer perceptron. + + Args: + in_features (int): Number of input features. + hidden_features (int): Number of hidden features. + Defaults to None. + out_features (int): Number of output features. + Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + in_features: int, + hidden_features: int = None, + out_features: int = None, + drop_rate: float = 0., + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop_rate) + + def forward(self, x): + x = self.act(self.fc1(x)) + x = self.fc2(self.drop(x)) + x = self.drop(x) + return x + + +class CMlp(BaseModule): + """Multilayer perceptron via convolution. + + Args: + in_features (int): Number of input features. + hidden_features (int): Number of hidden features. + Defaults to None. + out_features (int): Number of output features. + Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__(self, + in_features: int, + hidden_features: int = None, + out_features: int = None, + drop_rate: float = 0., + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1) + self.act = nn.GELU() + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) + self.drop = nn.Dropout(drop_rate) + + def forward(self, x): + x = self.act(self.fc1(x)) + x = self.fc2(self.drop(x)) + x = self.drop(x) + return x + + +class CBlock(BaseModule): + """Convolution Block. + + Args: + embed_dim (int): Number of input features. + mlp_ratio (float): Ratio of mlp hidden dimension + to embedding dimension. Defaults to 4. + drop (float): Dropout rate. + Defaults to 0.0. + drop_paths (float): Stochastic depth rates. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dim: int, + mlp_ratio: float = 4., + drop_rate: float = 0., + drop_path_rate: float = 0., + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.pos_embed = nn.Conv2d( + embed_dim, embed_dim, 3, padding=1, groups=embed_dim) + self.norm1 = nn.BatchNorm2d(embed_dim) + self.conv1 = nn.Conv2d(embed_dim, embed_dim, 1) + self.conv2 = nn.Conv2d(embed_dim, embed_dim, 1) + self.attn = nn.Conv2d( + embed_dim, embed_dim, 5, padding=2, groups=embed_dim) + # NOTE: drop path for stochastic depth, we shall see if this is + # better than dropout here + self.drop_path = build_dropout( + dict(type='DropPath', drop_prob=drop_path_rate) + ) if drop_path_rate > 0. else nn.Identity() + self.norm2 = nn.BatchNorm2d(embed_dim) + mlp_hidden_dim = int(embed_dim * mlp_ratio) + self.mlp = CMlp( + in_features=embed_dim, + hidden_features=mlp_hidden_dim, + drop_rate=drop_rate) + + def forward(self, x): + x = x + self.pos_embed(x) + x = x + self.drop_path( + self.conv2(self.attn(self.conv1(self.norm1(x))))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Attention(BaseModule): + """Self-Attention. + + Args: + embed_dim (int): Number of input features. + num_heads (int): Number of attention heads. + Defaults to 8. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + attn_drop_rate (float): Attention dropout rate. + Defaults to 0.0. + proj_drop_rate (float): Dropout rate. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
+ """ + + def __init__(self, + embed_dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + qk_scale: float = None, + attn_drop_rate: float = 0., + proj_drop_rate: float = 0., + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.num_heads = num_heads + head_dim = embed_dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually + # to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(embed_dim, embed_dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dim, embed_dim) + self.proj_drop = nn.Dropout(proj_drop_rate) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + Args: + img_size (int): Number of input size. + Defaults to 224. + patch_size (int): Number of patch size. + Defaults to 16. + in_channels (int): Number of input features. + Defaults to 3. + embed_dims (int): Number of output features. + Defaults to 768. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * ( + img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.norm = nn.LayerNorm(embed_dim) + self.proj = nn.Conv2d( + in_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, _, H, W = x.shape + x = self.proj(x) + B, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + return x + + +class SABlock(BaseModule): + """Self-Attention Block. + + Args: + embed_dim (int): Number of input features. + num_heads (int): Number of attention heads. + mlp_ratio (float): Ratio of mlp hidden dimension + to embedding dimension. Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + attn_drop (float): Attention dropout rate. Defaults to 0.0. + drop_paths (float): Stochastic depth rates. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__(self, + embed_dim: int, + num_heads: int, + mlp_ratio: float = 4., + qkv_bias: bool = False, + qk_scale: float = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + + self.pos_embed = nn.Conv2d( + embed_dim, embed_dim, 3, padding=1, groups=embed_dim) + self.norm1 = nn.LayerNorm(embed_dim) + self.attn = Attention( + embed_dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=drop_rate) + # NOTE: drop path for stochastic depth, + # we shall see if this is better than dropout here + self.drop_path = build_dropout( + dict(type='DropPath', drop_prob=drop_path_rate) + ) if drop_path_rate > 0. else nn.Identity() + self.norm2 = nn.LayerNorm(embed_dim) + mlp_hidden_dim = int(embed_dim * mlp_ratio) + self.mlp = Mlp( + in_features=embed_dim, + hidden_features=mlp_hidden_dim, + drop_rate=drop_rate) + + def forward(self, x): + x = x + self.pos_embed(x) + B, N, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + x = x.transpose(1, 2).reshape(B, N, H, W) + return x + + +class WindowSABlock(BaseModule): + """Self-Attention Block. + + Args: + embed_dim (int): Number of input features. + num_heads (int): Number of attention heads. + window_size (int): Size of the partition window. Defaults to 14. + mlp_ratio (float): Ratio of mlp hidden dimension + to embedding dimension. Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + attn_drop (float): Attention dropout rate. Defaults to 0.0. + drop_paths (float): Stochastic depth rates. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dim: int, + num_heads: int, + window_size: int = 14, + mlp_ratio: float = 4., + qkv_bias: bool = False, + qk_scale: float = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.windows_size = window_size + self.pos_embed = nn.Conv2d( + embed_dim, embed_dim, 3, padding=1, groups=embed_dim) + self.norm1 = nn.LayerNorm(embed_dim) + self.attn = Attention( + embed_dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=drop_rate) + # NOTE: drop path for stochastic depth, + # we shall see if this is better than dropout here + self.drop_path = build_dropout( + dict(type='DropPath', drop_prob=drop_path_rate) + ) if drop_path_rate > 0. 
else nn.Identity() + # self.norm2 = build_dropout(norm_cfg, embed_dims)[1] + self.norm2 = nn.LayerNorm(embed_dim) + mlp_hidden_dim = int(embed_dim * mlp_ratio) + self.mlp = Mlp( + in_features=embed_dim, + hidden_features=mlp_hidden_dim, + drop_rate=drop_rate) + + def window_reverse(self, windows, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + window_size = self.window_size + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + def window_partition(self, x): + """ + Args: + x: (B, H, W, C) + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + window_size = self.window_size + x = x.view(B, H // window_size, window_size, W // window_size, + window_size, C) + windows = x.permute(0, 1, 3, 2, 4, + 5).contiguous().view(-1, window_size, window_size, + C) + return windows + + def forward(self, x): + x = x + self.pos_embed(x) + x = x.permute(0, 2, 3, 1) + B, H, W, C = x.shape + shortcut = x + x = self.norm1(x) + + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, H_pad, W_pad, _ = x.shape + + x_windows = self.window_partition( + x) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, + C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + x = self.window_reverse(attn_windows, H_pad, W_pad) # B H' W' C + + # reverse cyclic shift + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + x = x.permute(0, 3, 1, 2).reshape(B, C, H, W) + return x + + +@MODELS.register_module() +class UniFormer(BaseBackbone): + """The implementation of Uniformer with downstream pose estimation task. + + UniFormer: Unifying Convolution and Self-attention for Visual Recognition + https://arxiv.org/abs/2201.09450 + UniFormer: Unified Transformer for Efficient Spatiotemporal Representation + Learning https://arxiv.org/abs/2201.04676 + + Args: + depths (List[int]): number of block in each layer. + Default to [3, 4, 8, 3]. + img_size (int, tuple): input image size. Default: 224. + in_channels (int): number of input channels. Default: 3. + num_classes (int): number of classes for classification head. Default + to 80. + embed_dims (List[int]): embedding dimensions. + Default to [64, 128, 320, 512]. + head_dim (int): dimension of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool, optional): if True, add a learnable bias to query, key, + value. Default: True + qk_scale (float | None, optional): override default qk scale of + head_dim ** -0.5 if set. Default: None. + representation_size (Optional[int]): enable and set representation + layer (pre-logits) to this value if set + drop_rate (float): dropout rate. Default: 0. + attn_drop_rate (float): attention dropout rate. Default: 0. + drop_path_rate (float): stochastic depth rate. Default: 0. 
+ norm_layer (nn.Module): normalization layer + use_checkpoint (bool): whether use torch.utils.checkpoint + checkpoint_num (list): index for using checkpoint in every stage + use_windows (bool): whether use window MHRA + use_hybrid (bool): whether use hybrid MHRA + window_size (int): size of window (>14). Default: 14. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__( + self, + depths: List[int] = [3, 4, 8, 3], + img_size: int = 224, + in_channels: int = 3, + num_classes: int = 80, + embed_dims: List[int] = [64, 128, 320, 512], + head_dim: int = 64, + mlp_ratio: int = 4., + qkv_bias: bool = True, + qk_scale: float = None, + representation_size: Optional[int] = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), + use_checkpoint: bool = False, + checkpoint_num=(0, 0, 0, 0), + use_window: bool = False, + use_hybrid: bool = False, + window_size: int = 14, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ] + ) -> None: + super(UniFormer, self).__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.checkpoint_num = checkpoint_num + self.use_window = use_window + self.logger = get_root_logger() + self.logger.info(f'Use torch.utils.checkpoint: {self.use_checkpoint}') + self.logger.info( + f'torch.utils.checkpoint number: {self.checkpoint_num}') + self.num_features = self.embed_dims = embed_dims + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + + self.patch_embed1 = PatchEmbed( + img_size=img_size, + patch_size=4, + in_channels=in_channels, + embed_dim=embed_dims[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, + patch_size=2, + in_channels=embed_dims[0], + embed_dim=embed_dims[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, + patch_size=2, + in_channels=embed_dims[1], + embed_dim=embed_dims[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, + patch_size=2, + in_channels=embed_dims[2], + embed_dim=embed_dims[3]) + + self.drop_after_pos = nn.Dropout(drop_rate) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + num_heads = [dim // head_dim for dim in embed_dims] + self.blocks1 = nn.ModuleList([ + CBlock( + embed_dim=embed_dims[0], + mlp_ratio=mlp_ratio, + drop_rate=drop_rate, + drop_path_rate=dpr[i]) for i in range(depths[0]) + ]) + self.norm1 = norm_layer(embed_dims[0]) + self.blocks2 = nn.ModuleList([ + CBlock( + embed_dim=embed_dims[1], + mlp_ratio=mlp_ratio, + drop_rate=drop_rate, + drop_path_rate=dpr[i + depths[0]]) for i in range(depths[1]) + ]) + self.norm2 = norm_layer(embed_dims[1]) + if self.use_window: + self.logger.info('Use local window for all blocks in stage3') + self.blocks3 = nn.ModuleList([ + WindowSABlock( + embed_dim=embed_dims[2], + num_heads=num_heads[2], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[i + depths[0] + depths[1]]) + for i in range(depths[2]) + ]) + elif use_hybrid: + self.logger.info('Use hybrid window for blocks in stage3') + block3 = [] + for i in range(depths[2]): + if (i + 1) % 4 == 0: + block3.append( + SABlock( + embed_dim=embed_dims[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, 
+ qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[i + depths[0] + depths[1]])) + else: + block3.append( + WindowSABlock( + embed_dim=embed_dims[2], + num_heads=num_heads[2], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[i + depths[0] + depths[1]])) + self.blocks3 = nn.ModuleList(block3) + else: + self.logger.info('Use global window for all blocks in stage3') + self.blocks3 = nn.ModuleList([ + SABlock( + embed_dim=embed_dims[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[i + depths[0] + depths[1]]) + for i in range(depths[2]) + ]) + self.norm3 = norm_layer(embed_dims[2]) + self.blocks4 = nn.ModuleList([ + SABlock( + embed_dim=embed_dims[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[i + depths[0] + depths[1] + depths[2]]) + for i in range(depths[3]) + ]) + self.norm4 = norm_layer(embed_dims[3]) + + # Representation layer + if representation_size: + self.num_features = representation_size + self.pre_logits = nn.Sequential( + OrderedDict([('fc', nn.Linear(embed_dims, + representation_size)), + ('act', nn.Tanh())])) + else: + self.pre_logits = nn.Identity() + + self.apply(self._init_weights) + self.init_weights() + + def init_weights(self): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + pretrained = self.init_cfg['checkpoint'] + load_checkpoint( + self, + pretrained, + map_location='cpu', + strict=False, + logger=self.logger) + self.logger.info(f'Load pretrained model from {pretrained}') + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear( + self.embed_dims, + num_classes) if num_classes > 0 else nn.Identity() + + def forward(self, x): + out = [] + x = self.patch_embed1(x) + x = self.drop_after_pos(x) + for i, blk in enumerate(self.blocks1): + if self.use_checkpoint and i < self.checkpoint_num[0]: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_out = self.norm1(x.permute(0, 2, 3, 1)) + out.append(x_out.permute(0, 3, 1, 2).contiguous()) + x = self.patch_embed2(x) + for i, blk in enumerate(self.blocks2): + if self.use_checkpoint and i < self.checkpoint_num[1]: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_out = self.norm2(x.permute(0, 2, 3, 1)) + out.append(x_out.permute(0, 3, 1, 2).contiguous()) + x = self.patch_embed3(x) + for i, blk in enumerate(self.blocks3): + if self.use_checkpoint and i < self.checkpoint_num[2]: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_out = self.norm3(x.permute(0, 2, 3, 1)) + out.append(x_out.permute(0, 3, 1, 2).contiguous()) + x = self.patch_embed4(x) + for i, 
blk in enumerate(self.blocks4): + if self.use_checkpoint and i < self.checkpoint_num[3]: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_out = self.norm4(x.permute(0, 2, 3, 1)) + out.append(x_out.permute(0, 3, 1, 2).contiguous()) + return tuple(out) From 2e700f479aa54ddddd75b3d6c70365684d8f68ac Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Mon, 24 Jul 2023 19:14:48 +0800 Subject: [PATCH 21/37] [Fix] Check the compatibility of inferencer's input/output (#2567) --- mmpose/apis/inferencers/pose2d_inferencer.py | 9 +++++++++ mmpose/apis/inferencers/pose3d_inferencer.py | 9 +++++++++ .../test_apis/test_inferencers/test_pose2d_inferencer.py | 4 ++++ 3 files changed, 22 insertions(+) diff --git a/mmpose/apis/inferencers/pose2d_inferencer.py b/mmpose/apis/inferencers/pose2d_inferencer.py index 3f1f20fdc0..99b079d529 100644 --- a/mmpose/apis/inferencers/pose2d_inferencer.py +++ b/mmpose/apis/inferencers/pose2d_inferencer.py @@ -307,6 +307,15 @@ def __call__( else: inputs = self._inputs_to_list(inputs) + # check the compatibility between inputs/outputs + if not self._video_input and len(inputs) > 0: + vis_out_dir = visualize_kwargs.get('vis_out_dir', None) + if vis_out_dir is not None: + _, file_extension = os.path.splitext(vis_out_dir) + assert not file_extension, f'the argument `vis_out_dir` ' \ + f'should be a folder while the input contains multiple ' \ + f'images, but got {vis_out_dir}' + forward_kwargs['bbox_thr'] = preprocess_kwargs.get('bbox_thr', -1) inputs = self.preprocess( inputs, batch_size=batch_size, **preprocess_kwargs) diff --git a/mmpose/apis/inferencers/pose3d_inferencer.py b/mmpose/apis/inferencers/pose3d_inferencer.py index 0ab7d2e64e..472f43bee2 100644 --- a/mmpose/apis/inferencers/pose3d_inferencer.py +++ b/mmpose/apis/inferencers/pose3d_inferencer.py @@ -392,6 +392,15 @@ def __call__( else: inputs = self._inputs_to_list(inputs) + # check the compatibility between inputs/outputs + if not self._video_input and len(inputs) > 0: + vis_out_dir = visualize_kwargs.get('vis_out_dir', None) + if vis_out_dir is not None: + _, file_extension = os.path.splitext(vis_out_dir) + assert not file_extension, f'the argument `vis_out_dir` ' \ + f'should be a folder while the input contains multiple ' \ + f'images, but got {vis_out_dir}' + inputs = self.preprocess( inputs, batch_size=batch_size, **preprocess_kwargs) diff --git a/tests/test_apis/test_inferencers/test_pose2d_inferencer.py b/tests/test_apis/test_inferencers/test_pose2d_inferencer.py index b59232efac..be00527ff1 100644 --- a/tests/test_apis/test_inferencers/test_pose2d_inferencer.py +++ b/tests/test_apis/test_inferencers/test_pose2d_inferencer.py @@ -144,6 +144,10 @@ def test_call(self): self.assertSequenceEqual(results1['predictions'][0][0]['keypoints'], results3['predictions'][3][0]['keypoints']) + with self.assertRaises(AssertionError): + for res in inferencer(inputs, vis_out_dir=f'{tmp_dir}/1.jpg'): + pass + # `inputs` is path to a video inputs = 'tests/data/posetrack18/videos/000001_mpiinew_test/' \ '000001_mpiinew_test.mp4' From d57b679e13ff1c60d29c8053446ee66dfb5f92a3 Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Mon, 24 Jul 2023 19:24:29 +0800 Subject: [PATCH 22/37] [Fix]Fix 3d visualization (#2565) --- demo/body3d_pose_lifter_demo.py | 2 + mmpose/visualization/local_visualizer_3d.py | 80 +++++++++++++-------- 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/demo/body3d_pose_lifter_demo.py b/demo/body3d_pose_lifter_demo.py index 72e7b93958..b5c19e8916 100644 --- 
a/demo/body3d_pose_lifter_demo.py +++ b/demo/body3d_pose_lifter_demo.py @@ -305,6 +305,8 @@ def process_one_image(args, detector, frame, frame_idx, pose_estimator, data_sample=pred_3d_data_samples, det_data_sample=det_data_sample, draw_gt=False, + dataset_2d=pose_det_dataset['type'], + dataset_3d=pose_lift_dataset['type'], show=args.show, draw_bbox=True, kpt_thr=args.kpt_thr, diff --git a/mmpose/visualization/local_visualizer_3d.py b/mmpose/visualization/local_visualizer_3d.py index 7e3462ce79..99d8086a1e 100644 --- a/mmpose/visualization/local_visualizer_3d.py +++ b/mmpose/visualization/local_visualizer_3d.py @@ -9,6 +9,7 @@ from mmengine.dist import master_only from mmengine.structures import InstanceData +from mmpose.apis import convert_keypoint_definition from mmpose.registry import VISUALIZERS from mmpose.structures import PoseDataSample from . import PoseLocalVisualizer @@ -74,18 +75,17 @@ def __init__( self.det_dataset_skeleton = det_dataset_skeleton self.det_dataset_link_color = det_dataset_link_color - def _draw_3d_data_samples( - self, - image: np.ndarray, - pose_samples: PoseDataSample, - draw_gt: bool = True, - kpt_thr: float = 0.3, - num_instances=-1, - axis_azimuth: float = 70, - axis_limit: float = 1.7, - axis_dist: float = 10.0, - axis_elev: float = 15.0, - ): + def _draw_3d_data_samples(self, + image: np.ndarray, + pose_samples: PoseDataSample, + draw_gt: bool = True, + kpt_thr: float = 0.3, + num_instances=-1, + axis_azimuth: float = 70, + axis_limit: float = 1.7, + axis_dist: float = 10.0, + axis_elev: float = 15.0, + scores_2d: Optional[np.ndarray] = None): """Draw keypoints and skeletons (optional) of GT or prediction. Args: @@ -109,6 +109,8 @@ def _draw_3d_data_samples( - y: [y_c - axis_limit/2, y_c + axis_limit/2] - z: [0, axis_limit] Where x_c, y_c is the mean value of x and y coordinates + scores_2d (np.ndarray, optional): Keypoint scores of 2d estimation + that will be used to filter 3d instances. Returns: Tuple(np.ndarray): the drawn image which channel is RGB. 
@@ -145,20 +147,21 @@ def _draw_3d_data_samples( def _draw_3d_instances_kpts(keypoints, scores, + scores_2d, keypoints_visible, fig_idx, title=None): - for idx, (kpts, score, visible) in enumerate( - zip(keypoints, scores, keypoints_visible)): + for idx, (kpts, score, score_2d) in enumerate( + zip(keypoints, scores, scores_2d)): - valid = np.logical_and(score >= kpt_thr, + valid = np.logical_and(score >= kpt_thr, score_2d >= kpt_thr, np.any(~np.isnan(kpts), axis=-1)) + kpts_valid = kpts[valid] ax = fig.add_subplot( 1, num_fig, fig_idx * (idx + 1), projection='3d') ax.view_init(elev=axis_elev, azim=axis_azimuth) - ax.set_zlim3d([0, axis_limit]) ax.set_aspect('auto') ax.set_xticks([]) ax.set_yticks([]) @@ -171,13 +174,14 @@ def _draw_3d_instances_kpts(keypoints, ax.set_title(f'{title} ({idx})') ax.dist = axis_dist - x_c = np.mean(kpts[valid, 0]) if valid.any() else 0 - y_c = np.mean(kpts[valid, 1]) if valid.any() else 0 + x_c = np.mean(kpts_valid[:, 0]) if valid.any() else 0 + y_c = np.mean(kpts_valid[:, 1]) if valid.any() else 0 + z_c = np.mean(kpts_valid[:, 2]) if valid.any() else 0 ax.set_xlim3d([x_c - axis_limit / 2, x_c + axis_limit / 2]) ax.set_ylim3d([y_c - axis_limit / 2, y_c + axis_limit / 2]) - - kpts = np.array(kpts, copy=False) + ax.set_zlim3d( + [min(0, z_c - axis_limit / 2), z_c + axis_limit / 2]) if self.kpt_color is None or isinstance(self.kpt_color, str): kpt_color = [self.kpt_color] * len(kpts) @@ -189,8 +193,7 @@ def _draw_3d_instances_kpts(keypoints, f'({len(self.kpt_color)}) does not matches ' f'that of keypoints ({len(kpts)})') - kpts = kpts[valid] - x_3d, y_3d, z_3d = np.split(kpts[:, :3], [1, 2], axis=1) + x_3d, y_3d, z_3d = np.split(kpts_valid[:, :3], [1, 2], axis=1) kpt_color = kpt_color[valid][..., ::-1] / 255. @@ -218,7 +221,9 @@ def _draw_3d_instances_kpts(keypoints, ys_3d = kpts[sk_indices, 1] zs_3d = kpts[sk_indices, 2] kpt_score = score[sk_indices] - if kpt_score.min() > kpt_thr: + kpt_score_2d = score_2d[sk_indices] + if kpt_score.min() > kpt_thr and kpt_score_2d.min( + ) > kpt_thr: # matplotlib uses RGB color in [0, 1] value range _color = link_color[sk_id][::-1] / 255. ax.plot( @@ -233,13 +238,16 @@ def _draw_3d_instances_kpts(keypoints, else: scores = np.ones(keypoints.shape[:-1]) + if scores_2d is None: + scores_2d = np.ones(keypoints.shape[:-1]) + if 'keypoints_visible' in pred_instances: keypoints_visible = pred_instances.keypoints_visible else: keypoints_visible = np.ones(keypoints.shape[:-1]) - _draw_3d_instances_kpts(keypoints, scores, keypoints_visible, 1, - 'Prediction') + _draw_3d_instances_kpts(keypoints, scores, scores_2d, + keypoints_visible, 1, 'Prediction') if draw_gt and 'gt_instances' in pose_samples: gt_instances = pose_samples.gt_instances @@ -300,6 +308,7 @@ def _draw_instances_kpts(self, self.set_image(image) img_h, img_w, _ = image.shape + scores = None if 'keypoints' in instances: keypoints = instances.get('transformed_keypoints', @@ -452,7 +461,7 @@ def _draw_instances_kpts(self, self.draw_lines( X, Y, color, line_widths=self.line_width) - return self.get_image() + return self.get_image(), scores @master_only def add_datasample(self, @@ -466,6 +475,8 @@ def add_datasample(self, draw_bbox: bool = False, show_kpt_idx: bool = False, skeleton_style: str = 'mmpose', + dataset_2d: str = 'CocoDataset', + dataset_3d: str = 'Human36mDataset', num_instances: int = -1, show: bool = False, wait_time: float = 0, @@ -502,6 +513,10 @@ def add_datasample(self, Defaults to ``False`` skeleton_style (str): Skeleton style selection. 
Defaults to ``'mmpose'`` + dataset_2d (str): Name of 2d keypoint dataset. Defaults to + ``'CocoDataset'`` + dataset_3d (str): Name of 3d keypoint dataset. Defaults to + ``'Human36mDataset'`` num_instances (int): Number of instances to be shown in 3D. If smaller than 0, all the instances in the pose_result will be shown. Otherwise, pad or truncate the pose_result to a length @@ -517,24 +532,31 @@ def add_datasample(self, det_img_data = None gt_img_data = None + scores_2d = None if draw_2d: det_img_data = image.copy() # draw bboxes & keypoints if 'pred_instances' in det_data_sample: - det_img_data = self._draw_instances_kpts( + det_img_data, scores_2d = self._draw_instances_kpts( det_img_data, det_data_sample.pred_instances, kpt_thr, show_kpt_idx, skeleton_style) if draw_bbox: det_img_data = self._draw_instances_bbox( det_img_data, det_data_sample.pred_instances) - + if scores_2d is not None: + if scores_2d.ndim == 2: + scores_2d = scores_2d[..., None] + scores_2d = np.squeeze( + convert_keypoint_definition(scores_2d, dataset_2d, dataset_3d), + axis=-1) pred_img_data = self._draw_3d_data_samples( image.copy(), data_sample, draw_gt=draw_gt, - num_instances=num_instances) + num_instances=num_instances, + scores_2d=scores_2d) # merge visualization results if det_img_data is not None and gt_img_data is not None: From e3c5c558faad7abb1d06fc9cc549c58771ab6330 Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Mon, 24 Jul 2023 21:03:54 +0800 Subject: [PATCH 23/37] [Feature] Add bear example in just dance (#2568) --- model-index.yml | 1 + projects/just_dance/app.py | 12 ++++- projects/just_dance/process_video.py | 68 ++++++++++++++++++++-------- 3 files changed, 60 insertions(+), 21 deletions(-) diff --git a/model-index.yml b/model-index.yml index 498e5bc743..71c752b5f2 100644 --- a/model-index.yml +++ b/model-index.yml @@ -16,6 +16,7 @@ Import: - configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.yml - configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.yml - configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.yml +- configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.yml - configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml - configs/body_2d_keypoint/simcc/coco/resnet_coco.yml - configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml diff --git a/projects/just_dance/app.py b/projects/just_dance/app.py index 9b40c64fdd..6213ed3663 100644 --- a/projects/just_dance/app.py +++ b/projects/just_dance/app.py @@ -57,6 +57,12 @@ def process_video( os.system( f'wget -O {project_path}/resources/tsinghua_30fps.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/tsinghua_30fps.mp4' # noqa ) +os.system( + f'wget -O {project_path}/resources/student1.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/student1.mp4' # noqa +) +os.system( + f'wget -O {project_path}/resources/bear_teacher.mp4 https://download.openmmlab.com/mmpose/v1/projects/just_dance/bear_teacher.mp4' # noqa +) with gr.Blocks() as demo: with gr.Tab('Upload-Video'): @@ -66,13 +72,15 @@ def process_video( student_video = gr.Video(type='mp4') gr.Examples([ os.path.join(project_path, 'resources/tom.mp4'), - os.path.join(project_path, 'resources/tsinghua_30fps.mp4') + os.path.join(project_path, 'resources/tsinghua_30fps.mp4'), + os.path.join(project_path, 'resources/student1.mp4') ], student_video) with gr.Column(): gr.Markdown('Teacher Video') teacher_video = gr.Video(type='mp4') gr.Examples([ - os.path.join(project_path, 'resources/idol_producer.mp4') + os.path.join(project_path, 'resources/idol_producer.mp4'), + 
os.path.join(project_path, 'resources/bear_teacher.mp4') ], teacher_video) button = gr.Button('Grading', variant='primary') diff --git a/projects/just_dance/process_video.py b/projects/just_dance/process_video.py index 7f1d48b922..32326f36bd 100644 --- a/projects/just_dance/process_video.py +++ b/projects/just_dance/process_video.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import os import tempfile +from typing import Optional import cv2 import mmcv @@ -24,32 +25,50 @@ from utils import (blend_images, convert_video_fps, get_smoothed_kpt, resize_image_to_fixed_height) -det_config = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - 'configs/rtmdet-nano_one-person.py') -det_weights = 'https://download.openmmlab.com/mmpose/v1/projects/' \ - 'rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth' +model_cfg = dict( + human=dict( + model='rtmpose-t_8xb256-420e_aic-coco-256x192', + det_model=os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'configs/rtmdet-nano_one-person.py'), + det_weights='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth', + ), + bear=dict( + model='rtmpose-l_8xb256-420e_humanart-256x192', + det_model='rtmdet-m', + det_cat_ids=77, + ), +) class VideoProcessor: """A class to process videos for pose estimation and visualization.""" + def __init__(self): + self.category = 'human' + + def _set_category(self, category): + assert category in model_cfg + self.category = category + @property def pose_estimator(self) -> Pose2DInferencer: if not hasattr(self, '_pose_estimator'): - self._pose_estimator = Pose2DInferencer( - 'rtmpose-t_8xb256-420e_aic-coco-256x192', - det_model=det_config, - det_weights=det_weights) - self._pose_estimator.model.test_cfg['flip_test'] = False - return self._pose_estimator + self._pose_estimator = dict() + if self.category not in self._pose_estimator: + self._pose_estimator[self.category] = Pose2DInferencer( + **(model_cfg[self.category])) + self._pose_estimator[ + self.category].model.test_cfg['flip_test'] = False + return self._pose_estimator[self.category] @property def visualizer(self) -> PoseLocalVisualizer: if hasattr(self, '_visualizer'): return self._visualizer elif hasattr(self, '_pose_estimator'): - return self._pose_estimator.visualizer + return self.pose_estimator.visualizer # init visualizer self._visualizer = PoseLocalVisualizer() @@ -109,11 +128,16 @@ def get_keypoints_from_video(self, video: str) -> np.ndarray: video_reader = mmcv.VideoReader(video) - if video_reader.fps != 30: + if abs(video_reader.fps - 30) > 0.1: video_reader = mmcv.VideoReader(convert_video_fps(video)) - assert video_reader.fps == 30, f'only support videos with 30 FPS, ' \ - f'but the video {video_fname} has {video_reader.fps} fps' + assert abs(video_reader.fps - 30) < 0.1, f'only support videos with ' \ + f'30 FPS, but the video {video_fname} has {video_reader.fps} fps' + + if os.path.basename(video_fname).startswith('bear'): + self._set_category('bear') + else: + self._set_category('human') keypoints_list = [] for i, frame in enumerate(video_reader): keypoints = self.get_keypoints_from_frame(frame) @@ -123,7 +147,10 @@ def get_keypoints_from_video(self, video: str) -> np.ndarray: return keypoints @torch.no_grad() - def run(self, tch_video: str, stu_video: str): + def run(self, + tch_video: str, + stu_video: str, + output_file: Optional[str] = None): # extract human poses tch_kpts = self.get_keypoints_from_video(tch_video) stu_kpts = 
self.get_keypoints_from_video(stu_video) @@ -137,8 +164,9 @@ def run(self, tch_video: str, stu_video: str): # output tch_name = os.path.basename(tch_video).rsplit('.', 1)[0] stu_name = os.path.basename(stu_video).rsplit('.', 1)[0] - fname = f'{tch_name}-{stu_name}.mp4' - output_file = os.path.join(tempfile.mkdtemp(), fname) + if output_file is None: + fname = f'{tch_name}-{stu_name}.mp4' + output_file = os.path.join(tempfile.mkdtemp(), fname) return self.generate_output_video(tch_video, stu_video, output_file, tch_kpts, stu_kpts, piece_info) @@ -223,7 +251,9 @@ def generate_output_video(self, tch_video: str, stu_video: str, parser = ArgumentParser() parser.add_argument('teacher_video', help='Path to the Teacher Video') parser.add_argument('student_video', help='Path to the Student Video') + parser.add_argument( + '--output-file', help='Path to save the output Video', default=None) args = parser.parse_args() processor = VideoProcessor() - processor.run(args.teacher_video, args.student_video) + processor.run(args.teacher_video, args.student_video, args.output_file) From 3a9f1f3d3bc6e8239ab157ed405a271453fbaf1e Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Tue, 25 Jul 2023 12:48:29 +0800 Subject: [PATCH 24/37] [Doc] Add example and openxlab link for just dance (#2571) --- projects/README.md | 6 +++++- projects/just_dance/README.md | 12 ++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/projects/README.md b/projects/README.md index 1089af8194..f6f38bb15c 100644 --- a/projects/README.md +++ b/projects/README.md @@ -54,7 +54,11 @@ We also provide some documentation listed below to help you get started:
-- **[💃Just-Dance](./just_dance)**: Enhancing Dance scoring system for comparing dance performances in videos +- **[💃Just-Dance](./just_dance)**: Enhancing Dance scoring system for comparing dance performances in videos. + + TRY IT NOW + +
diff --git a/projects/just_dance/README.md b/projects/just_dance/README.md index 70390215f9..385ef03005 100644 --- a/projects/just_dance/README.md +++ b/projects/just_dance/README.md @@ -1,10 +1,18 @@ # Just Dance - A Simple Implementation + + + Try it on OpenXLab + + + This project presents a dance scoring system based on RTMPose. Users can compare the similarity between two dancers in different videos: one referred to as the "teacher video" and the other as the "student video." -Here is an example of the output dance comparison: +Here are examples of the output dance comparison: + + -![output](https://github.com/open-mmlab/mmpose/assets/26127467/56d5c4d1-55d8-4222-b481-2418cc29a8d4) + ## Usage From e8bf5a7d24c8f1ef7b3516f8c72c8a35229e3976 Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Tue, 25 Jul 2023 19:18:01 +0800 Subject: [PATCH 25/37] [Fix] Configs' paths of VideoPose3d (#2572) --- ...se-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py} | 0 .../pose_lift/h36m/videopose3d_h36m.md | 10 +++++----- .../pose_lift/h36m/videopose3d_h36m.yml | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py => pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py} (100%) diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py similarity index 100% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-120e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py diff --git a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md index 48502c7b09..8f7c8fba5b 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md +++ b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md @@ -41,15 +41,15 @@ Testing results on Human3.6M dataset with ground truth 2D detections, supervised | Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log | | :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: | -| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | -| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | -| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | 37.6 | 28.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | 
[log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py) | 243 | 37.6 | 28.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | Testing results on Human3.6M dataset with CPN 2D detections1, supervised training | Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log | | :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: | -| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | +| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | | [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | 47.9 | 38.0 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | Testing results on Human3.6M dataset with ground truth 2D detections, semi-supervised training @@ -62,6 +62,6 @@ Testing results on Human3.6M dataset with CPN 2D detections1, semi-su | Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | | :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: | -| 10% S1 | 
[VideoPose3D-semi-supervised-CPN-27frm](/configs/xxx.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | +| 10% S1 | [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | 1 CPN 2D detections are provided by [official repo](https://github.com/facebookresearch/VideoPose3D/blob/master/DATASETS.md). The reformatted version used in this repository can be downloaded from [train_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_train.npy) and [test_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_test.npy). diff --git a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.yml b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.yml index 6b9d92c115..2ae0513699 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.yml +++ b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.yml @@ -6,13 +6,13 @@ Collections: URL: http://openaccess.thecvf.com/content_CVPR_2019/html/Pavllo_3D_Human_Pose_Estimation_in_Video_With_Temporal_Convolutions_and_CVPR_2019_paper.html README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/videopose3d.md Models: -- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py In Collection: VideoPose3D Metadata: Architecture: &id001 - VideoPose3D Training Data: Human3.6M - Name: pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m + Name: pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m Results: - Dataset: Human3.6M Metrics: @@ -20,7 +20,7 @@ Models: P-MPJPE: 30.1 Task: Body 3D Keypoint Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth -- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py In Collection: VideoPose3D Metadata: Architecture: *id001 @@ -33,7 +33,7 @@ Models: P-MPJPE: 29.2 Task: Body 3D Keypoint Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth -- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36mpy In Collection: VideoPose3D Metadata: Architecture: *id001 @@ -46,7 +46,7 @@ Models: P-MPJPE: 28.3 Task: Body 3D Keypoint Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth -- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py +- Config: 
configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py In Collection: VideoPose3D Metadata: Architecture: *id001 From a802a3d88061feeb123a77f160384c06b12ba576 Mon Sep 17 00:00:00 2001 From: Tau Date: Wed, 26 Jul 2023 10:50:47 +0800 Subject: [PATCH 26/37] [Docs] update docs (#2573) --- docs/en/advanced_guides/customize_datasets.md | 6 +-- docs/en/user_guides/inference.md | 12 +++--- docs/en/user_guides/mixed_datasets.md | 14 +++---- .../advanced_guides/customize_datasets.md | 4 +- docs/zh_cn/user_guides/inference.md | 40 +++++++++---------- docs/zh_cn/user_guides/mixed_datasets.md | 16 ++++---- 6 files changed, 46 insertions(+), 46 deletions(-) diff --git a/docs/en/advanced_guides/customize_datasets.md b/docs/en/advanced_guides/customize_datasets.md index 202d23c13c..aec7520a30 100644 --- a/docs/en/advanced_guides/customize_datasets.md +++ b/docs/en/advanced_guides/customize_datasets.md @@ -213,7 +213,7 @@ The following dataset wrappers are supported in [MMEngine](https://github.com/op ### CombinedDataset -MMPose provides `CombinedDataset` to combine multiple datasets with different annotations. A combined dataset can be defined in config files as: +MMPose provides [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) to combine multiple datasets with different annotations. A combined dataset can be defined in config files as: ```python dataset_1 = dict( @@ -250,6 +250,6 @@ combined_dataset = dict( - **MetaInfo of combined dataset** determines the annotation format. Either metainfo of a sub-dataset or a customed dataset metainfo is valid here. To custom a dataset metainfo, please refer to [Create a custom dataset_info config file for the dataset](#create-a-custom-datasetinfo-config-file-for-the-dataset). -- **Converter transforms of sub-datasets** are applied when there exist mismatches of annotation format between sub-datasets and the combined dataset. For example, the number and order of keypoints might be different in the combined dataset and the sub-datasets. Then `KeypointConverter` can be used to unify the keypoints number and order. +- **Converter transforms of sub-datasets** are applied when there exist mismatches of annotation format between sub-datasets and the combined dataset. For example, the number and order of keypoints might be different in the combined dataset and the sub-datasets. Then [KeypointConverter](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/converting.py#L11) can be used to unify the keypoints number and order. -- More details about `CombinedDataset` and `KeypointConverter` can be found in Advanced Guides-[Training with Mixed Datasets](../user_guides/mixed_datasets.md). +- More details about [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) and [KeypointConverter](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/converting.py#L11) can be found in [Advanced Guides - Training with Mixed Datasets](../user_guides/mixed_datasets.md). diff --git a/docs/en/user_guides/inference.md b/docs/en/user_guides/inference.md index 518b2e89d3..127c52ff74 100644 --- a/docs/en/user_guides/inference.md +++ b/docs/en/user_guides/inference.md @@ -9,11 +9,11 @@ In MMPose, a model is defined by a configuration file, while its pre-existing pa ## Inferencer: a Unified Inference Interface -MMPose offers a comprehensive API for inference, known as `MMPoseInferencer`. 
This API enables users to perform inference on both images and videos using all the models supported by MMPose. Furthermore, the API provides automatic visualization of inference results and allows for the convenient saving of predictions. +MMPose offers a comprehensive API for inference, known as [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24). This API enables users to perform inference on both images and videos using all the models supported by MMPose. Furthermore, the API provides automatic visualization of inference results and allows for the convenient saving of predictions. ### Basic Usage -The `MMPoseInferencer` can be used in any Python program to perform pose estimation. Below is an example of inference on a given image using the pre-trained human pose estimator within the Python shell. +The [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) can be used in any Python program to perform pose estimation. Below is an example of inference on a given image using the pre-trained human pose estimator within the Python shell. ```python from mmpose.apis import MMPoseInferencer @@ -80,7 +80,7 @@ python demo/inferencer_demo.py 'tests/data/coco/000000000785.jpg' \ --pose2d 'human' --show --pred-out-dir 'predictions' ``` -The predictions will be save in `predictions/000000000785.json`. The argument names correspond with the `MMPoseInferencer`, which serves as an API. +The predictions will be save in `predictions/000000000785.json`. The argument names correspond with the [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24), which serves as an API. The inferencer is capable of processing a range of input types, which includes the following: @@ -219,7 +219,7 @@ result = next(result_generator) ### Arguments of Inferencer -The `MMPoseInferencer` offers a variety of arguments for customizing pose estimation, visualization, and saving predictions. Below is a list of the arguments available when initializing the inferencer and their descriptions: +The [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) offers a variety of arguments for customizing pose estimation, visualization, and saving predictions. Below is a list of the arguments available when initializing the inferencer and their descriptions: | Argument | Description | | ---------------- | ---------------------------------------------------------------------------------------------------------------- | @@ -233,7 +233,7 @@ The `MMPoseInferencer` offers a variety of arguments for customizing pose estima | `device` | The device to perform the inference. If left `None`, the Inferencer will select the most suitable one. | | `scope` | The namespace where the model modules are defined. | -The inferencer is designed for both visualization and saving predictions. The table below presents the list of arguments available when using the `MMPoseInferencer` for inference, along with their compatibility with 2D and 3D inferencing: +The inferencer is designed for both visualization and saving predictions. 
The table below presents the list of arguments available when using the [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) for inference, along with their compatibility with 2D and 3D inferencing: | Argument | Description | 2D | 3D | | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | --- | @@ -258,7 +258,7 @@ The inferencer is designed for both visualization and saving predictions. The ta ### Model Alias -The MMPose library has predefined aliases for several frequently used models. These aliases can be utilized as a shortcut when initializing the `MMPoseInferencer`, as an alternative to providing the full model configuration name. Here are the available 2D model aliases and their corresponding configuration names: +The MMPose library has predefined aliases for several frequently used models. These aliases can be utilized as a shortcut when initializing the [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24), as an alternative to providing the full model configuration name. Here are the available 2D model aliases and their corresponding configuration names: | Alias | Configuration Name | Task | Pose Estimator | Detector | | --------- | -------------------------------------------------- | ------------------------------- | -------------- | ------------------- | diff --git a/docs/en/user_guides/mixed_datasets.md b/docs/en/user_guides/mixed_datasets.md index 9478ddfd1e..041bd7c656 100644 --- a/docs/en/user_guides/mixed_datasets.md +++ b/docs/en/user_guides/mixed_datasets.md @@ -1,10 +1,10 @@ # Use Mixed Datasets for Training -MMPose offers a convenient and versatile solution for training with mixed datasets through its `CombinedDataset` tool. Acting as a wrapper, it allows for the inclusion of multiple datasets and seamlessly reads and converts data from varying sources into a unified format for model training. The data processing pipeline utilizing `CombinedDataset` is illustrated in the following figure. +MMPose offers a convenient and versatile solution for training with mixed datasets through its [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) tool. Acting as a wrapper, it allows for the inclusion of multiple datasets and seamlessly reads and converts data from varying sources into a unified format for model training. The data processing pipeline utilizing [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) is illustrated in the following figure. ![combined_dataset_pipeline](https://user-images.githubusercontent.com/26127467/223333154-fb88e511-810a-423c-b755-c791d296bc43.jpg) -The following section will provide a detailed description of how to configure `CombinedDataset` with an example that combines the COCO and AI Challenger (AIC) datasets. +The following section will provide a detailed description of how to configure [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) with an example that combines the COCO and AI Challenger (AIC) datasets. ## COCO & AIC example @@ -39,7 +39,7 @@ dataset_coco = dict( ) ``` -For AIC dataset, the order of the keypoints needs to be transformed. MMPose provides a `KeypointConverter` transform to achieve this. 
Here's an example of how to configure the AIC sub dataset: +For AIC dataset, the order of the keypoints needs to be transformed. MMPose provides a [KeypointConverter](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/converting.py#L11) transform to achieve this. Here's an example of how to configure the AIC sub dataset: ```python dataset_aic = dict( @@ -70,9 +70,9 @@ dataset_aic = dict( ) ``` -By using the `KeypointConverter`, the indices of keypoints with indices 0 to 11 will be transformed to corresponding indices among 5 to 16. Meanwhile, the keypoints with indices 12 and 13 will be removed. For the target keypoints with indices 0 to 4, which are not defined in the `mapping` argument, they will be set as invisible and won't be used in training. +By using the [KeypointConverter](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/converting.py#L11), the indices of keypoints with indices 0 to 11 will be transformed to corresponding indices among 5 to 16. Meanwhile, the keypoints with indices 12 and 13 will be removed. For the target keypoints with indices 0 to 4, which are not defined in the `mapping` argument, they will be set as invisible and won't be used in training. -Once the sub datasets are configured, the `CombinedDataset` wrapper can be defined as follows: +Once the sub datasets are configured, the [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) wrapper can be defined as follows: ```python dataset = dict( @@ -100,7 +100,7 @@ The previously mentioned method discards some annotations in the AIC dataset. If
-In this scenario, both COCO and AIC datasets need to adjust the keypoint indices using `KeypointConverter`: +In this scenario, both COCO and AIC datasets need to adjust the keypoint indices using [KeypointConverter](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/converting.py#L11): ```python dataset_coco = dict( @@ -172,7 +172,7 @@ When training with mixed datasets, users often encounter the problem of inconsis ### Adjust the sampling ratio of each sub dataset -In `CombinedDataset`, we provide the `sample_ratio_factor` argument to adjust the sampling ratio of each sub dataset. +In [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15), we provide the `sample_ratio_factor` argument to adjust the sampling ratio of each sub dataset. For example: diff --git a/docs/zh_cn/advanced_guides/customize_datasets.md b/docs/zh_cn/advanced_guides/customize_datasets.md index 2ff16bf9d0..9d1db35ceb 100644 --- a/docs/zh_cn/advanced_guides/customize_datasets.md +++ b/docs/zh_cn/advanced_guides/customize_datasets.md @@ -217,14 +217,14 @@ test_dataloader = dict( ## 数据集封装 -目前 [MMEngine](https://github.com/open-mmlab/mmengine) 支持以下数据集封装: +在 MMPose 中,支持使用 MMPose 实现的数据集封装和 [MMEngine](https://github.com/open-mmlab/mmengine) 实现的数据集封装。目前 [MMEngine](https://github.com/open-mmlab/mmengine) 支持以下数据集封装: - [ConcatDataset](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/basedataset.html#concatdataset) - [RepeatDataset](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/basedataset.html#repeatdataset) ### CombinedDataset -MMPose 提供了一个 `CombinedDataset` 类,它可以将多个数据集封装成一个数据集。它的使用方法如下: +MMPose 提供了一个 [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) 类,它可以将多个数据集封装成一个数据集。它的使用方法如下: ```python dataset_1 = dict( diff --git a/docs/zh_cn/user_guides/inference.md b/docs/zh_cn/user_guides/inference.md index 4017b32d84..f8efa74dc9 100644 --- a/docs/zh_cn/user_guides/inference.md +++ b/docs/zh_cn/user_guides/inference.md @@ -1,18 +1,18 @@ # 使用现有模型进行推理 -MMPose为姿态估计提供了大量可以从[模型库](https://mmpose.readthedocs.io/en/latest/model_zoo.html)中找到的预测训练模型。本指南将演示**如何执行推理**,或使用训练过的模型对提供的图像或视频运行姿态估计。 +MMPose 为姿态估计提供了大量可以从 [模型库](https://mmpose.readthedocs.io/en/latest/model_zoo.html) 中找到的预测训练模型。本指南将演示**如何执行推理**,或使用训练过的模型对提供的图像或视频运行姿态估计。 有关在标准数据集上测试现有模型的说明,请参阅本指南。 -在MMPose,模型由配置文件定义,而其已计算好的参数存储在权重文件(checkpoint file)中。您可以在[模型库](https://mmpose.readthedocs.io/en/latest/model_zoo.html)中找到模型配置文件和相应的权重文件的URL。我们建议从使用HRNet模型的[配置文件](https://github.com/open-mmlab/mmpose/blob/main/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py)和[权重文件](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth)开始。 +在 MMPose,模型由配置文件定义,而其已计算好的参数存储在权重文件(checkpoint file)中。您可以在 [模型库](https://mmpose.readthedocs.io/en/latest/model_zoo.html) 中找到模型配置文件和相应的权重文件的 URL。我们建议从使用 HRNet 模型的[配置文件](https://github.com/open-mmlab/mmpose/blob/main/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py)和 [权重文件](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) 开始。 ## 推理器:统一的推理接口 -MMPose提供了一个被称为`MMPoseInferencer`的、全面的推理API。这个API使得用户得以使用所有MMPose支持的模型来对图像和视频进行模型推理。此外,该API可以完成推理结果自动化,并方便用户保存预测结果。 +MMPose提供了一个被称为 
[MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 的、全面的推理 API。这个 API 使得用户得以使用所有 MMPose 支持的模型来对图像和视频进行模型推理。此外,该API可以完成推理结果自动化,并方便用户保存预测结果。 ### 基本用法 -`MMPoseInferencer`可以在任何Python程序中被用来执行姿态估计任务。以下是在一个在Python Shell中使用预训练的人体姿态模型对给定图像进行推理的示例。 +[MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 可以在任何 Python 程序中被用来执行姿态估计任务。以下是在一个在 Python Shell 中使用预训练的人体姿态模型对给定图像进行推理的示例。 ```python from mmpose.apis import MMPoseInferencer @@ -75,7 +75,7 @@ python demo/inferencer_demo.py 'tests/data/coco/000000000785.jpg' \ --pose2d 'human' --show --pred-out-dir 'predictions' ``` -预测结果将被保存在路径`predictions/000000000785.json`。作为一个API,`inferencer_demo.py`的输入参数与`MMPoseInferencer`的相同。前者能够处理一系列输入类型,包括以下内容: +预测结果将被保存在路径 `predictions/000000000785.json` 。作为一个API,`inferencer_demo.py` 的输入参数与 [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 的相同。前者能够处理一系列输入类型,包括以下内容: - 图像路径 @@ -87,7 +87,7 @@ python demo/inferencer_demo.py 'tests/data/coco/000000000785.jpg' \ - 表示图像的 numpy array 列表 (在命令行界面工具中未支持) -- 摄像头(在这种情况下,输入参数应该设置为`webcam`或`webcam:{CAMERA_ID}`) +- 摄像头(在这种情况下,输入参数应该设置为 `webcam` 或 `webcam:{CAMERA_ID}`) 当输入对应于多个图像时,例如输入为**视频**或**文件夹**路径时,推理生成器必须被遍历,以便推理器对视频/文件夹中的所有帧/图像进行推理。以下是一个示例: @@ -102,7 +102,7 @@ results = [result for result in result_generator] ### 自定义姿态估计模型 -`MMPoseInferencer`提供了几种可用于自定义所使用的模型的方法: +[MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 提供了几种可用于自定义所使用的模型的方法: ```python # 使用模型别名构建推断器 @@ -122,7 +122,7 @@ inferencer = MMPoseInferencer( 模型别名的完整列表可以在模型别名部分中找到。 -此外,自顶向下的姿态估计器还需要一个对象检测模型。`MMPoseInferencer`能够推断用MMPose支持的数据集训练的模型的实例类型,然后构建必要的对象检测模型。用户也可以通过以下方式手动指定检测模型: +此外,自顶向下的姿态估计器还需要一个对象检测模型。[MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 能够推断用 MMPose 支持的数据集训练的模型的实例类型,然后构建必要的对象检测模型。用户也可以通过以下方式手动指定检测模型: ```python # 通过别名指定检测模型 @@ -157,29 +157,29 @@ inferencer = MMPoseInferencer( 在执行姿态估计推理任务之后,您可能希望保存结果以供进一步分析或处理。本节将指导您将预测的关键点和可视化结果保存到本地。 -要将预测保存在JSON文件中,在运行`MMPoseInferencer`的实例`inferencer`时使用`pred_out_dir`参数: +要将预测保存在JSON文件中,在运行 [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 的实例 `inferencer` 时使用 `pred_out_dir` 参数: ```python result_generator = inferencer(img_path, pred_out_dir='predictions') result = next(result_generator) ``` -预测结果将以JSON格式保存在`predictions/`文件夹中,每个文件以相应的输入图像或视频的名称命名。 +预测结果将以 JSON 格式保存在 `predictions/` 文件夹中,每个文件以相应的输入图像或视频的名称命名。 -对于更高级的场景,还可以直接从`inferencer`返回的`result`字典中访问预测结果。其中,`predictions`包含输入图像或视频中每个单独实例的预测关键点列表。然后,您可以使用您喜欢的方法操作或存储这些结果。 +对于更高级的场景,还可以直接从 `inferencer` 返回的 `result` 字典中访问预测结果。其中,`predictions` 包含输入图像或视频中每个单独实例的预测关键点列表。然后,您可以使用您喜欢的方法操作或存储这些结果。 -请记住,如果你想将可视化图像和预测文件保存在一个文件夹中,你可以使用`out_dir`参数: +请记住,如果你想将可视化图像和预测文件保存在一个文件夹中,你可以使用 `out_dir` 参数: ```python result_generator = inferencer(img_path, out_dir='output') result = next(result_generator) ``` -在这种情况下,可视化图像将保存在`output/visualization/`文件夹中,而预测将存储在`output/forecasts/`文件夹中。 +在这种情况下,可视化图像将保存在 `output/visualization/` 文件夹中,而预测将存储在 `output/forecasts/` 文件夹中。 ### 可视化 -推理器`inferencer`可以自动对输入的图像或视频进行预测。可视化结果可以显示在一个新的窗口中,并保存在本地。 +推理器 `inferencer` 可以自动对输入的图像或视频进行预测。可视化结果可以显示在一个新的窗口中,并保存在本地。 要在新窗口中查看可视化结果,请使用以下代码: @@ -187,7 +187,7 @@ result = next(result_generator) - 如果输入视频来自网络摄像头,默认情况下将在新窗口中显示可视化结果,以此让用户看到输入 -- 如果平台上没有GUI,这个步骤可能会卡住 +- 如果平台上没有 
GUI,这个步骤可能会卡住 要将可视化结果保存在本地,可以像这样指定`vis_out_dir`参数: @@ -196,9 +196,9 @@ result_generator = inferencer(img_path, vis_out_dir='vis_results') result = next(result_generator) ``` -输入图片或视频的可视化预测结果将保存在`vis_results/`文件夹中 +输入图片或视频的可视化预测结果将保存在 `vis_results/` 文件夹中 -在开头展示的滑雪图中,姿态的可视化估计结果由关键点(用实心圆描绘)和骨架(用线条表示)组成。这些视觉元素的默认大小可能不会产生令人满意的结果。用户可以使用`radius`和`thickness`参数来调整圆的大小和线的粗细,如下所示: +在开头展示的滑雪图中,姿态的可视化估计结果由关键点(用实心圆描绘)和骨架(用线条表示)组成。这些视觉元素的默认大小可能不会产生令人满意的结果。用户可以使用 `radius` 和 `thickness` 参数来调整圆的大小和线的粗细,如下所示: ```python result_generator = inferencer(img_path, show=True, radius=4, thickness=2) @@ -207,7 +207,7 @@ result = next(result_generator) ### 推理器参数 -`MMPoseInferencer`提供了各种自定义姿态估计、可视化和保存预测结果的参数。下面是初始化推断器时可用的参数列表及对这些参数的描述: +[MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 提供了各种自定义姿态估计、可视化和保存预测结果的参数。下面是初始化推断器时可用的参数列表及对这些参数的描述: | Argument | Description | | ---------------- | ------------------------------------------------------------ | @@ -221,7 +221,7 @@ result = next(result_generator) | `device` | 执行推理的设备。如果为 `None`,推理器将选择最合适的一个。 | | `scope` | 定义模型模块的名称空间 | -推理器被设计用于可视化和保存预测。以下表格列出了在使用 `MMPoseInferencer` 进行推断时可用的参数列表,以及它们与 2D 和 3D 推理器的兼容性: +推理器被设计用于可视化和保存预测。以下表格列出了在使用 [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 进行推断时可用的参数列表,以及它们与 2D 和 3D 推理器的兼容性: | 参数 | 描述 | 2D | 3D | | ------------------------- | -------------------------------------------------------------------------------------------------------------------------- | --- | --- | @@ -246,7 +246,7 @@ result = next(result_generator) ### 模型别名 -MMPose为常用模型提供了一组预定义的别名。在初始化 `MMPoseInferencer` 时,这些别名可以用作简略的表达方式,而不是指定完整的模型配置名称。下面是可用的模型别名及其对应的配置名称的列表: +MMPose 为常用模型提供了一组预定义的别名。在初始化 [MMPoseInferencer](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/apis/inferencers/mmpose_inferencer.py#L24) 时,这些别名可以用作简略的表达方式,而不是指定完整的模型配置名称。下面是可用的模型别名及其对应的配置名称的列表: | 别名 | 配置文件名称 | 对应任务 | 姿态估计模型 | 检测模型 | | --------- | -------------------------------------------------- | ------------------------------- | ------------- | ------------------- | diff --git a/docs/zh_cn/user_guides/mixed_datasets.md b/docs/zh_cn/user_guides/mixed_datasets.md index d62ace5f5d..6839da3b3d 100644 --- a/docs/zh_cn/user_guides/mixed_datasets.md +++ b/docs/zh_cn/user_guides/mixed_datasets.md @@ -1,10 +1,10 @@ # 混合数据集训练 -MMPose 提供了一个灵活、便捷的工具 `CombinedDataset` 来进行混合数据集训练。它作为一个封装器,可以包含多个子数据集,并将来自不同子数据集的数据转换成一个统一的格式,以用于模型训练。使用 `CombinedDataset` 的数据处理流程如下图所示。 +MMPose 提供了一个灵活、便捷的工具 [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) 来进行混合数据集训练。它作为一个封装器,可以包含多个子数据集,并将来自不同子数据集的数据转换成一个统一的格式,以用于模型训练。使用 [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) 的数据处理流程如下图所示。 ![combined_dataset_pipeline](https://user-images.githubusercontent.com/26127467/223333154-fb88e511-810a-423c-b755-c791d296bc43.jpg) -本篇教程的后续部分将通过一个结合 COCO 和 AI Challenger (AIC) 数据集的例子详细介绍如何配置 `CombinedDataset`。 +本篇教程的后续部分将通过一个结合 COCO 和 AI Challenger (AIC) 数据集的例子详细介绍如何配置 [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15)。 ## COCO & AIC 数据集混合案例 @@ -39,7 +39,7 @@ dataset_coco = dict( ) ``` -对于 AIC 数据集,需要转换关键点的顺序。MMPose 提供了一个 `KeypointConverter` 转换器来实现这一点。以下是配置 AIC 子数据集的示例: +对于 AIC 数据集,需要转换关键点的顺序。MMPose 提供了一个 [KeypointConverter](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/converting.py#L11) 转换器来实现这一点。以下是配置 AIC 
子数据集的示例: ```python dataset_aic = dict( @@ -70,9 +70,9 @@ dataset_aic = dict( ) ``` -`KeypointConverter` 会将原序号在 0 到 11 之间的关键点的序号转换为在 5 到 16 之间的对应序号。同时,在 AIC 中序号为为 12 和 13 的关键点将被删除。另外,目标序号在 0 到 4 之间的关键点在 `mapping` 参数中没有定义,这些点将被设为不可见,并且不会在训练中使用。 +[KeypointConverter](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/converting.py#L11) 会将原序号在 0 到 11 之间的关键点的序号转换为在 5 到 16 之间的对应序号。同时,在 AIC 中序号为为 12 和 13 的关键点将被删除。另外,目标序号在 0 到 4 之间的关键点在 `mapping` 参数中没有定义,这些点将被设为不可见,并且不会在训练中使用。 -子数据集都完成配置后, 混合数据集 `CombinedDataset` 可以通过如下方式配置: +子数据集都完成配置后, 混合数据集 [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) 可以通过如下方式配置: ```python dataset = dict( @@ -98,7 +98,7 @@ MMPose 提供了一份完整的 [配置文件](https://github.com/open-mmlab/mmp
-在这种情况下,COCO 和 AIC 数据集都需要使用 `KeypointConverter` 来调整它们关键点的顺序: +在这种情况下,COCO 和 AIC 数据集都需要使用 [KeypointConverter](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/transforms/converting.py#L11) 来调整它们关键点的顺序: ```python dataset_coco = dict( @@ -145,7 +145,7 @@ dataset_aic = dict( - 在 `skeleton_info` 中添加了“头顶”和“颈部”间的连线; - 拓展 `joint_weights` 和 `sigmas` 以添加新增关键点的信息。 -完成以上步骤后,合并数据集 `CombinedDataset` 可以通过以下方式配置: +完成以上步骤后,合并数据集 [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) 可以通过以下方式配置: ```python dataset = dict( @@ -170,7 +170,7 @@ dataset = dict( ### 调整每个子数据集的采样比例 -在 `CombinedDataset` 中,我们提供了 `sample_ratio_factor` 参数来调整每个子数据集的采样比例。 +在 [CombinedDataset](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/datasets/dataset_wrappers.py#L15) 中,我们提供了 `sample_ratio_factor` 参数来调整每个子数据集的采样比例。 例如: From 472c5abf8a80b20f00c094a0f17e96f7ef7a0b1e Mon Sep 17 00:00:00 2001 From: Tau Date: Thu, 27 Jul 2023 00:27:34 +0800 Subject: [PATCH 27/37] [Fix] Fix new config bug in train.py (#2575) --- tools/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/train.py b/tools/train.py index 1fd423ad3f..84eec2d577 100644 --- a/tools/train.py +++ b/tools/train.py @@ -98,7 +98,8 @@ def merge_args(cfg, args): if args.amp is True: from mmengine.optim import AmpOptimWrapper, OptimWrapper optim_wrapper = cfg.optim_wrapper.get('type', OptimWrapper) - assert optim_wrapper in (OptimWrapper, AmpOptimWrapper), \ + assert optim_wrapper in (OptimWrapper, AmpOptimWrapper, + 'OptimWrapper', 'AmpOptimWrapper'), \ '`--amp` is not supported custom optimizer wrapper type ' \ f'`{optim_wrapper}.' cfg.optim_wrapper.type = 'AmpOptimWrapper' From b5f79ad5a2ef56cd13a4cdd78a7deff8eb7c36b9 Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Thu, 27 Jul 2023 10:33:57 +0800 Subject: [PATCH 28/37] [Fix] Configs' of MotionBERT (#2574) --- configs/body_3d_keypoint/pose_lift/README.md | 6 +++--- .../pose_lift/h36m/motionbert_h36m.md | 8 ++++---- .../pose_lift/h36m/motionbert_h36m.yml | 16 +++++++++++++--- ...otionbert-243frm_8xb32-240e_h36m-original.py} | 2 +- ...se-lift_motionbert-243frm_8xb32-240e_h36m.py} | 2 +- ...onbert-ft-243frm_8xb32-120e_h36m-original.py} | 2 +- ...lift_motionbert-ft-243frm_8xb32-120e_h36m.py} | 2 +- 7 files changed, 24 insertions(+), 14 deletions(-) rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py => pose-lift_motionbert-243frm_8xb32-240e_h36m-original.py} (98%) rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_motionbert-243frm_8xb32-120e_h36m.py => pose-lift_motionbert-243frm_8xb32-240e_h36m.py} (98%) rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_motionbert-ft-243frm_8xb32-60e_h36m-original.py => pose-lift_motionbert-ft-243frm_8xb32-120e_h36m-original.py} (98%) rename configs/body_3d_keypoint/pose_lift/h36m/{pose-lift_motionbert-ft-243frm_8xb32-60e_h36m.py => pose-lift_motionbert-ft-243frm_8xb32-120e_h36m.py} (98%) diff --git a/configs/body_3d_keypoint/pose_lift/README.md b/configs/body_3d_keypoint/pose_lift/README.md index 66cd0548ae..ac224a5f56 100644 --- a/configs/body_3d_keypoint/pose_lift/README.md +++ b/configs/body_3d_keypoint/pose_lift/README.md @@ -25,15 +25,15 @@ For single-person 3D pose estimation from a monocular camera, existing works can | [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 47.9 | 38.0 | / | 
[ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | | [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | | [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | -| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 27.7 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m.py) | 35.3 | 27.7 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | | [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m.py) | 27.5 | 21.6 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | #### Human3.6m Dataset from official repo 1 | Arch | MPJPE | Average MPJPE | P-MPJPE | ckpt | log | Details and Download | | :------------------------------------------------------------- | :---: | :-----------: | :-----: | :-------------------------------------------------------------: | :-: | :---------------------------------------------: | -| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | -| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| 
[MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | 1 Please refer to the [doc](./h36m/motionbert_h36m.md) for more details. diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md index f23aa13a2e..fcd5d2051e 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md @@ -40,15 +40,15 @@ Testing results on Human3.6M dataset with ground truth 2D detections | Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | | :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | -| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 34.5 | 34.6 | 27.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | -| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 26.9 | 26.8 | 21.0 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m.py) | 34.5 | 34.6 | 27.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m.py) | 26.9 | 26.8 | 21.0 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | Testing results on Human3.6M dataset converted from the [official repo](https://github.com/Walter0807/MotionBERT)1 with ground truth 2D detections | Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | | :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | -| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | -| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | +| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | 1 By default, we test models with [Human 3.6m dataset](/docs/en/dataset_zoo/3d_body_keypoint.md#human3-6m) processed by MMPose. The official repo's dataset includes more data and applies a different pre-processing technique. To achieve the same result with the official repo, please download the [test annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_test_original.npz), [train annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_train_original.npz) and [factors](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_factors.npy) under `$MMPOSE/data/h36m/annotation_body3d/fps50` and test with the configs we provided. diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml index 11ab4bb382..63e1947b0f 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml +++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml @@ -5,12 +5,12 @@ Collections: URL: https://arxiv.org/abs/2210.06551 README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/motionbert.md Models: -- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert_8xb32-120e_h36m.py +- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert_8xb32-240e_h36m.py In Collection: MotionBERT Metadata: Architecture: &id001 - MotionBERT - Training Data: Human3.6M + Training Data: Human3.6M (MotionBERT) Name: vid_pl_motionbert_8xb32-120e_h36m Results: - Dataset: Human3.6M @@ -18,12 +18,17 @@ Models: MPJPE: 34.5 P-MPJPE: 27.1 Task: Body 3D Keypoint + - Dataset: Human3.6M (MotionBERT) + Metrics: + MPJPE: 39.8 + P-MPJPE: 33.4 + Task: Body 3D Keypoint Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth - Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft_8xb32-120e_h36m.py In Collection: MotionBERT Metadata: Architecture: *id001 - Training Data: Human3.6M + Training Data: Human3.6M (MotionBERT) Name: vid_pl_motionbert-finetuned_8xb32-120e_h36m Results: - Dataset: Human3.6M @@ -31,4 +36,9 @@ Models: MPJPE: 26.9 P-MPJPE: 21.0 Task: Body 3D Keypoint + - Dataset: Human3.6M (MotionBERT) + Metrics: + MPJPE: 37.7 + P-MPJPE: 32.2 + Task: Body 3D Keypoint Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m-original.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m-original.py index 032188f389..caf2e56530 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m-original.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=120, val_interval=10) +train_cfg = dict(max_epochs=240, val_interval=10) # optimizer optim_wrapper = dict( diff --git 
a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m.py index 25b9d216a2..ea91556198 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-240e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=120, val_interval=10) +train_cfg = dict(max_epochs=240, val_interval=10) # optimizer optim_wrapper = dict( diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m-original.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m-original.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m-original.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m-original.py index 9c2aa3697a..555fd8ae0e 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m-original.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m-original.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=60, val_interval=10) +train_cfg = dict(max_epochs=120, val_interval=10) # optimizer optim_wrapper = dict( diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m.py similarity index 98% rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m.py rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m.py index 5c42e62a60..256a765539 100644 --- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-60e_h36m.py +++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft-243frm_8xb32-120e_h36m.py @@ -7,7 +7,7 @@ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') # runtime -train_cfg = dict(max_epochs=60, val_interval=10) +train_cfg = dict(max_epochs=120, val_interval=10) # optimizer optim_wrapper = dict( From 93c57234fae1b3565a645b10093cc24877c01fb1 Mon Sep 17 00:00:00 2001 From: Yifan Lareina WU Date: Thu, 27 Jul 2023 12:51:47 +0800 Subject: [PATCH 29/37] [Enhance] Normalization option in 3d human pose demo and inferencer (#2576) --- demo/body3d_pose_lifter_demo.py | 14 ++++++++------ demo/docs/en/3d_human_pose_demo.md | 2 +- demo/inferencer_demo.py | 10 +++++----- docs/en/user_guides/inference.md | 2 +- docs/zh_cn/user_guides/inference.md | 2 +- mmpose/apis/inferencers/mmpose_inferencer.py | 2 +- mmpose/apis/inferencers/pose3d_inferencer.py | 2 +- 7 files changed, 18 insertions(+), 16 deletions(-) diff --git a/demo/body3d_pose_lifter_demo.py b/demo/body3d_pose_lifter_demo.py index b5c19e8916..7ed63fe3c9 100644 --- a/demo/body3d_pose_lifter_demo.py +++ b/demo/body3d_pose_lifter_demo.py @@ -66,12 +66,13 @@ def parse_args(): 'is useful for visualization when the model do 
not predict the ' 'global position of the 3D pose.') parser.add_argument( - '--norm-pose-2d', + '--disable-norm-pose-2d', action='store_true', - help='Scale the bbox (along with the 2D pose) to the average bbox ' - 'scale of the dataset, and move the bbox (along with the 2D pose) to ' - 'the average bbox center of the dataset. This is useful when bbox ' - 'is small, especially in multi-person scenarios.') + default=False, + help='Whether to scale the bbox (along with the 2D pose) to the ' + 'average bbox scale of the dataset, and move the bbox (along with the ' + '2D pose) to the average bbox center of the dataset. This is useful ' + 'when bbox is small, especially in multi-person scenarios.') parser.add_argument( '--num-instances', type=int, @@ -256,11 +257,12 @@ def process_one_image(args, detector, frame, frame_idx, pose_estimator, step=pose_lift_dataset.get('seq_step', 1)) # 2D-to-3D pose lifting + norm_pose_2d = not args.disable_norm_pose_2d pose_lift_results = inference_pose_lifter_model( pose_lifter, pose_seq_2d, image_size=visualize_frame.shape[:2], - norm_pose_2d=args.norm_pose_2d) + norm_pose_2d=norm_pose_2d) # post-processing for idx, pose_lift_result in enumerate(pose_lift_results): diff --git a/demo/docs/en/3d_human_pose_demo.md b/demo/docs/en/3d_human_pose_demo.md index b46c740de6..176ee06489 100644 --- a/demo/docs/en/3d_human_pose_demo.md +++ b/demo/docs/en/3d_human_pose_demo.md @@ -19,7 +19,7 @@ ${MMPOSE_CHECKPOINT_FILE_3D} \ --input ${VIDEO_PATH or IMAGE_PATH or 'webcam'} \ [--show] \ [--disable-rebase-keypoint] \ -[--norm-pose-2d] \ +[--disable-norm-pose-2d] \ [--num-instances ${NUM_INSTANCES}] \ [--output-root ${OUT_VIDEO_ROOT}] \ [--save-predictions] \ diff --git a/demo/inferencer_demo.py b/demo/inferencer_demo.py index 7053768e69..0ab816e9fb 100644 --- a/demo/inferencer_demo.py +++ b/demo/inferencer_demo.py @@ -97,12 +97,12 @@ def parse_args(): action='store_true', help='Whether to use OKS as similarity in tracking') parser.add_argument( - '--norm-pose-2d', + '--disable-norm-pose-2d', action='store_true', - help='Scale the bbox (along with the 2D pose) to the average bbox ' - 'scale of the dataset, and move the bbox (along with the 2D pose) to ' - 'the average bbox center of the dataset. This is useful when bbox ' - 'is small, especially in multi-person scenarios.') + help='Whether to scale the bbox (along with the 2D pose) to the ' + 'average bbox scale of the dataset, and move the bbox (along with the ' + '2D pose) to the average bbox center of the dataset. This is useful ' + 'when bbox is small, especially in multi-person scenarios.') parser.add_argument( '--disable-rebase-keypoint', action='store_true', diff --git a/docs/en/user_guides/inference.md b/docs/en/user_guides/inference.md index 127c52ff74..d05e5cd15c 100644 --- a/docs/en/user_guides/inference.md +++ b/docs/en/user_guides/inference.md @@ -247,7 +247,7 @@ The inferencer is designed for both visualization and saving predictions. The ta | `skeleton_style` | Sets the skeleton style. Options include 'mmpose' (default) and 'openpose'. | ✔️ | ❌ | | `use_oks_tracking` | Decides whether to use OKS as a similarity measure in tracking. | ❌ | ✔️ | | `tracking_thr` | Sets the similarity threshold for tracking. | ❌ | ✔️ | -| `norm_pose_2d` | Decides whether to scale the bounding box to the dataset's average bounding box scale and relocate the bounding box to the dataset's average bounding box center. 
| ❌ | ✔️ | +| `disable_norm_pose_2d` | Decides whether to scale the bounding box to the dataset's average bounding box scale and relocate the bounding box to the dataset's average bounding box center. | ❌ | ✔️ | | `disable_rebase_keypoint` | Decides whether to set the lowest keypoint with height 0. | ❌ | ✔️ | | `num_instances` | Sets the number of instances to visualize in the results. If set to a negative number, all detected instances will be visualized. | ❌ | ✔️ | | `return_vis` | Decides whether to include visualization images in the results. | ✔️ | ✔️ | diff --git a/docs/zh_cn/user_guides/inference.md b/docs/zh_cn/user_guides/inference.md index f8efa74dc9..26ef366d46 100644 --- a/docs/zh_cn/user_guides/inference.md +++ b/docs/zh_cn/user_guides/inference.md @@ -235,7 +235,7 @@ result = next(result_generator) | `skeleton_style` | 设置骨架样式。可选项包括 'mmpose'(默认)和 'openpose'。 | ✔️ | ❌ | | `use_oks_tracking` | 决定是否在追踪中使用OKS作为相似度测量。 | ❌ | ✔️ | | `tracking_thr` | 设置追踪的相似度阈值。 | ❌ | ✔️ | -| `norm_pose_2d` | 决定是否将边界框缩放至数据集的平均边界框尺寸,并将边界框移至数据集的平均边界框中心。 | ❌ | ✔️ | +| `disable_norm_pose_2d` | 决定是否将边界框缩放至数据集的平均边界框尺寸,并将边界框移至数据集的平均边界框中心。 | ❌ | ✔️ | | `disable_rebase_keypoint` | 决定是否将最低关键点的高度置为 0。 | ❌ | ✔️ | | `num_instances` | 设置可视化结果中显示的实例数量。如果设置为负数,则所有实例的结果都会可视化。 | ❌ | ✔️ | | `return_vis` | 决定是否在结果中包含可视化图像。 | ✔️ | ✔️ | diff --git a/mmpose/apis/inferencers/mmpose_inferencer.py b/mmpose/apis/inferencers/mmpose_inferencer.py index 3ec958223f..77d117d47a 100644 --- a/mmpose/apis/inferencers/mmpose_inferencer.py +++ b/mmpose/apis/inferencers/mmpose_inferencer.py @@ -56,7 +56,7 @@ class MMPoseInferencer(BaseMMPoseInferencer): preprocess_kwargs: set = { 'bbox_thr', 'nms_thr', 'bboxes', 'use_oks_tracking', 'tracking_thr', - 'norm_pose_2d' + 'disable_norm_pose_2d' } forward_kwargs: set = {'disable_rebase_keypoint'} visualize_kwargs: set = { diff --git a/mmpose/apis/inferencers/pose3d_inferencer.py b/mmpose/apis/inferencers/pose3d_inferencer.py index 472f43bee2..94832958a4 100644 --- a/mmpose/apis/inferencers/pose3d_inferencer.py +++ b/mmpose/apis/inferencers/pose3d_inferencer.py @@ -69,7 +69,7 @@ class Pose3DInferencer(BaseMMPoseInferencer): preprocess_kwargs: set = { 'bbox_thr', 'nms_thr', 'bboxes', 'use_oks_tracking', 'tracking_thr', - 'norm_pose_2d' + 'disable_norm_pose_2d' } forward_kwargs: set = {'disable_rebase_keypoint'} visualize_kwargs: set = { From abe09d3bb41138c08f0d565c9cb95772271ac1c0 Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Thu, 27 Jul 2023 12:52:07 +0800 Subject: [PATCH 30/37] [Fix] Fix the incorrect labels for training vis_head with combined datasets (#2550) --- .../topdown_heatmap/coco/resnet_coco.md | 6 + ...res50_8xb64-210e_coco-aic-256x192-merge.py | 167 ++++++++++++++++++ .../advanced_guides/implement_new_models.md | 24 +++ .../advanced_guides/implement_new_models.md | 24 +++ .../datasets/transforms/common_transforms.py | 20 +-- mmpose/datasets/transforms/converting.py | 15 +- mmpose/datasets/transforms/formatting.py | 6 +- mmpose/models/heads/hybrid_heads/vis_head.py | 46 +++-- mmpose/models/losses/classification_loss.py | 8 +- mmpose/models/pose_estimators/bottomup.py | 3 + mmpose/models/pose_estimators/topdown.py | 3 + mmpose/structures/keypoint/transforms.py | 16 +- mmpose/visualization/local_visualizer.py | 38 ++-- .../test_transforms/test_converting.py | 12 +- 14 files changed, 315 insertions(+), 73 deletions(-) create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py diff --git 
a/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md index 4ce6da38c6..dbe14267ed 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md @@ -60,3 +60,9 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py) | 384x288 | 0.749 | 0.906 | 0.817 | 0.799 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192_20220926.log) | | [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py) | 256x192 | 0.736 | 0.904 | 0.818 | 0.791 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192-0345f330_20220928.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192_20220928.log) | | [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py) | 384x288 | 0.750 | 0.908 | 0.821 | 0.800 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288-7fbb906f_20220927.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288_20220927.log) | + +The following model is equipped with a visibility prediction head and has been trained using COCO and AIC datasets. 
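+
+A minimal usage sketch with `MMPoseInferencer` (the config and checkpoint paths are the ones listed in the table below; the sample image is the repository test asset used elsewhere in the docs):
+
+```python
+from mmpose.apis import MMPoseInferencer
+
+# build a top-down inferencer from the visibility-aware config and checkpoint
+inferencer = MMPoseInferencer(
+    pose2d='configs/body_2d_keypoint/topdown_heatmap/coco/'
+    'td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py',
+    pose2d_weights='https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/'
+    'topdown_heatmap/coco/'
+    'td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge-21815b2c_20230726.pth')
+
+# run inference on a sample image and dump the predictions to JSON;
+# keypoint visibility is produced by the dedicated prediction head here
+result = next(inferencer('tests/data/coco/000000000785.jpg',
+                         pred_out_dir='predictions'))
+```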
+ +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py) | 256x192 | 0.729 | 0.900 | 0.807 | 0.783 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge-21815b2c_20230726.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192_20220923.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py new file mode 100644 index 0000000000..f5def39ed9 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py @@ -0,0 +1,167 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='VisPredictHead', + loss=dict( + type='BCELoss', + use_target_weight=True, + use_sigmoid=True, + loss_weight=1e-3, + ), + pose_cfg=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec)), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + 
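+    # AIC keypoints are remapped to the COCO keypoint order used by this
+    # model via the `KeypointConverter` transform in `pipeline` below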
data_root='data/aic/', + data_mode=data_mode, + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + # score_mode='bbox', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/docs/en/advanced_guides/implement_new_models.md b/docs/en/advanced_guides/implement_new_models.md index da46a99e39..ff54e2c5ff 100644 --- a/docs/en/advanced_guides/implement_new_models.md +++ b/docs/en/advanced_guides/implement_new_models.md @@ -79,3 +79,27 @@ class YourNewHead(BaseHead): ``` Finally, please remember to import your new prediction head in `[__init__.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/__init__.py)` . + +### Head with Keypoints Visibility Prediction + +Many models predict keypoint visibility based on confidence in coordinate predictions. However, this approach is suboptimal. Our [`VisPredictHead`](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/heads/hybrid_heads/vis_head.py) wrapper enables heads to directly predict keypoint visibility from ground truth training data, improving reliability. To add visibility prediction, wrap your head module with VisPredictHead in the config file. + +```python +model=dict( + ... + head=dict( + type='VisPredictHead', + loss=dict( + type='BCELoss', + use_target_weight=True, + use_sigmoid=True, + loss_weight=1e-3), + pose_cfg=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec)), + ... 
+) +``` diff --git a/docs/zh_cn/advanced_guides/implement_new_models.md b/docs/zh_cn/advanced_guides/implement_new_models.md index d3ed96bd37..22e866b52b 100644 --- a/docs/zh_cn/advanced_guides/implement_new_models.md +++ b/docs/zh_cn/advanced_guides/implement_new_models.md @@ -78,3 +78,27 @@ class YourNewHead(BaseHead): ``` 最后,请记得在 [heads/\_\_init\_\_.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/__init__.py) 中导入你的新预测头部。 + +### 关键点可见性预测头部 + +许多模型都是通过对关键点坐标预测的置信度来判断关键点的可见性的。然而,这种解决方案并非最优。我们提供了一个叫做 [`VisPredictHead`](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/heads/hybrid_heads/vis_head.py) 的头部模块包装器,使得头部模块能够直接预测关键点的可见性。这个包装器是用训练数据中关键点可见性真值来训练的。因此,其预测会更加可靠。用户可以通过修改配置文件来对自己的头部模块加上这个包装器。下面是一个例子: + +```python +model=dict( + ... + head=dict( + type='VisPredictHead', + loss=dict( + type='BCELoss', + use_target_weight=True, + use_sigmoid=True, + loss_weight=1e-3), + pose_cfg=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec)), + ... +) +``` diff --git a/mmpose/datasets/transforms/common_transforms.py b/mmpose/datasets/transforms/common_transforms.py index 87068246f8..8f7aa05425 100644 --- a/mmpose/datasets/transforms/common_transforms.py +++ b/mmpose/datasets/transforms/common_transforms.py @@ -340,7 +340,7 @@ def _random_select_half_body(self, keypoints_visible: np.ndarray, Args: keypoints_visible (np.ndarray, optional): The visibility of - keypoints in shape (N, K, 1). + keypoints in shape (N, K, 1) or (N, K, 2). upper_body_ids (list): The list of upper body keypoint indices lower_body_ids (list): The list of lower body keypoint indices @@ -349,6 +349,9 @@ def _random_select_half_body(self, keypoints_visible: np.ndarray, of each instance. ``None`` means not applying half-body transform. """ + if keypoints_visible.ndim == 3: + keypoints_visible = keypoints_visible[..., 0] + half_body_ids = [] for visible in keypoints_visible: @@ -390,7 +393,6 @@ def transform(self, results: Dict) -> Optional[dict]: Returns: dict: The result dict. """ - half_body_ids = self._random_select_half_body( keypoints_visible=results['keypoints_visible'], upper_body_ids=results['upper_body_ids'], @@ -952,6 +954,10 @@ def transform(self, results: Dict) -> Optional[dict]: ' \'keypoints\' in the results.') keypoints_visible = results['keypoints_visible'] + if keypoints_visible.ndim == 3 and keypoints_visible.shape[2] == 2: + keypoints_visible, keypoints_visible_weights = \ + keypoints_visible[..., 0], keypoints_visible[..., 1] + results['keypoints_visible_weights'] = keypoints_visible_weights # Encoded items from the encoder(s) will be updated into the results. 
# Please refer to the document of the specific codec for details about @@ -1031,16 +1037,6 @@ def transform(self, results: Dict) -> Optional[dict]: results.update(encoded) - if results.get('keypoint_weights', None) is not None: - results['transformed_keypoints_visible'] = results[ - 'keypoint_weights'] - elif results.get('keypoints', None) is not None: - results['transformed_keypoints_visible'] = results[ - 'keypoints_visible'] - else: - raise ValueError('GenerateTarget requires \'keypoint_weights\' or' - ' \'keypoints_visible\' in the results.') - return results def __repr__(self) -> str: diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index 38dcea0994..c8a4a172cf 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -87,13 +87,18 @@ def __init__(self, num_keypoints: int, self.interpolation = interpolation def transform(self, results: dict) -> dict: + """Transforms the keypoint results to match the target keypoints.""" num_instances = results['keypoints'].shape[0] + # Initialize output arrays keypoints = np.zeros((num_instances, self.num_keypoints, 2)) keypoints_visible = np.zeros((num_instances, self.num_keypoints)) - # When paired source_indexes are input, - # perform interpolation with self.source_index and self.source_index2 + # Create a mask to weight visibility loss + keypoints_visible_weights = keypoints_visible.copy() + keypoints_visible_weights[:, self.target_index] = 1.0 + + # Interpolate keypoints if pairs of source indexes provided if self.interpolation: keypoints[:, self.target_index] = 0.5 * ( results['keypoints'][:, self.source_index] + @@ -102,6 +107,8 @@ def transform(self, results: dict) -> dict: keypoints_visible[:, self.target_index] = results[ 'keypoints_visible'][:, self.source_index] * \ results['keypoints_visible'][:, self.source_index2] + + # Otherwise just copy from the source index else: keypoints[:, self.target_index] = results['keypoints'][:, self. 
@@ -109,8 +116,10 @@ def transform(self, results: dict) -> dict: keypoints_visible[:, self.target_index] = results[ 'keypoints_visible'][:, self.source_index] + # Update the results dict results['keypoints'] = keypoints - results['keypoints_visible'] = keypoints_visible + results['keypoints_visible'] = np.stack( + [keypoints_visible, keypoints_visible_weights], axis=2) return results def __repr__(self) -> str: diff --git a/mmpose/datasets/transforms/formatting.py b/mmpose/datasets/transforms/formatting.py index d047cff3c3..c2431c70bf 100644 --- a/mmpose/datasets/transforms/formatting.py +++ b/mmpose/datasets/transforms/formatting.py @@ -128,7 +128,7 @@ class PackPoseInputs(BaseTransform): 'keypoint_y_labels': 'keypoint_y_labels', 'keypoint_weights': 'keypoint_weights', 'instance_coords': 'instance_coords', - 'transformed_keypoints_visible': 'keypoints_visible', + 'keypoints_visible_weights': 'keypoints_visible_weights' } # items in `field_mapping_table` will be packed into @@ -195,10 +195,6 @@ def transform(self, results: dict) -> dict: if self.pack_transformed and 'transformed_keypoints' in results: gt_instances.set_field(results['transformed_keypoints'], 'transformed_keypoints') - if self.pack_transformed and \ - 'transformed_keypoints_visible' in results: - gt_instances.set_field(results['transformed_keypoints_visible'], - 'transformed_keypoints_visible') data_sample.gt_instances = gt_instances diff --git a/mmpose/models/heads/hybrid_heads/vis_head.py b/mmpose/models/heads/hybrid_heads/vis_head.py index e9ea271ac5..f95634541b 100644 --- a/mmpose/models/heads/hybrid_heads/vis_head.py +++ b/mmpose/models/heads/hybrid_heads/vis_head.py @@ -31,8 +31,7 @@ def __init__(self, pose_cfg: ConfigType, loss: ConfigType = dict( type='BCELoss', use_target_weight=False, - with_logits=True), - use_sigmoid: bool = False, + use_sigmoid=True), init_cfg: OptConfigType = None): if init_cfg is None: @@ -54,14 +53,14 @@ def __init__(self, self.pose_head = MODELS.build(pose_cfg) self.pose_cfg = pose_cfg - self.use_sigmoid = use_sigmoid + self.use_sigmoid = loss.get('use_sigmoid', False) modules = [ nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(self.in_channels, self.out_channels) ] - if use_sigmoid: + if self.use_sigmoid: modules.append(nn.Sigmoid()) self.vis_head = nn.Sequential(*modules) @@ -113,7 +112,7 @@ def integrate(self, batch_vis: Tensor, assert len(pose_pred_instances) == len(batch_vis_np) for index, _ in enumerate(pose_pred_instances): - pose_pred_instances[index].keypoint_scores = batch_vis_np[index] + pose_pred_instances[index].keypoints_visible = batch_vis_np[index] return pose_pred_instances, pose_pred_fields @@ -176,15 +175,20 @@ def predict(self, return self.integrate(batch_vis, batch_pose) - def vis_accuracy(self, vis_pred_outputs, vis_labels): + @torch.no_grad() + def vis_accuracy(self, vis_pred_outputs, vis_labels, vis_weights=None): """Calculate visibility prediction accuracy.""" - probabilities = torch.sigmoid(torch.flatten(vis_pred_outputs)) + if not self.use_sigmoid: + vis_pred_outputs = torch.sigmoid(vis_pred_outputs) threshold = 0.5 - predictions = (probabilities >= threshold).int() - labels = torch.flatten(vis_labels) - correct = torch.sum(predictions == labels).item() - accuracy = correct / len(labels) - return torch.tensor(accuracy) + predictions = (vis_pred_outputs >= threshold).float() + correct = (predictions == vis_labels).float() + if vis_weights is not None: + accuracy = (correct * vis_weights).sum(dim=1) / ( + vis_weights.sum(dim=1, keepdims=True) + 1e-6) + else: + 
accuracy = correct.mean(dim=1) + return accuracy.mean() def loss(self, feats: Tuple[Tensor], @@ -203,18 +207,26 @@ def loss(self, dict: A dictionary of losses. """ vis_pred_outputs = self.vis_forward(feats) - vis_labels = torch.cat([ - d.gt_instance_labels.keypoint_weights for d in batch_data_samples - ]) + vis_labels = [] + vis_weights = [] if self.loss_module.use_target_weight else None + for d in batch_data_samples: + vis_label = d.gt_instance_labels.keypoint_weights.float() + vis_labels.append(vis_label) + if vis_weights is not None: + vis_weights.append( + getattr(d.gt_instance_labels, 'keypoints_visible_weights', + vis_label.new_ones(vis_label.shape))) + vis_labels = torch.cat(vis_labels) + vis_weights = torch.cat(vis_weights) if vis_weights else None # calculate vis losses losses = dict() - loss_vis = self.loss_module(vis_pred_outputs, vis_labels) + loss_vis = self.loss_module(vis_pred_outputs, vis_labels, vis_weights) losses.update(loss_vis=loss_vis) # calculate vis accuracy - acc_vis = self.vis_accuracy(vis_pred_outputs, vis_labels) + acc_vis = self.vis_accuracy(vis_pred_outputs, vis_labels, vis_weights) losses.update(acc_vis=acc_vis) # calculate keypoints losses diff --git a/mmpose/models/losses/classification_loss.py b/mmpose/models/losses/classification_loss.py index 4605acabd3..5d2a2c7a58 100644 --- a/mmpose/models/losses/classification_loss.py +++ b/mmpose/models/losses/classification_loss.py @@ -14,15 +14,17 @@ class BCELoss(nn.Module): use_target_weight (bool): Option to use weighted loss. Different joint types may have different target weights. loss_weight (float): Weight of the loss. Default: 1.0. - with_logits (bool): Whether to use BCEWithLogitsLoss. Default: False. + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + before output. Defaults to False. 
""" def __init__(self, use_target_weight=False, loss_weight=1., - with_logits=False): + use_sigmoid=False): super().__init__() - self.criterion = F.binary_cross_entropy if not with_logits\ + self.use_sigmoid = use_sigmoid + self.criterion = F.binary_cross_entropy if use_sigmoid \ else F.binary_cross_entropy_with_logits self.use_target_weight = use_target_weight self.loss_weight = loss_weight diff --git a/mmpose/models/pose_estimators/bottomup.py b/mmpose/models/pose_estimators/bottomup.py index 5400f2478e..e7d2aaef88 100644 --- a/mmpose/models/pose_estimators/bottomup.py +++ b/mmpose/models/pose_estimators/bottomup.py @@ -169,6 +169,9 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, pred_instances.keypoints = pred_instances.keypoints / input_size \ * input_scale + input_center - 0.5 * input_scale + if 'keypoints_visible' not in pred_instances: + pred_instances.keypoints_visible = \ + pred_instances.keypoint_scores data_sample.pred_instances = pred_instances diff --git a/mmpose/models/pose_estimators/topdown.py b/mmpose/models/pose_estimators/topdown.py index 89b332893f..0704627bd5 100644 --- a/mmpose/models/pose_estimators/topdown.py +++ b/mmpose/models/pose_estimators/topdown.py @@ -153,6 +153,9 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, pred_instances.keypoints = pred_instances.keypoints / input_size \ * bbox_scales + bbox_centers - 0.5 * bbox_scales + if 'keypoints_visible' not in pred_instances: + pred_instances.keypoints_visible = \ + pred_instances.keypoint_scores if output_keypoint_indices is not None: # select output keypoints with given indices diff --git a/mmpose/structures/keypoint/transforms.py b/mmpose/structures/keypoint/transforms.py index b50da4f8fe..bd7274dadf 100644 --- a/mmpose/structures/keypoint/transforms.py +++ b/mmpose/structures/keypoint/transforms.py @@ -20,8 +20,8 @@ def flip_keypoints(keypoints: np.ndarray, Args: keypoints (np.ndarray): Keypoints in shape (..., K, D) keypoints_visible (np.ndarray, optional): The visibility of keypoints - in shape (..., K, 1). Set ``None`` if the keypoint visibility is - unavailable + in shape (..., K, 1) or (..., K, 2). Set ``None`` if the keypoint + visibility is unavailable image_size (tuple): The image shape in [w, h] flip_indices (List[int]): The indices of each keypoint's symmetric keypoint @@ -33,11 +33,12 @@ def flip_keypoints(keypoints: np.ndarray, - keypoints_flipped (np.ndarray): Flipped keypoints in shape (..., K, D) - keypoints_visible_flipped (np.ndarray, optional): Flipped keypoints' - visibility in shape (..., K, 1). Return ``None`` if the input - ``keypoints_visible`` is ``None`` + visibility in shape (..., K, 1) or (..., K, 2). 
Return ``None`` if + the input ``keypoints_visible`` is ``None`` """ - assert keypoints.shape[:-1] == keypoints_visible.shape, ( + ndim = keypoints.ndim + assert keypoints.shape[:-1] == keypoints_visible.shape[:ndim - 1], ( f'Mismatched shapes of keypoints {keypoints.shape} and ' f'keypoints_visible {keypoints_visible.shape}') @@ -48,9 +49,10 @@ def flip_keypoints(keypoints: np.ndarray, # swap the symmetric keypoint pairs if direction == 'horizontal' or direction == 'vertical': - keypoints = keypoints[..., flip_indices, :] + keypoints = keypoints.take(flip_indices, axis=ndim - 2) if keypoints_visible is not None: - keypoints_visible = keypoints_visible[..., flip_indices] + keypoints_visible = keypoints_visible.take( + flip_indices, axis=ndim - 2) # flip the keypoints w, h = image_size diff --git a/mmpose/visualization/local_visualizer.py b/mmpose/visualization/local_visualizer.py index 080e628e33..1eb994f03a 100644 --- a/mmpose/visualization/local_visualizer.py +++ b/mmpose/visualization/local_visualizer.py @@ -253,11 +253,6 @@ def _draw_instances_kpts(self, keypoints = instances.get('transformed_keypoints', instances.keypoints) - if 'keypoint_scores' in instances: - scores = instances.keypoint_scores - else: - scores = np.ones(keypoints.shape[:-1]) - if 'keypoints_visible' in instances: keypoints_visible = instances.keypoints_visible else: @@ -265,15 +260,13 @@ def _draw_instances_kpts(self, if skeleton_style == 'openpose': keypoints_info = np.concatenate( - (keypoints, scores[..., None], keypoints_visible[..., - None]), - axis=-1) + (keypoints, keypoints_visible[..., None]), axis=-1) # compute neck joint neck = np.mean(keypoints_info[:, [5, 6]], axis=1) # neck score when visualizing pred - neck[:, 2:4] = np.logical_and( - keypoints_info[:, 5, 2:4] > kpt_thr, - keypoints_info[:, 6, 2:4] > kpt_thr).astype(int) + neck[:, 2:3] = np.logical_and( + keypoints_info[:, 5, 2:3] > kpt_thr, + keypoints_info[:, 6, 2:3] > kpt_thr).astype(int) new_keypoints_info = np.insert( keypoints_info, 17, neck, axis=1) @@ -287,11 +280,10 @@ def _draw_instances_kpts(self, new_keypoints_info[:, mmpose_idx] keypoints_info = new_keypoints_info - keypoints, scores, keypoints_visible = keypoints_info[ - ..., :2], keypoints_info[..., 2], keypoints_info[..., 3] + keypoints, keypoints_visible = keypoints_info[ + ..., :2], keypoints_info[..., 2] - for kpts, score, visible in zip(keypoints, scores, - keypoints_visible): + for kpts, visible in zip(keypoints, keypoints_visible): kpts = np.array(kpts, copy=False) if self.kpt_color is None or isinstance(self.kpt_color, str): @@ -320,17 +312,16 @@ def _draw_instances_kpts(self, for sk_id, sk in enumerate(self.skeleton): pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) - if not (visible[sk[0]] and visible[sk[1]]): - continue if (pos1[0] <= 0 or pos1[0] >= img_w or pos1[1] <= 0 or pos1[1] >= img_h or pos2[0] <= 0 or pos2[0] >= img_w or pos2[1] <= 0 - or pos2[1] >= img_h or score[sk[0]] < kpt_thr - or score[sk[1]] < kpt_thr + or pos2[1] >= img_h or visible[sk[0]] < kpt_thr + or visible[sk[1]] < kpt_thr or link_color[sk_id] is None): # skip the link that should not be drawn continue + X = np.array((pos1[0], pos2[0])) Y = np.array((pos1[1], pos2[1])) color = link_color[sk_id] @@ -339,7 +330,9 @@ def _draw_instances_kpts(self, transparency = self.alpha if self.show_keypoint_weight: transparency *= max( - 0, min(1, 0.5 * (score[sk[0]] + score[sk[1]]))) + 0, + min(1, + 0.5 * (visible[sk[0]] + visible[sk[1]]))) if skeleton_style == 
'openpose': mX = np.mean(X) @@ -365,8 +358,7 @@ def _draw_instances_kpts(self, # draw each point on image for kid, kpt in enumerate(kpts): - if score[kid] < kpt_thr or not visible[ - kid] or kpt_color[kid] is None: + if visible[kid] < kpt_thr or kpt_color[kid] is None: # skip the point that should not be drawn continue @@ -375,7 +367,7 @@ def _draw_instances_kpts(self, color = tuple(int(c) for c in color) transparency = self.alpha if self.show_keypoint_weight: - transparency *= max(0, min(1, score[kid])) + transparency *= max(0, min(1, visible[kid])) self.draw_circles( kpt, radius=np.array([self.radius]), diff --git a/tests/test_datasets/test_transforms/test_converting.py b/tests/test_datasets/test_transforms/test_converting.py index 09f06e1e65..08561b1d0f 100644 --- a/tests/test_datasets/test_transforms/test_converting.py +++ b/tests/test_datasets/test_transforms/test_converting.py @@ -32,8 +32,10 @@ def test_transform(self): self.assertTrue((results['keypoints'][:, target_index] == self.data_info['keypoints'][:, source_index]).all()) + self.assertEqual(results['keypoints_visible'].ndim, 3) + self.assertEqual(results['keypoints_visible'].shape[2], 2) self.assertTrue( - (results['keypoints_visible'][:, target_index] == + (results['keypoints_visible'][:, target_index, 0] == self.data_info['keypoints_visible'][:, source_index]).all()) # 2-to-1 mapping @@ -58,8 +60,10 @@ def test_transform(self): (results['keypoints'][:, target_index] == 0.5 * (self.data_info['keypoints'][:, source_index] + self.data_info['keypoints'][:, source_index2])).all()) + self.assertEqual(results['keypoints_visible'].ndim, 3) + self.assertEqual(results['keypoints_visible'].shape[2], 2) self.assertTrue( - (results['keypoints_visible'][:, target_index] == + (results['keypoints_visible'][:, target_index, 0] == self.data_info['keypoints_visible'][:, source_index] * self.data_info['keypoints_visible'][:, source_index2]).all()) @@ -67,7 +71,9 @@ def test_transform(self): self.assertTrue( (results['keypoints'][:, target_index] == self.data_info['keypoints'][:, source_index]).all()) + self.assertEqual(results['keypoints_visible'].ndim, 3) + self.assertEqual(results['keypoints_visible'].shape[2], 2) self.assertTrue( - (results['keypoints_visible'][:, target_index] == + (results['keypoints_visible'][:, target_index, 0] == self.data_info['keypoints_visible'][:, source_index]).all()) From 3c031adc8397c736db9fce2939352991387837fd Mon Sep 17 00:00:00 2001 From: Tau Date: Thu, 27 Jul 2023 22:21:39 +0800 Subject: [PATCH 31/37] [Enhance] Enhance 3dpose demo and docs (#2578) --- README.md | 8 +-- README_CN.md | 8 +-- demo/body3d_pose_lifter_demo.py | 48 ++++++++++++++- .../advanced_guides/implement_new_models.md | 61 ++++++++++++++++++- docs/en/guide_to_framework.md | 8 +++ docs/src/papers/algorithms/motionbert.md | 30 +++++++++ .../advanced_guides/implement_new_models.md | 59 +++++++++++++++++- docs/zh_cn/guide_to_framework.md | 8 +++ projects/rtmpose/README.md | 15 +++++ projects/rtmpose/README_CN.md | 15 +++++ 10 files changed, 248 insertions(+), 12 deletions(-) create mode 100644 docs/src/papers/algorithms/motionbert.md diff --git a/README.md b/README.md index b250d570b3..028149c244 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ We provided a series of tutorials about the basic usage of MMPose for new users: Results and models are available in the **README.md** of each method's config directory. A summary can be found in the [Model Zoo](https://mmpose.readthedocs.io/en/latest/model_zoo.html) page. -
+
Supported algorithms: - [x] [DeepPose](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/algorithms.html#deeppose-cvpr-2014) (CVPR'2014) @@ -240,7 +240,7 @@ A summary can be found in the [Model Zoo](https://mmpose.readthedocs.io/en/lates
-
+
Supported techniques: - [x] [FPN](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/techniques.html#fpn-cvpr-2017) (CVPR'2017) @@ -255,7 +255,7 @@ A summary can be found in the [Model Zoo](https://mmpose.readthedocs.io/en/lates
-
+
Supported datasets: - [x] [AFLW](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/datasets.html#aflw-iccvw-2011) \[[homepage](https://www.tugraz.at/institute/icg/research/team-bischof/lrs/downloads/aflw/)\] (ICCVW'2011) @@ -294,7 +294,7 @@ A summary can be found in the [Model Zoo](https://mmpose.readthedocs.io/en/lates
-
+
Supported backbones: - [x] [AlexNet](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/backbones.html#alexnet-neurips-2012) (NeurIPS'2012) diff --git a/README_CN.md b/README_CN.md index 48672c2a88..39fd649e3d 100644 --- a/README_CN.md +++ b/README_CN.md @@ -211,7 +211,7 @@ MMPose v1.0.0 是一个重大更新,包括了大量的 API 和配置文件的 各个模型的结果和设置都可以在对应的 config(配置)目录下的 **README.md** 中查看。 整体的概况也可也在 [模型库](https://mmpose.readthedocs.io/zh_CN/latest/model_zoo.html) 页面中查看。 -
+
支持的算法 - [x] [DeepPose](https://mmpose.readthedocs.io/zh_CN/latest/model_zoo_papers/algorithms.html#deeppose-cvpr-2014) (CVPR'2014) @@ -238,7 +238,7 @@ MMPose v1.0.0 是一个重大更新,包括了大量的 API 和配置文件的
-
+
支持的技术 - [x] [FPN](https://mmpose.readthedocs.io/zh_CN/latest/model_zoo_papers/techniques.html#fpn-cvpr-2017) (CVPR'2017) @@ -253,7 +253,7 @@ MMPose v1.0.0 是一个重大更新,包括了大量的 API 和配置文件的
-
+
支持的数据集 - [x] [AFLW](https://mmpose.readthedocs.io/zh_CN/latest/model_zoo_papers/datasets.html#aflw-iccvw-2011) \[[主页](https://www.tugraz.at/institute/icg/research/team-bischof/lrs/downloads/aflw/)\] (ICCVW'2011) @@ -292,7 +292,7 @@ MMPose v1.0.0 是一个重大更新,包括了大量的 API 和配置文件的
-
+
支持的骨干网络 - [x] [AlexNet](https://mmpose.readthedocs.io/zh_CN/latest/model_zoo_papers/backbones.html#alexnet-neurips-2012) (NeurIPS'2012) diff --git a/demo/body3d_pose_lifter_demo.py b/demo/body3d_pose_lifter_demo.py index 7ed63fe3c9..d04fca9f3b 100644 --- a/demo/body3d_pose_lifter_demo.py +++ b/demo/body3d_pose_lifter_demo.py @@ -137,6 +137,45 @@ def process_one_image(args, detector, frame, frame_idx, pose_estimator, visualize_frame, visualizer): """Visualize detected and predicted keypoints of one image. + Pipeline of this function: + + frame + | + V + +-----------------+ + | detector | + +-----------------+ + | det_result + V + +-----------------+ + | pose_estimator | + +-----------------+ + | pose_est_results + V + +--------------------------------------------+ + | convert 2d kpts into pose-lifting format | + +--------------------------------------------+ + | pose_est_results_list + V + +-----------------------+ + | extract_pose_sequence | + +-----------------------+ + | pose_seq_2d + V + +-------------+ + | pose_lifter | + +-------------+ + | pose_lift_results + V + +-----------------+ + | post-processing | + +-----------------+ + | pred_3d_data_samples + V + +------------+ + | visualizer | + +------------+ + Args: args (Argument): Custom command-line arguments. detector (mmdet.BaseDetector): The mmdet detector. @@ -170,10 +209,13 @@ def process_one_image(args, detector, frame, frame_idx, pose_estimator, """ pose_lift_dataset = pose_lifter.cfg.test_dataloader.dataset + # First stage: conduct 2D pose detection in a Topdown manner + # use detector to obtain person bounding boxes det_result = inference_detector(detector, frame) pred_instance = det_result.pred_instances.cpu().numpy() - # First stage: 2D pose detection + # filter out the person instances with category and bbox threshold + # e.g. 0 for person in COCO bboxes = pred_instance.bboxes bboxes = bboxes[np.logical_and(pred_instance.labels == args.det_cat_id, pred_instance.scores > args.bbox_thr)] @@ -190,6 +232,8 @@ def process_one_image(args, detector, frame, frame_idx, pose_estimator, pose_det_dataset = pose_estimator.cfg.test_dataloader.dataset pose_est_results_converted = [] + # convert 2d pose estimation results into the format for pose-lifting + # such as changing the keypoint order, flipping the keypoint, etc. for i, data_sample in enumerate(pose_est_results): pred_instances = data_sample.pred_instances.cpu().numpy() keypoints = pred_instances.keypoints @@ -256,7 +300,7 @@ def process_one_image(args, detector, frame, frame_idx, pose_estimator, seq_len=pose_lift_dataset.get('seq_len', 1), step=pose_lift_dataset.get('seq_step', 1)) - # 2D-to-3D pose lifting + # conduct 2D-to-3D pose lifting norm_pose_2d = not args.disable_norm_pose_2d pose_lift_results = inference_pose_lifter_model( pose_lifter, diff --git a/docs/en/advanced_guides/implement_new_models.md b/docs/en/advanced_guides/implement_new_models.md index ff54e2c5ff..03018de2fb 100644 --- a/docs/en/advanced_guides/implement_new_models.md +++ b/docs/en/advanced_guides/implement_new_models.md @@ -82,7 +82,7 @@ Finally, please remember to import your new prediction head in `[__init__.py](ht ### Head with Keypoints Visibility Prediction -Many models predict keypoint visibility based on confidence in coordinate predictions. However, this approach is suboptimal. 
Our [`VisPredictHead`](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/heads/hybrid_heads/vis_head.py) wrapper enables heads to directly predict keypoint visibility from ground truth training data, improving reliability. To add visibility prediction, wrap your head module with VisPredictHead in the config file. +Many models predict keypoint visibility based on confidence in coordinate predictions. However, this approach is suboptimal. Our [VisPredictHead](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/heads/hybrid_heads/vis_head.py) wrapper enables heads to directly predict keypoint visibility from ground truth training data, improving reliability. To add visibility prediction, wrap your head module with VisPredictHead in the config file. ```python model=dict( @@ -103,3 +103,62 @@ model=dict( ... ) ``` + +To implement such a head module wrapper, we only need to inherit [BaseHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/base_head.py), then pass the pose head configuration in `__init__()` and instantiate it through `MODELS.build()`. As shown below: + +```python +@MODELS.register_module() +class VisPredictHead(BaseHead): + """VisPredictHead must be used together with other heads. It can predict + keypoints coordinates of and their visibility simultaneously. In the + current version, it only supports top-down approaches. + + Args: + pose_cfg (Config): Config to construct keypoints prediction head + loss (Config): Config for visibility loss. Defaults to use + :class:`BCELoss` + use_sigmoid (bool): Whether to use sigmoid activation function + init_cfg (Config, optional): Config to control the initialization. See + :attr:`default_init_cfg` for default settings + """ + + def __init__(self, + pose_cfg: ConfigType, + loss: ConfigType = dict( + type='BCELoss', use_target_weight=False, + use_sigmoid=True), + init_cfg: OptConfigType = None): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = pose_cfg['in_channels'] + if pose_cfg.get('num_joints', None) is not None: + self.out_channels = pose_cfg['num_joints'] + elif pose_cfg.get('out_channels', None) is not None: + self.out_channels = pose_cfg['out_channels'] + else: + raise ValueError('VisPredictHead requires \'num_joints\' or' + ' \'out_channels\' in the pose_cfg.') + + self.loss_module = MODELS.build(loss) + + self.pose_head = MODELS.build(pose_cfg) + self.pose_cfg = pose_cfg + + self.use_sigmoid = loss.get('use_sigmoid', False) + + modules = [ + nn.AdaptiveAvgPool2d(1), + nn.Flatten(), + nn.Linear(self.in_channels, self.out_channels) + ] + if self.use_sigmoid: + modules.append(nn.Sigmoid()) + + self.vis_head = nn.Sequential(*modules) +``` + +Then you can implement other parts of the code as a normal head. diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md index 5d3b5513a6..5f743d5bf7 100644 --- a/docs/en/guide_to_framework.md +++ b/docs/en/guide_to_framework.md @@ -684,3 +684,11 @@ def loss(self, return losses ``` + +```{note} +If you wish to learn more about the implementation of Model, like: +- Head with Keypoints Visibility Prediction +- Pose Lifting Models + +please refer to [Advanced Guides - Implement New Model](./advanced_guides/implement_new_models.md) for more details. 
+``` diff --git a/docs/src/papers/algorithms/motionbert.md b/docs/src/papers/algorithms/motionbert.md new file mode 100644 index 0000000000..9ebe9ae010 --- /dev/null +++ b/docs/src/papers/algorithms/motionbert.md @@ -0,0 +1,30 @@ +# MotionBERT: Unified Pretraining for Human Motion Analysis + + + +
+MotionBERT (ICCV'2023) + +```bibtex + @misc{Zhu_Ma_Liu_Liu_Wu_Wang_2022, + title={Learning Human Motion Representations: A Unified Perspective}, + author={Zhu, Wentao and Ma, Xiaoxuan and Liu, Zhaoyang and Liu, Libin and Wu, Wayne and Wang, Yizhou}, + year={2022}, + month={Oct}, + language={en-US} + } +``` + +
+ +## Abstract + + + +We present MotionBERT, a unified pretraining framework, to tackle different sub-tasks of human motion analysis including 3D pose estimation, skeleton-based action recognition, and mesh recovery. The proposed framework is capable of utilizing all kinds of human motion data resources, including motion capture data and in-the-wild videos. During pretraining, the pretext task requires the motion encoder to recover the underlying 3D motion from noisy partial 2D observations. The pretrained motion representation thus acquires geometric, kinematic, and physical knowledge about human motion and therefore can be easily transferred to multiple downstream tasks. We implement the motion encoder with a novel Dual-stream Spatio-temporal Transformer (DSTformer) neural network. It could capture long-range spatio-temporal relationships among the skeletal joints comprehensively and adaptively, exemplified by the lowest 3D pose estimation error so far when trained from scratch. More importantly, the proposed framework achieves state-of-the-art performance on all three downstream tasks by simply finetuning the pretrained motion encoder with 1-2 linear layers, which demonstrates the versatility of the learned motion representations. + + + +
+ +
diff --git a/docs/zh_cn/advanced_guides/implement_new_models.md b/docs/zh_cn/advanced_guides/implement_new_models.md index 22e866b52b..4f83a4fb8f 100644 --- a/docs/zh_cn/advanced_guides/implement_new_models.md +++ b/docs/zh_cn/advanced_guides/implement_new_models.md @@ -81,7 +81,7 @@ class YourNewHead(BaseHead): ### 关键点可见性预测头部 -许多模型都是通过对关键点坐标预测的置信度来判断关键点的可见性的。然而,这种解决方案并非最优。我们提供了一个叫做 [`VisPredictHead`](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/heads/hybrid_heads/vis_head.py) 的头部模块包装器,使得头部模块能够直接预测关键点的可见性。这个包装器是用训练数据中关键点可见性真值来训练的。因此,其预测会更加可靠。用户可以通过修改配置文件来对自己的头部模块加上这个包装器。下面是一个例子: +许多模型都是通过对关键点坐标预测的置信度来判断关键点的可见性的。然而,这种解决方案并非最优。我们提供了一个叫做 [VisPredictHead](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/models/heads/hybrid_heads/vis_head.py) 的头部模块包装器,使得头部模块能够直接预测关键点的可见性。这个包装器是用训练数据中关键点可见性真值来训练的。因此,其预测会更加可靠。用户可以通过修改配置文件来对自己的头部模块加上这个包装器。下面是一个例子: ```python model=dict( @@ -102,3 +102,60 @@ model=dict( ... ) ``` + +要实现这样一个预测头部模块包装器,我们只需要像定义正常的预测头部一样,继承 [BaseHead](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/base_head.py),然后在 `__init__()` 中传入关键点定位的头部配置,并通过 `MODELS.build()` 进行实例化。如下所示: + +```python +@MODELS.register_module() +class VisPredictHead(BaseHead): + """VisPredictHead must be used together with other heads. It can predict + keypoints coordinates of and their visibility simultaneously. In the + current version, it only supports top-down approaches. + + Args: + pose_cfg (Config): Config to construct keypoints prediction head + loss (Config): Config for visibility loss. Defaults to use + :class:`BCELoss` + use_sigmoid (bool): Whether to use sigmoid activation function + init_cfg (Config, optional): Config to control the initialization. See + :attr:`default_init_cfg` for default settings + """ + + def __init__(self, + pose_cfg: ConfigType, + loss: ConfigType = dict( + type='BCELoss', use_target_weight=False, + use_sigmoid=True), + init_cfg: OptConfigType = None): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = pose_cfg['in_channels'] + if pose_cfg.get('num_joints', None) is not None: + self.out_channels = pose_cfg['num_joints'] + elif pose_cfg.get('out_channels', None) is not None: + self.out_channels = pose_cfg['out_channels'] + else: + raise ValueError('VisPredictHead requires \'num_joints\' or' + ' \'out_channels\' in the pose_cfg.') + + self.loss_module = MODELS.build(loss) + + self.pose_head = MODELS.build(pose_cfg) + self.pose_cfg = pose_cfg + + self.use_sigmoid = loss.get('use_sigmoid', False) + + modules = [ + nn.AdaptiveAvgPool2d(1), + nn.Flatten(), + nn.Linear(self.in_channels, self.out_channels) + ] + if self.use_sigmoid: + modules.append(nn.Sigmoid()) + + self.vis_head = nn.Sequential(*modules) +``` diff --git a/docs/zh_cn/guide_to_framework.md b/docs/zh_cn/guide_to_framework.md index f9f85c8e61..767683196b 100644 --- a/docs/zh_cn/guide_to_framework.md +++ b/docs/zh_cn/guide_to_framework.md @@ -697,3 +697,11 @@ def loss(self, return losses ``` + +```{note} +如果你想了解更多模型实现的内容,如: +- 支持关键点可见性预测的头部 +- 2D-to-3D 模型实现 + +请前往 [【进阶教程 - 实现新模型】](./advanced_guides/implement_new_models.md) +``` diff --git a/projects/rtmpose/README.md b/projects/rtmpose/README.md index 41c77a0731..9eda9866df 100644 --- a/projects/rtmpose/README.md +++ b/projects/rtmpose/README.md @@ -200,6 +200,21 @@ Feel free to join our community group for more help:
+
+Human-Art + +- Details see [Human-Art](https://github.com/IDEA-Research/HumanArt) +- + +| Config | Input Size | AP
(Human-Art GT) | Params
(M) | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | ncnn-FP16-Latency
(ms)
(Snapdragon 865) | Download | +| :-----------------------------------------------------------------------------: | :--------: | :-----------------------: | :----------------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-----------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------: | +| [RTMPose-t\*](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 65.5 | 3.34 | 0.36 | 3.20 | 1.06 | 9.02 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | +| [RTMPose-s\*](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 69.8 | 5.47 | 0.68 | 4.48 | 1.39 | 13.89 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | +| [RTMPose-m\*](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 72.8 | 13.59 | 1.93 | 11.06 | 2.29 | 26.44 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | +| [RTMPose-l\*](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 75.3 | 27.66 | 4.16 | 18.85 | 3.46 | 45.37 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | + +
+ #### 26 Keypoints - Keypoints are defined as [Halpe26](https://github.com/Fang-Haoshu/Halpe-FullBody/). For details please refer to the [meta info](/configs/_base_/datasets/halpe26.py). diff --git a/projects/rtmpose/README_CN.md b/projects/rtmpose/README_CN.md index bf134ab260..aeb79974a5 100644 --- a/projects/rtmpose/README_CN.md +++ b/projects/rtmpose/README_CN.md @@ -191,6 +191,21 @@ RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性
+
+Human-Art + +- Details see [Human-Art](https://github.com/IDEA-Research/HumanArt) +- + +| Config | Input Size | AP
(Human-Art GT) | Params
(M) | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | ncnn-FP16-Latency
(ms)
(Snapdragon 865) | Download | +| :-----------------------------------------------------------------------------: | :--------: | :-----------------------: | :----------------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-----------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------: | +| [RTMPose-t\*](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 65.5 | 3.34 | 0.36 | 3.20 | 1.06 | 9.02 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | +| [RTMPose-s\*](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 69.8 | 5.47 | 0.68 | 4.48 | 1.39 | 13.89 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | +| [RTMPose-m\*](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 72.8 | 13.59 | 1.93 | 11.06 | 2.29 | 26.44 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | +| [RTMPose-l\*](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 75.3 | 27.66 | 4.16 | 18.85 | 3.46 | 45.37 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | + +
+ #### 26 Keypoints - 关键点骨架定义遵循 [Halpe26](https://github.com/Fang-Haoshu/Halpe-FullBody/),详情见 [meta info](/configs/_base_/datasets/halpe26.py)。 From f7e5342c66eb399c8cad903eab55cfd7b50ca99d Mon Sep 17 00:00:00 2001 From: Tau Date: Fri, 28 Jul 2023 13:16:57 +0800 Subject: [PATCH 32/37] [Docs] Enhance Codecs documents (#2580) --- docs/en/advanced_guides/codecs.md | 224 +++++++++++++++++++++++++++ docs/zh_cn/advanced_guides/codecs.md | 224 +++++++++++++++++++++++++++ docs/zh_cn/guide_to_framework.md | 2 +- 3 files changed, 449 insertions(+), 1 deletion(-) diff --git a/docs/en/advanced_guides/codecs.md b/docs/en/advanced_guides/codecs.md index 7c98ba31d9..22a05c3024 100644 --- a/docs/en/advanced_guides/codecs.md +++ b/docs/en/advanced_guides/codecs.md @@ -10,6 +10,8 @@ Here is a diagram to show where the `Codec` is: ![pose_estimator_en](https://github.com/open-mmlab/mmpose/assets/13503330/0764baab-41c7-4a1d-ab64-5d7f9dfc8eec) +## Basic Concepts + A typical codec consists of two parts: - Encoder @@ -225,3 +227,225 @@ test_pipeline = [ dict(type='PackPoseInputs') ] ``` + +## Supported Codecs + +Supported codecs are in [$MMPOSE/mmpose/codecs/](https://github.com/open-mmlab/mmpose/tree/dev-1.x/mmpose/codecs). Here is a list: + +- [RegressionLabel](#RegressionLabel) +- [IntegralRegressionLabel](#IntegralRegressionLabel) +- [MSRAHeatmap](#MSRAHeatmap) +- [UDPHeatmap](#UDPHeatmap) +- [MegviiHeatmap](#MegviiHeatmap) +- [SPR](#SPR) +- [SimCC](#SimCC) +- [DecoupledHeatmap](#DecoupledHeatmap) +- [ImagePoseLifting](#ImagePoseLifting) +- [VideoPoseLifting](#VideoPoseLifting) +- [MotionBERTLabel](#MotionBERTLabel) + +### RegressionLabel + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/regression_label.py#L12) + +The `RegressionLabel` codec is used to generate normalized coordinates as the regression targets. + +**Input** + +- Encoding keypoints from input image space to normalized space. + +**Output** + +- Decoding normalized coordinates from normalized space to input image space. + +Related works: + +- [DeepPose](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#deeppose-cvpr-2014) +- [RLE](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#rle-iccv-2021) + +### IntegralRegressionLabel + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/integral_regression_label.py) + +The `IntegralRegressionLabel` codec is used to generate normalized coordinates as the regression targets. + +**Input** + +- Encoding keypoints from input image space to normalized space, and generate Gaussian heatmaps as well. + +**Output** + +- Decoding normalized coordinates from normalized space to input image space. + +Related works: + +- [IPR](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#ipr-eccv-2018) +- [DSNT](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#dsnt-2018) +- [Debias IPR](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#debias-ipr-iccv-2021) + +### MSRAHeatmap + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/msra_heatmap.py) + +The `MSRAHeatmap` codec is used to generate Gaussian heatmaps as the targets. + +**Input** + +- Encoding keypoints from input image space to output space as 2D Gaussian heatmaps. + +**Output** + +- Decoding 2D Gaussian heatmaps from output space to input image space as coordinates. 
+ +Related works: + +- [SimpleBaseline2D](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#simplebaseline2d-eccv-2018) +- [CPM](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#cpm-cvpr-2016) +- [HRNet](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#hrnet-cvpr-2019) +- [DARK](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#darkpose-cvpr-2020) + +### UDPHeatmap + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/udp_heatmap.py) + +The `UDPHeatmap` codec is used to generate Gaussian heatmaps as the targets. + +**Input** + +- Encoding keypoints from input image space to output space as 2D Gaussian heatmaps. + +**Output** + +- Decoding 2D Gaussian heatmaps from output space to input image space as coordinates. + +Related works: + +- [UDP](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#udp-cvpr-2020) + +### MegviiHeatmap + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/megvii_heatmap.py) + +The `MegviiHeatmap` codec is used to generate Gaussian heatmaps as the targets, which is usually used in Megvii's works. + +**Input** + +- Encoding keypoints from input image space to output space as 2D Gaussian heatmaps. + +**Output** + +- Decoding 2D Gaussian heatmaps from output space to input image space as coordinates. + +Related works: + +- [MSPN](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#mspn-arxiv-2019) +- [RSN](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#rsn-eccv-2020) + +### SPR + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/spr.py) + +The `SPR` codec is used to generate Gaussian heatmaps of instances' center, and offsets as the targets. + +**Input** + +- Encoding keypoints from input image space to output space as 2D Gaussian heatmaps and offsets. + +**Output** + +- Decoding 2D Gaussian heatmaps and offsets from output space to input image space as coordinates. + +Related works: + +- [DEKR](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#dekr-cvpr-2021) + +### SimCC + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/simcc_label.py) + +The `SimCC` codec is used to generate 1D Gaussian representations as the targets. + +**Input** + +- Encoding keypoints from input image space to output space as 1D Gaussian representations. + +**Output** + +- Decoding 1D Gaussian representations from output space to input image space as coordinates. + +Related works: + +- [SimCC](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#simcc-eccv-2022) +- [RTMPose](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#rtmpose-arxiv-2023) + +### DecoupledHeatmap + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/decoupled_heatmap.py) + +The `DecoupledHeatmap` codec is used to generate Gaussian heatmaps as the targets. + +**Input** + +- Encoding human center points and keypoints from input image space to output space as 2D Gaussian heatmaps. + +**Output** + +- Decoding 2D Gaussian heatmaps from output space to input image space as coordinates. 
+ +Related works: + +- [CID](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#cid-cvpr-2022) + +### ImagePoseLifting + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/image_pose_lifting.py) + +The `ImagePoseLifting` codec is used for image 2D-to-3D pose lifting. + +**Input** + +- Encoding 2d keypoints from input image space to normalized 3d space. + +**Output** + +- Decoding 3d keypoints from normalized space to input image space. + +Related works: + +- [SimpleBaseline3D](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#simplebaseline3d-iccv-2017) + +### VideoPoseLifting + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/video_pose_lifting.py) + +The `VideoPoseLifting` codec is used for video 2D-to-3D pose lifting. + +**Input** + +- Encoding 2d keypoints from input image space to normalized 3d space. + +**Output** + +- Decoding 3d keypoints from normalized space to input image space. + +Related works: + +- [VideoPose3D](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#videopose3d-cvpr-2019) + +### MotionBERTLabel + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/motionbert_label.py) + +The `MotionBERTLabel` codec is used for video 2D-to-3D pose lifting. + +**Input** + +- Encoding 2d keypoints from input image space to normalized 3d space. + +**Output** + +- Decoding 3d keypoints from normalized space to input image space. + +Related works: + +- [MotionBERT](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo/body_3d_keypoint.html#pose-lift-motionbert-on-h36m) diff --git a/docs/zh_cn/advanced_guides/codecs.md b/docs/zh_cn/advanced_guides/codecs.md index 60c588a239..83d7361b20 100644 --- a/docs/zh_cn/advanced_guides/codecs.md +++ b/docs/zh_cn/advanced_guides/codecs.md @@ -10,6 +10,8 @@ MMPose 1.0 中引入了新模块 **编解码器(Codec)** ,将关键点数 ![pose_estimator_cn](https://github.com/open-mmlab/mmpose/assets/13503330/0c048f66-b889-4268-937f-71b8753b505f) +## 基本概念 + 一个编解码器主要包含两个部分: - 编码器 @@ -225,3 +227,225 @@ test_pipeline = [ dict(type='PackPoseInputs') ] ``` + +## 已支持编解码器列表 + +编解码器相关的代码位于 [$MMPOSE/mmpose/codecs/](https://github.com/open-mmlab/mmpose/tree/dev-1.x/mmpose/codecs)。目前 MMPose 已支持的编解码器如下所示: + +- [RegressionLabel](#RegressionLabel) +- [IntegralRegressionLabel](#IntegralRegressionLabel) +- [MSRAHeatmap](#MSRAHeatmap) +- [UDPHeatmap](#UDPHeatmap) +- [MegviiHeatmap](#MegviiHeatmap) +- [SPR](#SPR) +- [SimCC](#SimCC) +- [DecoupledHeatmap](#DecoupledHeatmap) +- [ImagePoseLifting](#ImagePoseLifting) +- [VideoPoseLifting](#VideoPoseLifting) +- [MotionBERTLabel](#MotionBERTLabel) + +### RegressionLabel + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/regression_label.py#L12) + +RegressionLabel 编解码器主要用于 Regression-based 方法,适用于直接把坐标值作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为**归一化**的坐标值,用于训练目标的生成。 + +**输出:** + +- 将模型输出的归一化坐标值解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [DeepPose](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#deeppose-cvpr-2014) +- [RLE](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#rle-iccv-2021) + +### IntegralRegressionLabel + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/integral_regression_label.py) + +IntegralRegressionLabel 编解码器主要用于 Integral Regression-based 方法,适用于把坐标值作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为**归一化**的坐标值,用于训练目标的生成。 + +**输出:** + +- 将模型输出的归一化坐标值解码为**输入图片尺度**的坐标值。 + 
+常见的使用此编解码器的算法有: + +- [IPR](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#ipr-eccv-2018) +- [DSNT](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#dsnt-2018) +- [Debias IPR](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#debias-ipr-iccv-2021) + +### MSRAHeatmap + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/msra_heatmap.py) + +MSRAHeatmap 编解码器主要用于 Heatmap-based 方法,适用于把高斯热图作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为 2D 离散高斯分布,用于训练目标的生成。 + +**输出:** + +- 将模型输出的 2D 高斯分布解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [SimpleBaseline2D](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#simplebaseline2d-eccv-2018) +- [CPM](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#cpm-cvpr-2016) +- [HRNet](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#hrnet-cvpr-2019) +- [DARK](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#darkpose-cvpr-2020) + +### UDPHeatmap + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/udp_heatmap.py) + +UDPHeatmap 编解码器主要用于 Heatmap-based 方法,适用于把高斯热图作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为 2D 离散高斯分布,用于训练目标的生成。 + +**输出:** + +- 将模型输出的 2D 高斯分布解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [UDP](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#udp-cvpr-2020) + +### MegviiHeatmap + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/megvii_heatmap.py) + +MegviiHeatmap 编解码器主要用于 Megvii 提出的 Heatmap-based 方法,适用于把高斯热图作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为 2D 离散高斯分布,用于训练目标的生成。 + +**输出:** + +- 将模型输出的 2D 高斯分布解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [MSPN](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#mspn-arxiv-2019) +- [RSN](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#rsn-eccv-2020) + +### SPR + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/spr.py) + +SPR 编解码器主要用于 DEKR 方法,适用于同时使用中心 Heatmap 和偏移坐标值作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的中心关键点坐标值编码为 2D 离散高斯分布,以及相对于中心的偏移,用于训练目标的生成。 + +**输出:** + +- 将模型输出的 2D 高斯分布与偏移进行组合,解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [DEKR](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#dekr-cvpr-2021) + +### SimCC + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/simcc_label.py) + +SimCC 编解码器主要用于 SimCC-based 方法,适用于两个 1D 离散分布表征的 x 和 y 坐标作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为水平和竖直方向 1D 离散分布,用于训练目标的生成。 + +**输出:** + +- 将模型输出的 1D 离散分布解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [SimCC](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#simcc-eccv-2022) +- [RTMPose](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#rtmpose-arxiv-2023) + +### DecoupledHeatmap + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/decoupled_heatmap.py) + +DecoupledHeatmap 编解码器主要用于 CID 方法,适用于把高斯热图作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的人体中心坐标值和关键点坐标值编码为 2D 离散高斯分布,用于训练目标的生成。 + +**输出:** + +- 将模型输出的人体中心与关键点 2D 高斯分布解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [CID](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#cid-cvpr-2022) + +### ImagePoseLifting + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/image_pose_lifting.py) + +ImagePoseLifting 编解码器主要用于 2D-to-3D pose lifting 方法,适用于把单张图片的 2D 
坐标值作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为 3D 坐标空间归一化的坐标值,用于训练目标的生成。 + +**输出:** + +- 将模型输出的 3D 坐标空间归一化的坐标值解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [SimpleBaseline3D](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#simplebaseline3d-iccv-2017) + +### VideoPoseLifting + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/video_pose_lifting.py) + +VideoPoseLifting 编解码器主要用于 2D-to-3D pose lifting 方法,适用于把视频中一组 2D 坐标值作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为 3D 坐标空间归一化的坐标值,用于训练目标的生成。 + +**输出:** + +- 将模型输出的 3D 坐标空间归一化的坐标值解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [VideoPose3D](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo_papers/algorithms.html#videopose3d-cvpr-2019) + +### MotionBERTLabel + +[\[Github\]](https://github.com/open-mmlab/mmpose/blob/dev-1.x/mmpose/codecs/motionbert_label.py) + +MotionBERTLabel 编解码器主要用于 2D-to-3D pose lifting 方法,适用于把视频中一组 2D 坐标值作为训练目标的场景。 + +**输入:** + +- 将**输入图片尺度**的坐标值编码为 3D 坐标空间归一化的坐标值,用于训练目标的生成。 + +**输出:** + +- 将模型输出的 3D 坐标空间归一化的坐标值解码为**输入图片尺度**的坐标值。 + +常见的使用此编解码器的算法有: + +- [MotionBERT](https://mmpose.readthedocs.io/zh_CN/dev-1.x/model_zoo/body_3d_keypoint.html#pose-lift-motionbert-on-h36m) diff --git a/docs/zh_cn/guide_to_framework.md b/docs/zh_cn/guide_to_framework.md index 767683196b..3b85a9fc6b 100644 --- a/docs/zh_cn/guide_to_framework.md +++ b/docs/zh_cn/guide_to_framework.md @@ -167,7 +167,7 @@ dataset_info = dict( - `joint_weights`:每个关键点的权重,用于损失函数计算。 - `sigma`:标准差,用于计算 OKS 分数,详细信息请参考 [keypoints-eval](https://cocodataset.org/#keypoints-eval)。 -在模型配置文件中,你需要为自定义数据集指定对应的元信息配置文件。假如该元信息配置文件路径为 `$MMPOSE/configs/\_base\_/datasets/custom.py`,指定方式如下: +在模型配置文件中,你需要为自定义数据集指定对应的元信息配置文件。假如该元信息配置文件路径为 `$MMPOSE/configs/_base_/datasets/custom.py`,指定方式如下: ```python # dataset and dataloader settings From 0afcba1b991fd79c84333d6be37932c119f6f455 Mon Sep 17 00:00:00 2001 From: Tau Date: Fri, 28 Jul 2023 20:08:33 +0800 Subject: [PATCH 33/37] [Feature] Add DWPose distilled WholeBody RTMPose models (#2581) --- projects/rtmpose/README.md | 28 ++- projects/rtmpose/README_CN.md | 28 ++- ...ose-s_8xb64-270e_coco-wholebody-256x192.py | 233 ++++++++++++++++++ ...ose-t_8xb64-270e_coco-wholebody-256x192.py | 233 ++++++++++++++++++ 4 files changed, 520 insertions(+), 2 deletions(-) create mode 100644 projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-s_8xb64-270e_coco-wholebody-256x192.py create mode 100644 projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-t_8xb64-270e_coco-wholebody-256x192.py diff --git a/projects/rtmpose/README.md b/projects/rtmpose/README.md index 9eda9866df..c52f4a6970 100644 --- a/projects/rtmpose/README.md +++ b/projects/rtmpose/README.md @@ -44,6 +44,9 @@ ______________________________________________________________________ ## 🥳 🚀 What's New [🔝](#-table-of-contents) +- Jul. 2023: + - Support 17-keypoint Body models trained on Human-Art. + - Support 133-keypoint WholeBody models trained on combined datasets. - Jun. 2023: - Release 26-keypoint Body models trained on combined datasets. - May. 2023: @@ -203,7 +206,7 @@ Feel free to join our community group for more help:
Human-Art -- Details see [Human-Art](https://github.com/IDEA-Research/HumanArt) +- RTMPose for Human-Centric Artificial Scenes is supported by [Human-Art](https://github.com/IDEA-Research/HumanArt) - | Config | Input Size | AP
(Human-Art GT) | Params
(M) | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | ncnn-FP16-Latency
(ms)
(Snapdragon 865) | Download | @@ -248,6 +251,9 @@ For more details, please refer to [GroupFisher Pruning for RTMPose](./rtmpose/pr - Keypoints are defined as [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/). For details please refer to the [meta info](/configs/_base_/datasets/coco_wholebody.py). - +
+COCO-WholeBody + | Config | Input Size | Whole AP | Whole AR | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | Download | | :------------------------------ | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-------------------------------: | | [RTMPose-m](./rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 58.2 | 67.4 | 2.22 | 13.50 | 4.00 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth) | @@ -255,6 +261,26 @@ For more details, please refer to [GroupFisher Pruning for RTMPose](./rtmpose/pr | [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 64.8 | 73.0 | 10.07 | 44.58 | 7.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth) | | [RTMPose-x](./rtmpose/wholebody_2d_keypoint/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 65.3 | 73.3 | 18.1 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-x_simcc-coco-wholebody_pt-body7_270e-384x288-401dfc90_20230629.pth) | +
+ +
+DWPose + +- DWPose Models are supported by [DWPose](https://github.com/IDEA-Research/DWPose) +- Models are trained and distilled on: + - [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/) + - [UBody](https://github.com/IDEA-Research/OSX) + +| Config | Input Size | Whole AP | Whole AR | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | Download | +| :------------------------------ | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-------------------------------: | +| [RTMPose-t](./rtmpose/wholebody_2d_keypoint/rtmpose-t_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 48.5 | 58.4 | 2.22 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-ucoco_dw-ucoco_270e-256x192-dcf277bf_20230728.pth) | +| [RTMPose-s](./rtmpose/wholebody_2d_keypoint/rtmpose-s_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 53.8 | 63.2 | 4.52 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-ucoco_dw-ucoco_270e-256x192-3fd922c8_20230728.pth) | +| [RTMPose-m](./rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 60.6 | 69.5 | 2.22 | 13.50 | 4.00 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ucoco_dw-ucoco_270e-256x192-c8b76419_20230728.pth) | +| [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 63.1 | 71.7 | 4.52 | 23.41 | 5.67 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth) | +| [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 66.5 | 74.3 | 10.07 | 44.58 | 7.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-384x288-2438fd99_20230728.pth) | + +
+ ### Animal 2d (17 Keypoints) - Keypoints are defined as [AP-10K](https://github.com/AlexTheBad/AP-10K/). For details please refer to the [meta info](/configs/_base_/datasets/ap10k.py). diff --git a/projects/rtmpose/README_CN.md b/projects/rtmpose/README_CN.md index aeb79974a5..bed9809020 100644 --- a/projects/rtmpose/README_CN.md +++ b/projects/rtmpose/README_CN.md @@ -40,6 +40,9 @@ ______________________________________________________________________ ## 🥳 最新进展 [🔝](#-table-of-contents) +- 2023 年 7 月: + - 支持面向艺术图片人体姿态估计的 17 点 Body 模型。 + - 支持混合数据集蒸馏训练的 133 点 WholeBody 模型。 - 2023 年 6 月: - 发布混合数据集训练的 26 点 Body 模型。 - 2023 年 5 月: @@ -194,7 +197,7 @@ RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性
Human-Art -- Details see [Human-Art](https://github.com/IDEA-Research/HumanArt) +- 面向艺术图片的人体姿态估计 RTMPose 模型由 [Human-Art](https://github.com/IDEA-Research/HumanArt) 提供。 - | Config | Input Size | AP
(Human-Art GT) | Params
(M) | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | ncnn-FP16-Latency
(ms)
(Snapdragon 865) | Download | @@ -239,6 +242,9 @@ RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性 - 关键点骨架定义遵循 [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/),详情见 [meta info](/configs/_base_/datasets/coco_wholebody.py)。 - +
+COCO-WholeBody + | Config | Input Size | Whole AP | Whole AR | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | Download | | :------------------------------ | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-------------------------------: | | [RTMPose-m](./rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 58.2 | 67.4 | 2.22 | 13.50 | 4.00 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth) | @@ -246,6 +252,26 @@ RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性 | [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 64.8 | 73.0 | 10.07 | 44.58 | 7.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth) | | [RTMPose-x](./rtmpose/wholebody_2d_keypoint/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 65.3 | 73.3 | 18.1 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-x_simcc-coco-wholebody_pt-body7_270e-384x288-401dfc90_20230629.pth) | +
+ +
+DWPose + +- DWPose 模型由 [DWPose](https://github.com/IDEA-Research/DWPose) 项目提供 +- 模型在以下数据集上训练并蒸馏: + - [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/) + - [UBody](https://github.com/IDEA-Research/OSX) + +| Config | Input Size | Whole AP | Whole AR | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | Download | +| :------------------------------ | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :-------------------------------: | +| [RTMPose-t](./rtmpose/wholebody_2d_keypoint/rtmpose-t_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 48.5 | 58.4 | 2.22 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-ucoco_dw-ucoco_270e-256x192-dcf277bf_20230728.pth) | +| [RTMPose-s](./rtmpose/wholebody_2d_keypoint/rtmpose-s_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 53.8 | 63.2 | 4.52 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-ucoco_dw-ucoco_270e-256x192-3fd922c8_20230728.pth) | +| [RTMPose-m](./rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 60.6 | 69.5 | 2.22 | 13.50 | 4.00 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ucoco_dw-ucoco_270e-256x192-c8b76419_20230728.pth) | +| [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 63.1 | 71.7 | 4.52 | 23.41 | 5.67 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth) | +| [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 66.5 | 74.3 | 10.07 | 44.58 | 7.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-384x288-2438fd99_20230728.pth) | + +
+ ### 动物 2d 关键点 (17 Keypoints) - 关键点骨架定义遵循 [AP-10K](https://github.com/AlexTheBad/AP-10K/),详情见 [meta info](/configs/_base_/datasets/ap10k.py)。 diff --git a/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-s_8xb64-270e_coco-wholebody-256x192.py b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-s_8xb64-270e_coco-wholebody-256x192.py new file mode 100644 index 0000000000..7afb493d6e --- /dev/null +++ b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-s_8xb64-270e_coco-wholebody-256x192.py @@ -0,0 +1,233 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (192, 256) + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=num_keypoints, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + 
dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-t_8xb64-270e_coco-wholebody-256x192.py b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-t_8xb64-270e_coco-wholebody-256x192.py new file mode 100644 index 0000000000..3ea3de877b --- /dev/null +++ b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-t_8xb64-270e_coco-wholebody-256x192.py @@ -0,0 +1,233 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (192, 256) + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), 
+ paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=num_keypoints, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + 
max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator From 79f1ec585b805af213534653d9d338ea99a87782 Mon Sep 17 00:00:00 2001 From: Tau Date: Sat, 29 Jul 2023 00:45:59 +0800 Subject: [PATCH 34/37] [Docs] Add deployment docs (#2582) --- docs/en/advanced_guides/how_to_deploy.md | 3 - .../advanced_guides/implement_new_models.md | 4 +- docs/en/advanced_guides/model_analysis.md | 4 +- docs/en/index.rst | 2 +- docs/en/user_guides/how_to_deploy.md | 294 ++++++++++++++++++ docs/zh_cn/advanced_guides/how_to_deploy.md | 3 - .../advanced_guides/implement_new_models.md | 2 + docs/zh_cn/advanced_guides/model_analysis.md | 6 +- docs/zh_cn/index.rst | 2 +- docs/zh_cn/user_guides/how_to_deploy.md | 292 +++++++++++++++++ 10 files changed, 597 insertions(+), 15 deletions(-) delete mode 100644 docs/en/advanced_guides/how_to_deploy.md create mode 100644 docs/en/user_guides/how_to_deploy.md delete mode 100644 docs/zh_cn/advanced_guides/how_to_deploy.md create mode 100644 docs/zh_cn/user_guides/how_to_deploy.md diff --git a/docs/en/advanced_guides/how_to_deploy.md b/docs/en/advanced_guides/how_to_deploy.md deleted file mode 100644 index b4fead876c..0000000000 --- a/docs/en/advanced_guides/how_to_deploy.md +++ /dev/null @@ -1,3 +0,0 @@ -# How to Deploy MMPose Models - -Coming soon. diff --git a/docs/en/advanced_guides/implement_new_models.md b/docs/en/advanced_guides/implement_new_models.md index 03018de2fb..7e73cfcbf4 100644 --- a/docs/en/advanced_guides/implement_new_models.md +++ b/docs/en/advanced_guides/implement_new_models.md @@ -58,7 +58,7 @@ from .base_backbone import BaseBackbone class YourNewBackbone(BaseBackbone): ``` -Finally, please remember to import your new backbone network in `[__init__.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/backbones/__init__.py)` . 
+Finally, please remember to import your new backbone network in [\_\_init\_\_.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/backbones/__init__.py) . ## Heads @@ -78,7 +78,7 @@ from ..base_head import BaseHead class YourNewHead(BaseHead): ``` -Finally, please remember to import your new prediction head in `[__init__.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/__init__.py)` . +Finally, please remember to import your new prediction head in [\_\_init\_\_.py](https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/heads/__init__.py). ### Head with Keypoints Visibility Prediction diff --git a/docs/en/advanced_guides/model_analysis.md b/docs/en/advanced_guides/model_analysis.md index e10bb634a6..2050c01a1a 100644 --- a/docs/en/advanced_guides/model_analysis.md +++ b/docs/en/advanced_guides/model_analysis.md @@ -2,7 +2,7 @@ ## Get Model Params & FLOPs -MMPose provides `tools/analysis_tools/get_flops.py` to get model parameters and FLOPs. +MMPose provides [tools/analysis_tools/get_flops.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/tools/analysis_tools/get_flops.py) to get model parameters and FLOPs. ```shell python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] [--cfg-options ${CFG_OPTIONS}] @@ -42,7 +42,7 @@ This tool is still experimental and we do not guarantee that the number is absol ## Log Analysis -MMPose provides `tools/analysis_tools/analyze_logs.py` to analyze the training log. The log file can be either a json file or a text file. The json file is recommended, because it is more convenient to parse and visualize. +MMPose provides [tools/analysis_tools/analyze_logs.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/tools/analysis_tools/analyze_logs.py) to analyze the training log. The log file can be either a json file or a text file. The json file is recommended, because it is more convenient to parse and visualize. Currently, the following functions are supported: diff --git a/docs/en/index.rst b/docs/en/index.rst index 044b54be0f..5cfc1aa451 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -24,6 +24,7 @@ You can change the documentation language at the lower-left corner of the page. user_guides/configs.md user_guides/prepare_datasets.md user_guides/train_and_test.md + user_guides/how_to_deploy.md .. toctree:: :maxdepth: 1 @@ -36,7 +37,6 @@ You can change the documentation language at the lower-left corner of the page. advanced_guides/customize_transforms.md advanced_guides/customize_optimizer.md advanced_guides/customize_logging.md - advanced_guides/how_to_deploy.md advanced_guides/model_analysis.md .. toctree:: diff --git a/docs/en/user_guides/how_to_deploy.md b/docs/en/user_guides/how_to_deploy.md new file mode 100644 index 0000000000..4c097f47ef --- /dev/null +++ b/docs/en/user_guides/how_to_deploy.md @@ -0,0 +1,294 @@ +# Publish Model and Deployment + +This chapter will introduce how to export and deploy models trained with MMPose. 
It includes the following sections:
+
+- [Publish Model](#publish-model)
+- [Deployment with MMDeploy](#deployment-with-mmdeploy)
+  - [Introduction to MMDeploy](#introduction-to-mmdeploy)
+  - [Supported Models](#supported-models)
+  - [Installation](#installation)
+  - [Model Conversion](#model-conversion)
+    - [How to Find the Deployment Configuration File for an MMPose Model](#how-to-find-the-deployment-configuration-file-for-an-mmpose-model)
+    - [RTMPose Model Export Example](#rtmpose-model-export-example)
+      - [ONNX](#onnx)
+      - [TensorRT](#tensorrt)
+    - [Advanced Settings](#advanced-settings)
+      - [Model Profiling](#model-profiling)
+      - [Accuracy Validation](#accuracy-validation)
+
+## Publish Model
+
+By default, the checkpoint file saved during MMPose training contains all the information about the model, including the model structure, weights, optimizer states, etc. This information is redundant for model deployment. Therefore, we need to simplify the checkpoint before deployment. The simplified `.pth` file can even be less than half the size of the original.
+
+MMPose provides the [tools/misc/publish_model.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/tools/misc/publish_model.py) script for model simplification, which can be used as follows:
+
+```shell
+python tools/misc/publish_model.py ${IN_FILE} ${OUT_FILE}
+```
+
+For example:
+
+```shell
+python tools/misc/publish_model.py ./epoch_10.pth ./epoch_10_publish.pth
+```
+
+The script will automatically simplify the model, save the simplified model to the specified path, and append a hash code and the date to the filename, for example, `./epoch_10_publish-21815b2c_20230726.pth`.
+
+## Deployment with MMDeploy
+
+### Introduction to MMDeploy
+
+MMDeploy is the OpenMMLab model deployment toolbox, providing a unified deployment experience for various algorithm libraries. With MMDeploy, developers can easily generate SDKs for MMPose models tailored to specific hardware, saving a lot of adaptation time.
+
+- You can directly download SDK versions of models (ONNX, TensorRT, ncnn, etc.) from the [OpenMMLab Deploee](https://platform.openmmlab.com/deploee).
+- We also support [Online Model Conversion](https://platform.openmmlab.com/deploee/task-convert-list), so you don't need to install MMDeploy locally.
+
+For more information and usage guidelines, see the [MMDeploy documentation](https://mmdeploy.readthedocs.io/en/latest/get_started.html).
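+
+Before handing a checkpoint to MMDeploy, it can be worth double-checking that the published file from the [Publish Model](#publish-model) step above really has been slimmed down to inference weights only. The following is a minimal sketch of such a check, assuming the file names from the example above and that PyTorch is available in your environment:
+
+```shell
+# Compare the sizes of the original and the published checkpoints.
+ls -lh ./epoch_10.pth ./epoch_10_publish-*.pth
+
+# List the top-level keys of the published checkpoint; typically only entries
+# such as 'meta' and 'state_dict' remain after publishing, with the optimizer
+# states removed.
+python -c "import torch; print(list(torch.load('./epoch_10_publish-21815b2c_20230726.pth', map_location='cpu').keys()))"
+```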
+ +### Supported Models + +| Model | Task | ONNX Runtime | TensorRT | ncnn | PPLNN | OpenVINO | CoreML | TorchScript | +| :-------------------------------------------------------------------------------------------------------- | :------------ | :----------: | :------: | :--: | :---: | :------: | :----: | :---------: | +| [HRNet](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/backbones.html#hrnet-cvpr-2019) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [MSPN](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/backbones.html#mspn-arxiv-2019) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [LiteHRNet](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/backbones.html#litehrnet-cvpr-2021) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [Hourglass](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/algorithms.html#hourglass-eccv-2016) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [SimCC](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/algorithms.html#simcc-eccv-2022) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [RTMPose](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmpose) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [YoloX-Pose](https://github.com/open-mmlab/mmpose/tree/main/projects/yolox_pose) | PoseDetection | Y | Y | N | N | Y | Y | Y | + +### Installation + +Before starting the deployment, you need to make sure that MMPose, MMDetection, and MMDeploy are correctly installed. Please follow the installation instructions below: + +- [Installation of MMPose and MMDetection](../installation.md) +- [Installation of MMDeploy](https://mmdeploy.readthedocs.io/en/latest/04-supported-codebases/mmpose.html) + +Depending on the backend you choose for deployment, some backends require **compilation of custom operators** supported by MMDeploy. Please refer to the corresponding documentation to ensure that the environment is set up correctly: + +- [ONNX](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/onnxruntime.html) +- [TensorRT](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/tensorrt.html) +- [OpenVINO](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/openvino.html) +- [ncnn](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/ncnn.html) +- [TorchScript](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/torchscript.html) +- [More](https://github.com/open-mmlab/mmdeploy/tree/main/docs/en/05-supported-backends) + +### Model Conversion + +After completing the installation, you can start model deployment. You can use the provided [tools/deploy.py](https://github.com/open-mmlab/mmdeploy/blob/main/tools/deploy.py) script in MMDeploy to easily convert MMPose models to different deployment backends. + +Here's how you can use it: + +```shell +python ./tools/deploy.py \ + ${DEPLOY_CFG_PATH} \ + ${MODEL_CFG_PATH} \ + ${MODEL_CHECKPOINT_PATH} \ + ${INPUT_IMG} \ + --test-img ${TEST_IMG} \ + --work-dir ${WORK_DIR} \ + --calib-dataset-cfg ${CALIB_DATA_CFG} \ + --device ${DEVICE} \ + --log-level INFO \ + --show \ + --dump-info +``` + +Parameter descriptions: + +- `deploy_cfg`: Deployment configuration specific to mmdeploy, including inference framework type, quantization, and whether the input shape is dynamic or static. The configuration files may have reference relationships, and `configs/mmpretrain/classification_ncnn_static.py` is an example. 
+
+- `model_cfg`: Model configuration from the algorithm library (MMPose in this case), e.g., `projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py`, independent of the mmdeploy path.
+
+- `checkpoint`: Path to the torch model. It can be a local file path or a download link (e.g., `http/https`).
+
+- `img`: Path to the test image or point cloud file used for model conversion.
+
+- `--test-img`: Path to the image file used to test the model. Default is `None`.
+
+- `--work-dir`: Working directory to save logs and model files.
+
+- `--calib-dataset-cfg`: This parameter only takes effect in `int8` mode and specifies the calibration dataset configuration file. If not provided in `int8` mode, the script will automatically use the 'val' dataset from the model configuration file for calibration.
+
+- `--device`: Device used for model conversion. Default is `cpu`; for TensorRT, you can use `cuda:0`, for example.
+
+- `--log-level`: Set the log level, with options including 'CRITICAL', 'FATAL', 'ERROR', 'WARN', 'WARNING', 'INFO', 'DEBUG', and 'NOTSET'. Default is `INFO`.
+
+- `--show`: Whether to display the detection results.
+
+- `--dump-info`: Whether to output SDK information.
+
+#### How to Find the Deployment Configuration File for an MMPose Model
+
+1. All deployment configuration files related to MMPose are stored in the [configs/mmpose/](https://github.com/open-mmlab/mmdeploy/tree/main/configs/mmpose) directory.
+2. The naming convention for deployment configuration files is `{Task}_{Algorithm}_{Backend}_{Dynamic/Static}_{Input Size}`.
+
+#### RTMPose Model Export Example
+
+In this section, we demonstrate how to export the RTMPose model to ONNX and TensorRT formats. For more information, refer to the [MMDeploy documentation](https://mmdeploy.readthedocs.io/en/latest/02-how-to-run/convert_model.html).
+
+- ONNX Configuration
+
+  - [pose-detection_simcc_onnxruntime_dynamic.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py)
+
+- TensorRT Configuration
+
+  - [pose-detection_simcc_tensorrt_dynamic-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py)
+
+- More
+
+  | Backend   | Config                                                                                                                                                                |
+  | :-------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+  | ncnn-fp16 | [pose-detection_simcc_ncnn-fp16_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_ncnn-fp16_static-256x192.py) |
+  | CoreML    | [pose-detection_simcc_coreml_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_coreml_static-256x192.py)       |
+  | OpenVINO  | [pose-detection_simcc_openvino_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_openvino_static-256x192.py)   |
+  | RKNN      | [pose-detection_simcc_rknn-fp16_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_rknn-fp16_static-256x192.py) |
+
+If you need to modify the deployment configuration, please refer to the [MMDeploy config tutorial](https://mmdeploy.readthedocs.io/en/latest/02-how-to-run/write_config.html).
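+
+For orientation, a deployment config is itself a small Python file assembled from a few standard sections. The sketch below shows the typical shape of the SimCC/ONNX Runtime case; the field values are illustrative, so treat it as a reading aid rather than a drop-in replacement for the official configs listed above:
+
+```python
+# Illustrative sketch of an MMDeploy deployment config for SimCC-style pose models.
+onnx_config = dict(
+    type='onnx',
+    save_file='end2end.onnx',
+    input_names=['input'],
+    output_names=['simcc_x', 'simcc_y'],   # SimCC heads output x/y classification vectors
+    dynamic_axes={'input': {0: 'batch'}})  # allow a dynamic batch dimension
+codebase_config = dict(type='mmpose', task='PoseDetection')
+backend_config = dict(type='onnxruntime')
+```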
+ +The file structure used in this tutorial is as follows: + +```shell +|----mmdeploy +|----mmpose +``` + +##### ONNX + +Run the following command: + +```shell +# Go to the mmdeploy directory +cd ${PATH_TO_MMDEPLOY} + +# Convert RTMPose +# The input model path can be a local path or a download link. +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + ../mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth \ + demo/resources/human-pose.jpg \ + --work-dir rtmpose-ort/rtmpose-m \ + --device cpu \ + --show \ + --dump-info # Export SDK info +``` + +The default exported model file is `{work-dir}/end2end.onnx` + +##### TensorRT + +Run the following command: + +```shell +# Go to the mmdeploy directory +cd ${PATH_TO_MMDEPLOY} + +# Convert RTMPose +# The input model path can be a local path or a download link. +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py \ + ../mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth \ + demo/resources/human-pose.jpg \ + --work-dir rtmpose-trt/rtmpose-m \ + --device cuda:0 \ + --show \ + --dump-info # Export SDK info +``` + +The default exported model file is `{work-dir}/end2end.engine` + +If the model is successfully exported, you will see the detection results on the sample image: + +![convert_models](https://user-images.githubusercontent.com/13503330/217726963-7815dd01-561a-4605-b0c6-07b6fe1956c3.png) + +###### Advanced Settings + +If you want to use TensorRT-FP16, you can enable it by modifying the following MMDeploy configuration: + +```Python +# in MMDeploy config +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True # Enable FP16 + )) +``` + +### Model Profiling + +If you want to test the inference speed of the model in the deployment framework, MMDeploy provides a convenient script called `tools/profiler.py`. + +You need to prepare a folder containing test images named `./test_images`, and the profiler will randomly extract images from this directory for model profiling. 
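+
+Any images you have at hand will do; for example, you could populate the folder with a small script like the following minimal sketch (the source directory is just a placeholder):
+
+```python
+# Copy a handful of local images into ../test_images for profiling.
+from pathlib import Path
+import shutil
+
+src = Path('path/to/your/images')  # placeholder: any directory containing .jpg files
+dst = Path('../test_images')
+dst.mkdir(parents=True, exist_ok=True)
+for img_path in sorted(src.glob('*.jpg'))[:20]:  # a handful of images is enough
+    shutil.copy(img_path, dst / img_path.name)
+```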
+
+```shell
+# Go to the mmdeploy directory
+cd ${PATH_TO_MMDEPLOY}
+
+python tools/profiler.py \
+    configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \
+    ../mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \
+    ../test_images \
+    --model {WORK_DIR}/end2end.onnx \
+    --shape 256x192 \
+    --device cpu \
+    --warmup 50 \
+    --num-iter 200
+```
+
+The profiling results will be displayed as follows:
+
+```shell
+01/30 15:06:35 - mmengine - INFO - [onnxruntime]-70 times per count: 8.73 ms, 114.50 FPS
+01/30 15:06:36 - mmengine - INFO - [onnxruntime]-90 times per count: 9.05 ms, 110.48 FPS
+01/30 15:06:37 - mmengine - INFO - [onnxruntime]-110 times per count: 9.87 ms, 101.32 FPS
+01/30 15:06:37 - mmengine - INFO - [onnxruntime]-130 times per count: 9.99 ms, 100.10 FPS
+01/30 15:06:38 - mmengine - INFO - [onnxruntime]-150 times per count: 10.39 ms, 96.29 FPS
+01/30 15:06:39 - mmengine - INFO - [onnxruntime]-170 times per count: 10.77 ms, 92.86 FPS
+01/30 15:06:40 - mmengine - INFO - [onnxruntime]-190 times per count: 10.98 ms, 91.05 FPS
+01/30 15:06:40 - mmengine - INFO - [onnxruntime]-210 times per count: 11.19 ms, 89.33 FPS
+01/30 15:06:41 - mmengine - INFO - [onnxruntime]-230 times per count: 11.16 ms, 89.58 FPS
+01/30 15:06:42 - mmengine - INFO - [onnxruntime]-250 times per count: 11.06 ms, 90.41 FPS
+----- Settings:
++------------+---------+
+| batch size | 1       |
+| shape      | 256x192 |
+| iterations | 200     |
+| warmup     | 50      |
++------------+---------+
+----- Results:
++--------+------------+---------+
+| Stats  | Latency/ms | FPS     |
++--------+------------+---------+
+| Mean   | 11.060     | 90.412  |
+| Median | 11.852     | 84.375  |
+| Min    | 7.812      | 128.007 |
+| Max    | 13.690     | 73.044  |
++--------+------------+---------+
+```
+
+```{note}
+If you want to learn more about the profiler's parameters and functionality, you can refer to the [Profiler documentation](https://mmdeploy.readthedocs.io/en/main/02-how-to-run/useful_tools.html#profiler).
+```
+
+### Model Accuracy Testing
+
+If you want to test the inference accuracy of the model in the deployment framework, MMDeploy provides a convenient script called `tools/test.py`.
+
+```shell
+# Go to the mmdeploy directory
+cd ${PATH_TO_MMDEPLOY}
+
+python tools/test.py \
+    configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \
+    ../mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \
+    --model {PATH_TO_MODEL}/rtmpose_m.pth \
+    --device cpu
+```
+
+```{note}
+For more detailed content, please refer to the [MMDeploy documentation](https://github.com/open-mmlab/mmdeploy/blob/main/docs/en/02-how-to-run/profile_model.md).
+```
+
+This covers the workflow of simplifying and deploying MMPose models with MMDeploy: converting models to deployment formats (ONNX, TensorRT, etc.), profiling inference speed, and validating accuracy in the deployment framework.
diff --git a/docs/zh_cn/advanced_guides/how_to_deploy.md b/docs/zh_cn/advanced_guides/how_to_deploy.md
deleted file mode 100644
index b4fead876c..0000000000
--- a/docs/zh_cn/advanced_guides/how_to_deploy.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# How to Deploy MMPose Models
-
-Coming soon.
diff --git a/docs/zh_cn/advanced_guides/implement_new_models.md b/docs/zh_cn/advanced_guides/implement_new_models.md index 4f83a4fb8f..34fe2ba128 100644 --- a/docs/zh_cn/advanced_guides/implement_new_models.md +++ b/docs/zh_cn/advanced_guides/implement_new_models.md @@ -159,3 +159,5 @@ class VisPredictHead(BaseHead): self.vis_head = nn.Sequential(*modules) ``` + +然后你只需要像一个普通的预测头一样继续实现其余部分即可。 diff --git a/docs/zh_cn/advanced_guides/model_analysis.md b/docs/zh_cn/advanced_guides/model_analysis.md index 234dc5be85..b88755620e 100644 --- a/docs/zh_cn/advanced_guides/model_analysis.md +++ b/docs/zh_cn/advanced_guides/model_analysis.md @@ -1,8 +1,8 @@ -# Model Analysis +# 模型统计与分析 ## 统计模型参数量与计算量 -MMPose 提供了 `tools/analysis_tools/get_flops.py` 来统计模型的参数量与计算量。 +MMPose 提供了 [tools/analysis_tools/get_flops.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/tools/analysis_tools/get_flops.py) 来统计模型的参数量与计算量。 ```shell python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] [--cfg-options ${CFG_OPTIONS}] @@ -42,7 +42,7 @@ Params: 28.54 M ## 分析训练日志 -MMPose 提供了 `tools/analysis_tools/analyze_logs.py` 来对训练日志进行简单的分析,包括: +MMPose 提供了 [tools/analysis_tools/analyze_logs.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/tools/analysis_tools/analyze_logs.py) 来对训练日志进行简单的分析,包括: - 将日志绘制成损失和精度曲线图 - 统计训练速度 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 2431d82e4d..0e2feeb08c 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -24,6 +24,7 @@ You can change the documentation language at the lower-left corner of the page. user_guides/configs.md user_guides/prepare_datasets.md user_guides/train_and_test.md + user_guides/how_to_deploy.md .. toctree:: :maxdepth: 1 @@ -36,7 +37,6 @@ You can change the documentation language at the lower-left corner of the page. advanced_guides/customize_transforms.md advanced_guides/customize_optimizer.md advanced_guides/customize_logging.md - advanced_guides/how_to_deploy.md advanced_guides/model_analysis.md .. 
toctree:: diff --git a/docs/zh_cn/user_guides/how_to_deploy.md b/docs/zh_cn/user_guides/how_to_deploy.md new file mode 100644 index 0000000000..a658f0a0a3 --- /dev/null +++ b/docs/zh_cn/user_guides/how_to_deploy.md @@ -0,0 +1,292 @@ +# 模型精简与部署 + +本章将介绍如何导出与部署 MMPose 训练得到的模型,包含以下内容: + +- [模型精简](#模型精简) +- [使用 MMDeploy 部署](#使用-mmdeploy-部署) + - [MMDeploy 介绍](#mmdeploy-介绍) + - [模型支持列表](#模型支持列表) + - [安装](#安装) + - [模型转换](#模型转换) + - [如何查找 MMPose 模型对应的部署配置文件](#如何查找-mmpose-模型对应的部署配置文件) + - [RTMPose 模型导出示例](#rtmpose-模型导出示例) + - [ONNX](#onnx) + - [TensorRT](#tensorrt) + - [高级设置](#高级设置) + - [模型测速](#模型测速) + - [精度验证](#精度验证) + +## 模型精简 + +在默认状态下,MMPose 训练过程中保存的 checkpoint 文件包含了模型的所有信息,包括模型结构、权重、优化器状态等。这些信息对于模型的部署来说是冗余的,因此我们需要对模型进行精简,精简后的 `.pth` 文件大小甚至能够缩小一半以上。 + +MMPose 提供了 [tools/misc/publish_model.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/tools/misc/publish_model.py) 来进行模型精简,使用方式如下: + +```shell +python tools/misc/publish_model.py ${IN_FILE} ${OUT_FILE} +``` + +例如: + +```shell +python tools/misc/publish_model.py ./epoch_10.pth ./epoch_10_publish.pth +``` + +脚本会自动对模型进行精简,并将精简后的模型保存到制定路径,并在文件名的最后加上时间戳,例如 `./epoch_10_publish-21815b2c_20230726.pth`。 + +## 使用 MMDeploy 部署 + +### MMDeploy 介绍 + +MMDeploy 是 OpenMMLab 模型部署工具箱,为各算法库提供统一的部署体验。基于 MMDeploy,开发者可以轻松从 MMPose 生成指定硬件所需 SDK,省去大量适配时间。 + +- 你可以从 [【硬件模型库】](https://platform.openmmlab.com/deploee) 直接下载 SDK 版模型(ONNX、TensorRT、ncnn 等)。 +- 同时我们也支持 [在线模型转换](https://platform.openmmlab.com/deploee/task-convert-list),从而无需本地安装 MMDeploy。 + +更多介绍和使用指南见 [MMDeploy 文档](https://mmdeploy.readthedocs.io/zh_CN/latest/get_started.html)。 + +### 模型支持列表 + +| Model | Task | ONNX Runtime | TensorRT | ncnn | PPLNN | OpenVINO | CoreML | TorchScript | +| :-------------------------------------------------------------------------------------------------------- | :------------ | :----------: | :------: | :--: | :---: | :------: | :----: | :---------: | +| [HRNet](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/backbones.html#hrnet-cvpr-2019) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [MSPN](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/backbones.html#mspn-arxiv-2019) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [LiteHRNet](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/backbones.html#litehrnet-cvpr-2021) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [Hourglass](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/algorithms.html#hourglass-eccv-2016) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [SimCC](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/algorithms.html#simcc-eccv-2022) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [RTMPose](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmpose) | PoseDetection | Y | Y | Y | N | Y | Y | Y | +| [YoloX-Pose](https://github.com/open-mmlab/mmpose/tree/main/projects/yolox_pose) | PoseDetection | Y | Y | N | N | Y | Y | Y | + +### 安装 + +在开始部署之前,首先你需要确保正确安装了 MMPose, MMDetection, MMDeploy,相关安装教程如下: + +- [安装 MMPose 与 MMDetection](../installation.md) +- [安装 MMDeploy](https://mmdeploy.readthedocs.io/zh_CN/latest/04-supported-codebases/mmpose.html) + +根据部署后端的不同,有的后端需要对 MMDeploy 支持的**自定义算子进行编译**,请根据需求前往对应的文档确保环境搭建正确: + +- [ONNX](https://mmdeploy.readthedocs.io/zh_CN/latest/05-supported-backends/onnxruntime.html) +- [TensorRT](https://mmdeploy.readthedocs.io/zh_CN/latest/05-supported-backends/tensorrt.html) +- [OpenVINO](https://mmdeploy.readthedocs.io/zh_CN/latest/05-supported-backends/openvino.html) +- 
[ncnn](https://mmdeploy.readthedocs.io/zh_CN/latest/05-supported-backends/ncnn.html) +- [TorchScript](https://mmdeploy.readthedocs.io/en/latest/05-supported-backends/torchscript.html) +- [更多](https://github.com/open-mmlab/mmdeploy/tree/main/docs/zh_cn/05-supported-backends) + +### 模型转换 + +在完成安装之后,你就可以开始模型部署了。通过 MMDeploy 提供的 [tools/deploy.py](https://github.com/open-mmlab/mmdeploy/blob/main/tools/deploy.py) 可以方便地将 MMPose 模型转换到不同的部署后端。 + +使用方法如下: + +```shell +python ./tools/deploy.py \ + ${DEPLOY_CFG_PATH} \ + ${MODEL_CFG_PATH} \ + ${MODEL_CHECKPOINT_PATH} \ + ${INPUT_IMG} \ + --test-img ${TEST_IMG} \ + --work-dir ${WORK_DIR} \ + --calib-dataset-cfg ${CALIB_DATA_CFG} \ + --device ${DEVICE} \ + --log-level INFO \ + --show \ + --dump-info +``` + +参数描述: + +- `deploy_cfg` : mmdeploy 针对此模型的部署配置,包含推理框架类型、是否量化、输入 shape 是否动态等。配置文件之间可能有引用关系,configs/mmpretrain/classification_ncnn_static.py 是一个示例。 + +- `model_cfg` : mm 算法库的模型配置,例如 mmpretrain/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py,与 mmdeploy 的路径无关。 + +- `checkpoint` : torch 模型路径。可以 http/https 开头,详见 mmcv.FileClient 的实现。 + +- `img` : 模型转换时,用做测试的图像或点云文件路径。 + +- `--test-img` : 用于测试模型的图像文件路径。默认设置成None。 + +- `--work-dir` : 工作目录,用来保存日志和模型文件。 + +- `--calib-dataset-cfg` : 此参数只有int8模式下生效,用于校准数据集配置文件。若在int8模式下未传入参数,则会自动使用模型配置文件中的’val’数据集进行校准。 + +- `--device` : 用于模型转换的设备。 默认是cpu,对于 trt 可使用 cuda:0 这种形式。 + +- `--log-level` : 设置日记的等级,选项包括'CRITICAL', 'FATAL', 'ERROR', 'WARN', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'。 默认是INFO。 + +- `--show` : 是否显示检测的结果。 + +- `--dump-info` : 是否输出 SDK 信息。 + +#### 如何查找 MMPose 模型对应的部署配置文件 + +1. 所有与 MMPose 相关的部署配置文件都存放在 [configs/mmpose/](https://github.com/open-mmlab/mmdeploy/tree/main/configs/mmpose) 目录下。 +2. 部署配置文件命名遵循 `{任务}_{算法}_{部署后端}_{动态/静态}_{输入尺寸}` 。 + +#### RTMPose 模型导出示例 + +我们本节演示将 RTMPose 模型导出为 ONNX 和 TensorRT 格式,如果你希望了解更多内容请前往 [MMDeploy 文档](https://mmdeploy.readthedocs.io/zh_CN/latest/02-how-to-run/convert_model.html)。 + +- ONNX 配置 + + - [pose-detection_simcc_onnxruntime_dynamic.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py) + +- TensorRT 配置 + + - [pose-detection_simcc_tensorrt_dynamic-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py) + +- 更多 + + | Backend | Config | + | :-------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------: | + | ncnn-fp16 | [pose-detection_simcc_ncnn-fp16_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_ncnn-fp16_static-256x192.py) | + | CoreML | [pose-detection_simcc_coreml_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_coreml_static-256x192.py) | + | OpenVINO | [pose-detection_simcc_openvino_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_openvino_static-256x192.py) | + | RKNN | [pose-detection_simcc_rknn-fp16_static-256x192.py](https://github.com/open-mmlab/mmdeploy/blob/main/configs/mmpose/pose-detection_simcc_rknn-fp16_static-256x192.py) | + +如果你需要对部署配置进行修改,请参考 [MMDeploy config tutorial](https://mmdeploy.readthedocs.io/zh_CN/latest/02-how-to-run/write_config.html). 
+ +本教程中使用的文件结构如下: + +```shell +|----mmdeploy +|----mmpose +``` + +##### ONNX + +运行如下命令: + +```shell +# 前往 mmdeploy 目录 +cd ${PATH_TO_MMDEPLOY} + +# 转换 RTMPose +# 输入模型路径可以是本地路径,也可以是下载链接。 +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + ../mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth \ + demo/resources/human-pose.jpg \ + --work-dir rtmpose-ort/rtmpose-m \ + --device cpu \ + --show \ + --dump-info # 导出 sdk info +``` + +默认导出模型文件为 `{work-dir}/end2end.onnx` + +##### TensorRT + +运行如下命令: + +```shell +# 前往 mmdeploy 目录 +cd ${PATH_TO_MMDEPLOY} + +# 转换 RTMPose +# 输入模型路径可以是本地路径,也可以是下载链接。 +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py \ + ../mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth \ + demo/resources/human-pose.jpg \ + --work-dir rtmpose-trt/rtmpose-m \ + --device cuda:0 \ + --show \ + --dump-info # 导出 sdk info +``` + +默认导出模型文件为 `{work-dir}/end2end.engine` + +如果模型顺利导出,你将会看到样例图片上的检测结果: + +![convert_models](https://user-images.githubusercontent.com/13503330/217726963-7815dd01-561a-4605-b0c6-07b6fe1956c3.png) + +###### 高级设置 + +如果需要使用 TensorRT-FP16,你可以通过修改 MMDeploy config 中以下配置开启: + +```Python +# in MMDeploy config +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True # 打开 fp16 + )) +``` + +### 模型测速 + +如果需要测试模型在部署框架下的推理速度,MMDeploy 提供了方便的 [tools/profiler.py](https://github.com/open-mmlab/mmdeploy/blob/main/tools/profiler.py) 脚本。 + +用户需要准备一个存放测试图片的文件夹`./test_images`,profiler 将随机从该目录下抽取图片用于模型测速。 + +```shell +# 前往 mmdeploy 目录 +cd ${PATH_TO_MMDEPLOY} + +python tools/profiler.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + ../mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../test_images \ + --model {WORK_DIR}/end2end.onnx \ + --shape 256x192 \ + --device cpu \ + --warmup 50 \ + --num-iter 200 +``` + +测试结果如下: + +```shell +01/30 15:06:35 - mmengine - INFO - [onnxruntime]-70 times per count: 8.73 ms, 114.50 FPS +01/30 15:06:36 - mmengine - INFO - [onnxruntime]-90 times per count: 9.05 ms, 110.48 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-110 times per count: 9.87 ms, 101.32 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-130 times per count: 9.99 ms, 100.10 FPS +01/30 15:06:38 - mmengine - INFO - [onnxruntime]-150 times per count: 10.39 ms, 96.29 FPS +01/30 15:06:39 - mmengine - INFO - [onnxruntime]-170 times per count: 10.77 ms, 92.86 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-190 times per count: 10.98 ms, 91.05 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-210 times per count: 11.19 ms, 89.33 FPS +01/30 15:06:41 - mmengine - INFO - [onnxruntime]-230 times per count: 11.16 ms, 89.58 FPS +01/30 15:06:42 - mmengine - INFO - [onnxruntime]-250 times per count: 11.06 ms, 90.41 FPS +----- Settings: ++------------+---------+ +| batch size | 1 | +| shape | 256x192 | +| iterations | 200 | +| warmup | 50 | ++------------+---------+ +----- Results: ++--------+------------+---------+ +| Stats | Latency/ms | FPS | ++--------+------------+---------+ +| Mean | 11.060 | 90.412 | +| Median | 11.852 | 84.375 | +| Min | 7.812 | 128.007 | +| Max | 13.690 | 73.044 | 
++--------+------------+---------+ +``` + +```{note} +如果你希望详细了解 profiler 的更多参数设置与功能,可以前往 [Profiler 文档](https://mmdeploy.readthedocs.io/en/main/02-how-to-run/useful_tools.html#profiler)。 +``` + +### 精度验证 + +如果需要测试模型在部署框架下的推理精度,MMDeploy 提供了方便的 `tools/test.py` 脚本。 + +```shell +# 前往 mmdeploy 目录 +cd ${PATH_TO_MMDEPLOY} + +python tools/test.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + ./mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + --model {PATH_TO_MODEL}/rtmpose_m.pth \ + --device cpu +``` + +```{note} +详细内容请参考 [MMDeploy 文档](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/02-how-to-run/profile_model.md) +``` From 9d08e405b5a4635b449c36c5dc6617c2ab2bd828 Mon Sep 17 00:00:00 2001 From: Tau Date: Mon, 31 Jul 2023 11:43:30 +0800 Subject: [PATCH 35/37] [Fix] Refine 3dpose (#2583) --- mmpose/codecs/image_pose_lifting.py | 36 ++++++++++++++----- mmpose/codecs/motionbert_label.py | 29 +++++++++++---- mmpose/codecs/video_pose_lifting.py | 17 ++++++--- .../datasets/base/base_mocap_dataset.py | 23 ++++++++---- .../datasets/datasets/body3d/h36m_dataset.py | 22 +++++++++--- 5 files changed, 95 insertions(+), 32 deletions(-) diff --git a/mmpose/codecs/image_pose_lifting.py b/mmpose/codecs/image_pose_lifting.py index aae6c3b5be..a5efee99c1 100644 --- a/mmpose/codecs/image_pose_lifting.py +++ b/mmpose/codecs/image_pose_lifting.py @@ -61,21 +61,27 @@ def __init__(self, self.reshape_keypoints = reshape_keypoints self.concat_vis = concat_vis if keypoints_mean is not None: + assert keypoints_std is not None, 'keypoints_std is None' keypoints_mean = np.array( keypoints_mean, dtype=np.float32).reshape(1, num_keypoints, -1) keypoints_std = np.array( keypoints_std, dtype=np.float32).reshape(1, num_keypoints, -1) - assert keypoints_std is not None - assert keypoints_mean.shape == keypoints_std.shape + + assert keypoints_mean.shape == keypoints_std.shape, ( + f'keypoints_mean.shape {keypoints_mean.shape} != ' + f'keypoints_std.shape {keypoints_std.shape}') if target_mean is not None: + assert target_std is not None, 'target_std is None' target_dim = num_keypoints - 1 if remove_root else num_keypoints target_mean = np.array( target_mean, dtype=np.float32).reshape(1, target_dim, -1) target_std = np.array( target_std, dtype=np.float32).reshape(1, target_dim, -1) - assert target_std is not None - assert target_mean.shape == target_std.shape + + assert target_mean.shape == target_std.shape, ( + f'target_mean.shape {target_mean.shape} != ' + f'target_std.shape {target_std.shape}') self.keypoints_mean = keypoints_mean self.keypoints_std = keypoints_std self.target_mean = target_mean @@ -158,7 +164,11 @@ def encode(self, lifting_target_label, self.root_index, axis=-2) lifting_target_visible = np.delete( lifting_target_visible, self.root_index, axis=-2) - assert lifting_target_weights.ndim in {2, 3} + assert lifting_target_weights.ndim in { + 2, 3 + }, (f'lifting_target_weights.ndim {lifting_target_weights.ndim} ' + 'is not in {2, 3}') + axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1 lifting_target_weights = np.delete( lifting_target_weights, self.root_index, axis=axis_to_remove) @@ -173,14 +183,18 @@ def encode(self, # Normalize the 2D keypoint coordinate with mean and std keypoint_labels = keypoints.copy() if self.keypoints_mean is not None: - assert self.keypoints_mean.shape[1:] == keypoints.shape[1:] + assert self.keypoints_mean.shape[1:] == keypoints.shape[1:], ( + f'self.keypoints_mean.shape[1:] 
{self.keypoints_mean.shape[1:]} ' # noqa + f'!= keypoints.shape[1:] {keypoints.shape[1:]}') encoded['keypoints_mean'] = self.keypoints_mean.copy() encoded['keypoints_std'] = self.keypoints_std.copy() keypoint_labels = (keypoint_labels - self.keypoints_mean) / self.keypoints_std if self.target_mean is not None: - assert self.target_mean.shape == lifting_target_label.shape + assert self.target_mean.shape == lifting_target_label.shape, ( + f'self.target_mean.shape {self.target_mean.shape} ' + f'!= lifting_target_label.shape {lifting_target_label.shape}') encoded['target_mean'] = self.target_mean.copy() encoded['target_std'] = self.target_std.copy() @@ -188,7 +202,9 @@ def encode(self, self.target_mean) / self.target_std # Generate reshaped keypoint coordinates - assert keypoint_labels.ndim in {2, 3} + assert keypoint_labels.ndim in { + 2, 3 + }, (f'keypoint_labels.ndim {keypoint_labels.ndim} is not in {2, 3}') if keypoint_labels.ndim == 2: keypoint_labels = keypoint_labels[None, ...] @@ -231,7 +247,9 @@ def decode(self, keypoints = encoded.copy() if self.target_mean is not None and self.target_std is not None: - assert self.target_mean.shape == keypoints.shape + assert self.target_mean.shape == keypoints.shape, ( + f'self.target_mean.shape {self.target_mean.shape} ' + f'!= keypoints.shape {keypoints.shape}') keypoints = keypoints * self.target_std + self.target_mean if target_root is not None and target_root.size > 0: diff --git a/mmpose/codecs/motionbert_label.py b/mmpose/codecs/motionbert_label.py index 08ff4ccd1a..ddbda362ef 100644 --- a/mmpose/codecs/motionbert_label.py +++ b/mmpose/codecs/motionbert_label.py @@ -58,7 +58,9 @@ def __init__(self, self.save_index = save_index self.concat_vis = concat_vis self.rootrel = rootrel - assert mode.lower() in {'train', 'test'} + assert mode.lower() in {'train', 'test' + }, (f'Unsupported mode {mode}, ' + 'mode should be one of ("train", "test").') self.mode = mode.lower() def encode(self, @@ -119,13 +121,17 @@ def encode(self, lifting_target_label = lifting_target.copy() keypoint_labels = keypoints.copy() - assert keypoint_labels.ndim in {2, 3} + assert keypoint_labels.ndim in { + 2, 3 + }, (f'Keypoint labels should have 2 or 3 dimensions, ' + f'but got {keypoint_labels.ndim}.') if keypoint_labels.ndim == 2: keypoint_labels = keypoint_labels[None, ...] 
# Normalize the 2D keypoint coordinate with image width and height _camera_param = deepcopy(camera_param) - assert 'w' in _camera_param and 'h' in _camera_param + assert 'w' in _camera_param and 'h' in _camera_param, ( + 'Camera parameters should contain "w" and "h".') w, h = _camera_param['w'], _camera_param['h'] keypoint_labels[ ..., :2] = keypoint_labels[..., :2] / w * 2 - [1, h / w] @@ -201,9 +207,14 @@ def decode( keypoints[..., 0, :] = 0 if w is not None and w.size > 0: - assert w.shape == h.shape - assert w.shape[0] == keypoints.shape[0] - assert w.ndim in {1, 2} + assert w.shape == h.shape, (f'w and h should have the same shape, ' + f'but got {w.shape} and {h.shape}.') + assert w.shape[0] == keypoints.shape[0], ( + f'w and h should have the same batch size, ' + f'but got {w.shape[0]} and {keypoints.shape[0]}.') + assert w.ndim in {1, + 2}, (f'w and h should have 1 or 2 dimensions, ' + f'but got {w.ndim}.') if w.ndim == 1: w = w[:, None] h = h[:, None] @@ -211,9 +222,13 @@ def decode( np.ones((w.shape[0], 1)), h / w, axis=1)[:, None, :] keypoints[..., :2] = (keypoints[..., :2] + trans) * w[:, None] / 2 keypoints[..., 2:] = keypoints[..., 2:] * w[:, None] / 2 + if factor is not None and factor.size > 0: - assert factor.shape[0] == keypoints.shape[0] + assert factor.shape[0] == keypoints.shape[0], ( + f'factor should have the same batch size, ' + f'but got {factor.shape[0]} and {keypoints.shape[0]}.') keypoints *= factor[..., None] + keypoints[..., :, :] = keypoints[..., :, :] - keypoints[ ..., self.root_index:self.root_index + 1, :] keypoints /= 1000. diff --git a/mmpose/codecs/video_pose_lifting.py b/mmpose/codecs/video_pose_lifting.py index 9e409a663c..a692e85806 100644 --- a/mmpose/codecs/video_pose_lifting.py +++ b/mmpose/codecs/video_pose_lifting.py @@ -147,7 +147,11 @@ def encode(self, lifting_target_label, self.root_index, axis=-2) lifting_target_visible = np.delete( lifting_target_visible, self.root_index, axis=-2) - assert lifting_target_weights.ndim in {2, 3} + assert lifting_target_weights.ndim in { + 2, 3 + }, (f'Got invalid lifting target weights shape ' + f'{lifting_target_weights.shape}') + axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1 lifting_target_weights = np.delete( lifting_target_weights, @@ -163,19 +167,24 @@ def encode(self, # Normalize the 2D keypoint coordinate with image width and height _camera_param = deepcopy(camera_param) - assert 'w' in _camera_param and 'h' in _camera_param + assert 'w' in _camera_param and 'h' in _camera_param, ( + 'Camera parameter `w` and `h` should be provided.') + center = np.array([0.5 * _camera_param['w'], 0.5 * _camera_param['h']], dtype=np.float32) scale = np.array(0.5 * _camera_param['w'], dtype=np.float32) keypoint_labels = (keypoints - center) / scale - assert keypoint_labels.ndim in {2, 3} + assert keypoint_labels.ndim in { + 2, 3 + }, (f'Got invalid keypoint labels shape {keypoint_labels.shape}') if keypoint_labels.ndim == 2: keypoint_labels = keypoint_labels[None, ...] 
if self.normalize_camera: - assert 'f' in _camera_param and 'c' in _camera_param + assert 'f' in _camera_param and 'c' in _camera_param, ( + 'Camera parameter `f` and `c` should be provided.') _camera_param['f'] = _camera_param['f'] / scale _camera_param['c'] = (_camera_param['c'] - center[:, None]) / scale encoded['camera_param'] = _camera_param diff --git a/mmpose/datasets/datasets/base/base_mocap_dataset.py b/mmpose/datasets/datasets/base/base_mocap_dataset.py index e08ba6ea45..3796653547 100644 --- a/mmpose/datasets/datasets/base/base_mocap_dataset.py +++ b/mmpose/datasets/datasets/base/base_mocap_dataset.py @@ -1,15 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. import itertools +import logging import os.path as osp from copy import deepcopy from itertools import filterfalse, groupby from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +import cv2 import numpy as np from mmengine.dataset import BaseDataset, force_full_init from mmengine.fileio import exists, get_local_path, load +from mmengine.logging import print_log from mmengine.utils import is_abs -from PIL import Image from mmpose.registry import DATASETS from ..utils import parse_pose_metainfo @@ -215,10 +217,13 @@ def get_img_info(self, img_idx, img_name): try: with get_local_path(osp.join(self.data_prefix['img'], img_name)) as local_path: - im = Image.open(local_path) - w, h = im.size - im.close() + im = cv2.imread(local_path) + h, w, _ = im.shape except: # noqa: E722 + print_log( + f'Failed to read image {img_name}.', + logger='current', + level=logging.DEBUG) return None img = { @@ -293,9 +298,13 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: image_list = [] for idx, frame_ids in enumerate(self.sequence_indices): - assert len(frame_ids) == (self.multiple_target - if self.multiple_target else - self.seq_len), f'{len(frame_ids)}' + expected_num_frames = self.seq_len + if self.multiple_target: + expected_num_frames = self.multiple_target + + assert len(frame_ids) == (expected_num_frames), ( + f'Expected `frame_ids` == {expected_num_frames}, but ' + f'got {len(frame_ids)} ') _img_names = img_names[frame_ids] diff --git a/mmpose/datasets/datasets/body3d/h36m_dataset.py b/mmpose/datasets/datasets/body3d/h36m_dataset.py index b7a4f71d65..397738c276 100644 --- a/mmpose/datasets/datasets/body3d/h36m_dataset.py +++ b/mmpose/datasets/datasets/body3d/h36m_dataset.py @@ -151,7 +151,8 @@ def __init__(self, if factor_file: if not is_abs(factor_file): factor_file = osp.join(data_root, factor_file) - assert exists(factor_file), 'Annotation file does not exist.' + assert exists(factor_file), (f'`factor_file`: {factor_file}' + 'does not exist.') self.factor_file = factor_file if multiple_target > 0 and multiple_target_step == 0: @@ -249,11 +250,19 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: kpts_3d = h36m_data['S'] if self.keypoint_2d_src == 'detection': - assert exists(self.keypoint_2d_det_file) + assert exists(self.keypoint_2d_det_file), ( + f'`keypoint_2d_det_file`: `{self.keypoint_2d_det_file}`' + 'does not exist.') kpts_2d = self._load_keypoint_2d_detection( self.keypoint_2d_det_file) - assert kpts_2d.shape[0] == kpts_3d.shape[0] - assert kpts_2d.shape[2] == 3 + assert kpts_2d.shape[0] == kpts_3d.shape[0], ( + f'Number of `kpts_2d` ({kpts_2d.shape[0]}) does not match ' + f'number of `kpts_3d` ({kpts_3d.shape[0]}).') + + assert kpts_2d.shape[2] == 3, ( + f'Expect `kpts_2d.shape[2]` == 3, but got ' + f'{kpts_2d.shape[2]}. 
Please check the format of ' + f'{self.keypoint_2d_det_file}') for idx, frame_ids in enumerate(self.sequence_indices): kpt_2d = kpts_2d[frame_ids].astype(np.float32) @@ -270,7 +279,10 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: factors = np.load(local_path).astype(np.float32) else: factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) - assert factors.shape[0] == kpts_3d.shape[0] + assert factors.shape[0] == kpts_3d.shape[0], ( + f'Number of `factors` ({factors.shape[0]}) does not match ' + f'number of `kpts_3d` ({kpts_3d.shape[0]}).') + for idx, frame_ids in enumerate(self.sequence_indices): factor = factors[frame_ids].astype(np.float32) instance_list[idx].update({'factor': factor}) From febc88c1f1536a5614616ff4704baded32af1980 Mon Sep 17 00:00:00 2001 From: Tau Date: Mon, 31 Jul 2023 17:49:45 +0800 Subject: [PATCH 36/37] [Fix] Fix config typo in rtmpose-x (#2585) --- .../body_2d_keypoint/rtmpose-x_8xb256-700e_coco-384x288.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-x_8xb256-700e_coco-384x288.py b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-x_8xb256-700e_coco-384x288.py index 1441e07791..25da9aeeb1 100644 --- a/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-x_8xb256-700e_coco-384x288.py +++ b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-x_8xb256-700e_coco-384x288.py @@ -66,7 +66,7 @@ arch='P5', expand_ratio=0.5, deepen_factor=1.33, - widen_factor=1.28, + widen_factor=1.25, out_indices=(4, ), channel_attention=True, norm_cfg=dict(type='SyncBN'), From cc31a7c689f91dcc70055a74ad7bd7c20183d3fc Mon Sep 17 00:00:00 2001 From: LareinaM Date: Thu, 3 Aug 2023 16:10:50 +0800 Subject: [PATCH 37/37] fix problems --- demo/body3d_pose_lifter_demo.py | 15 +++++++++++++-- mmpose/apis/inference_3d.py | 30 +++++++++++++++++------------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/demo/body3d_pose_lifter_demo.py b/demo/body3d_pose_lifter_demo.py index d04fca9f3b..3c36d3a88b 100644 --- a/demo/body3d_pose_lifter_demo.py +++ b/demo/body3d_pose_lifter_demo.py @@ -90,7 +90,7 @@ def parse_args(): '--save-predictions', action='store_true', default=False, - help='whether to save predicted results') + help='Whether to save predicted results') parser.add_argument( '--device', default='cuda:0', help='Device used for inference') parser.add_argument( @@ -124,7 +124,14 @@ def parse_args(): '--use-multi-frames', action='store_true', default=False, - help='whether to use multi frames for inference in the 2D pose' + help='Whether to use multi frames for inference in the 2D pose' + 'detection stage. Default: False.') + parser.add_argument( + '--online', + action='store_true', + default=False, + help='Inference mode. If set to True, can not use future frame' + 'information when using multi frames for inference in the 2D pose' 'detection stage. 
Default: False.') args = parser.parse_args() @@ -405,6 +412,10 @@ def main(): 'Only "PoseLifter" model is supported for the 2nd stage ' \ '(2D-to-3D lifting)' + if args.use_multi_frames: + assert 'frame_indices_test' in pose_estimator.cfg.data.test.data_cfg + indices = pose_estimator.cfg.data.test.data_cfg['frame_indices_test'] + pose_lifter.cfg.visualizer.radius = args.radius pose_lifter.cfg.visualizer.line_width = args.thickness pose_lifter.cfg.visualizer.det_kpt_color = det_kpt_color diff --git a/mmpose/apis/inference_3d.py b/mmpose/apis/inference_3d.py index d4b9623b86..303cfd0713 100644 --- a/mmpose/apis/inference_3d.py +++ b/mmpose/apis/inference_3d.py @@ -181,16 +181,11 @@ def collate_pose_sequence(pose_results_2d, pose_sequences = [] for idx in range(N): pose_seq = PoseDataSample() - gt_instances = InstanceData() pred_instances = InstanceData() - for k in pose_results_2d[target_frame][idx].gt_instances.keys(): - gt_instances.set_field( - pose_results_2d[target_frame][idx].gt_instances[k], k) - for k in pose_results_2d[target_frame][idx].pred_instances.keys(): - if k != 'keypoints': - pred_instances.set_field( - pose_results_2d[target_frame][idx].pred_instances[k], k) + gt_instances = pose_results_2d[target_frame][idx].gt_instances.clone() + pred_instances = pose_results_2d[target_frame][ + idx].pred_instances.clone() pose_seq.pred_instances = pred_instances pose_seq.gt_instances = gt_instances @@ -228,7 +223,7 @@ def collate_pose_sequence(pose_results_2d, # replicate the right most frame keypoints[:, frame_idx + 1:] = keypoints[:, frame_idx] break - pose_seq.pred_instances.keypoints = keypoints + pose_seq.pred_instances.set_field(keypoints, 'keypoints') pose_sequences.append(pose_seq) return pose_sequences @@ -276,8 +271,15 @@ def inference_pose_lifter_model(model, bbox_center = None bbox_scale = None + pose_results_2d_copy = [] for i, pose_res in enumerate(pose_results_2d): + pose_res_copy = [] for j, data_sample in enumerate(pose_res): + data_sample_copy = PoseDataSample() + data_sample_copy.gt_instances = data_sample.gt_instances.clone() + data_sample_copy.pred_instances = data_sample.pred_instances.clone( + ) + data_sample_copy.track_id = data_sample.track_id kpts = data_sample.pred_instances.keypoints bboxes = data_sample.pred_instances.bboxes keypoints = [] @@ -292,11 +294,13 @@ def inference_pose_lifter_model(model, bbox_scale + bbox_center) else: keypoints.append(kpt[:, :2]) - pose_results_2d[i][j].pred_instances.keypoints = np.array( - keypoints) + data_sample_copy.pred_instances.set_field( + np.array(keypoints), 'keypoints') + pose_res_copy.append(data_sample_copy) + pose_results_2d_copy.append(pose_res_copy) - pose_sequences_2d = collate_pose_sequence(pose_results_2d, with_track_id, - target_idx) + pose_sequences_2d = collate_pose_sequence(pose_results_2d_copy, + with_track_id, target_idx) if not pose_sequences_2d: return []