From f534c76b53b465acc39fb7a902a66c635f708392 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Aug 2023 15:06:39 +0800 Subject: [PATCH 01/21] --feat=add process script --- .../dataset_converters/ubody_smplx_to_coco.py | 419 ++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 tools/dataset_converters/ubody_smplx_to_coco.py diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py new file mode 100644 index 0000000000..8bb796377f --- /dev/null +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -0,0 +1,419 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import os +import os.path as osp +from functools import partial +from typing import Dict, List + +import mmengine +import numpy as np +import smplx +import torch +from pycocotools.coco import COCO + + +class SMPLX(object): + + def __init__(self, human_model_path): + self.human_model_path = human_model_path + self.layer_args = { + 'create_global_orient': False, + 'create_body_pose': False, + 'create_left_hand_pose': False, + 'create_right_hand_pose': False, + 'create_jaw_pose': False, + 'create_leye_pose': False, + 'create_reye_pose': False, + 'create_betas': False, + 'create_expression': False, + 'create_transl': False, + } + + self.neutral_model = smplx.create( + self.human_model_path, + 'smplx', + gender='NEUTRAL', + use_pca=False, + use_face_contour=True, + **self.layer_args) + + self.vertex_num = 10475 + self.face = self.neutral_model.faces + self.shape_param_dim = 10 + self.expr_code_dim = 10 + # 22 (body joints) + 30 (hand joints) + 1 (face jaw joint) + self.orig_joint_num = 53 + + # yapf: disable + self.orig_joints_name = ( + # 22 body joints + 'Pelvis', 'L_Hip', 'R_Hip', 'Spine_1', 'L_Knee', 'R_Knee', + 'Spine2', 'L_Ankle', 'R_Ankle', 'Spine_3', 'L_Foot', 'R_Foot', + 'Neck', 'L_Collar', 'R_Collar', 'Head', 'L_Shoulder', + 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', + # left hand joints + 'L_Index_1', 'L_Index_2', 'L_Index_3', 'L_Middle_1', 'L_Middle_2', + 'L_Middle_3', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Ring_1', + 'L_Ring_2', 'L_Ring_3', 'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3', + # right hand joints + 'R_Index_1', 'R_Index_2', 'R_Index_3', 'R_Middle_1', 'R_Middle_2', + 'R_Middle_3', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Ring_1', + 'R_Ring_2', 'R_Ring_3', 'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3', + # 1 face jaw joint + 'Jaw', + ) + self.orig_flip_pairs = ( + # body joints + (1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), + (20, 21), + # hand joints + (22, 37), (23, 38), (24, 39), (25, 40), (26, 41), (27, 42), + (28, 43), (29, 44), (30, 45), (31, 46), (32, 47), (33, 48), + (34, 49), (35, 50), (36, 51), + ) + # yapf: enable + self.orig_root_joint_idx = self.orig_joints_name.index('Pelvis') + self.orig_joint_part = { + 'body': + range( + self.orig_joints_name.index('Pelvis'), + self.orig_joints_name.index('R_Wrist') + 1), + 'lhand': + range( + self.orig_joints_name.index('L_Index_1'), + self.orig_joints_name.index('L_Thumb_3') + 1), + 'rhand': + range( + self.orig_joints_name.index('R_Index_1'), + self.orig_joints_name.index('R_Thumb_3') + 1), + 'face': + range( + self.orig_joints_name.index('Jaw'), + self.orig_joints_name.index('Jaw') + 1) + } + + # changed SMPLX joint set for the supervision + self.joint_num = ( + 137 # 25 (body joints) + 40 (hand joints) + 72 (face keypoints) + ) + # yapf: disable + self.joints_name = ( + # 25 body joints + 'Pelvis', 'L_Hip', 'R_Hip', 'L_Knee', 
'R_Knee', 'L_Ankle', + 'R_Ankle', 'Neck', 'L_Shoulder', 'R_Shoulder', 'L_Elbow', + 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Big_toe', 'L_Small_toe', + 'L_Heel', 'R_Big_toe', 'R_Small_toe', 'R_Heel', 'L_Ear', 'R_Ear', + 'L_Eye', 'R_Eye', 'Nose', + # left hand joints + 'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3', 'L_Thumb4', 'L_Index_1', + 'L_Index_2', 'L_Index_3', 'L_Index_4', 'L_Middle_1', 'L_Middle_2', + 'L_Middle_3', 'L_Middle_4', 'L_Ring_1', 'L_Ring_2', 'L_Ring_3', + 'L_Ring_4', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Pinky_4', + # right hand joints + 'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3', 'R_Thumb_4', 'R_Index_1', + 'R_Index_2', 'R_Index_3', 'R_Index_4', 'R_Middle_1', 'R_Middle_2', + 'R_Middle_3', 'R_Middle_4', 'R_Ring_1', 'R_Ring_2', 'R_Ring_3', + 'R_Ring_4', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Pinky_4', + # 72 face keypoints + *[ + f'Face_{i}' for i in range(1, 73) + ], + ) + + self.root_joint_idx = self.joints_name.index('Pelvis') + self.lwrist_idx = self.joints_name.index('L_Wrist') + self.rwrist_idx = self.joints_name.index('R_Wrist') + self.neck_idx = self.joints_name.index('Neck') + self.flip_pairs = ( + # body joints + (1, 2), (3, 4), (5, 6), (8, 9), (10, 11), (12, 13), (14, 17), + (15, 18), (16, 19), (20, 21), (22, 23), + # hand joints + (25, 45), (26, 46), (27, 47), (28, 48), (29, 49), (30, 50), + (31, 51), (32, 52), (33, 53), (34, 54), (35, 55), (36, 56), + (37, 57), (38, 58), (39, 59), (40, 60), (41, 61), (42, 62), + (43, 63), (44, 64), + # face eyebrow + (67, 68), (69, 78), (70, 77), (71, 76), (72, 75), (73, 74), + # face below nose + (83, 87), (84, 86), + # face eyes + (88, 97), (89, 96), (90, 95), (91, 94), (92, 99), (93, 98), + # face mouse + (100, 106), (101, 105), (102, 104), (107, 111), (108, 110), + # face lip + (112, 116), (113, 115), (117, 119), + # face contours + (120, 136), (121, 135), (122, 134), (123, 133), (124, 132), + (125, 131), (126, 130), (127, 129) + ) + self.joint_idx = ( + 0, 1, 2, 4, 5, 7, 8, 12, 16, 17, 18, 19, 20, 21, 60, 61, 62, 63, + 64, 65, 59, 58, 57, 56, 55, # body joints + 37, 38, 39, 66, 25, 26, 27, 67, 28, 29, 30, 68, 34, 35, 36, 69, 31, + 32, 33, 70, # left hand joints + 52, 53, 54, 71, 40, 41, 42, 72, 43, 44, 45, 73, 49, 50, 51, 74, 46, + 47, 48, 75, # right hand joints + 22, 15, # jaw, head + 57, 56, # eyeballs + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, # eyebrow + 86, 87, 88, 89, # nose + 90, 91, 92, 93, 94, # below nose + 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, # eyes + 107, # right mouth + 108, 109, 110, 111, 112, # upper mouth + 113, # left mouth + 114, 115, 116, 117, 118, # lower mouth + 119, # right lip + 120, 121, 122, # upper lip + 123, # left lip + 124, 125, 126, # lower lip + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, + 140, 141, 142, 143, # face contour + ) + # yapf: enable + + self.joint_part = { + 'body': + range( + self.joints_name.index('Pelvis'), + self.joints_name.index('Nose') + 1), + 'lhand': + range( + self.joints_name.index('L_Thumb_1'), + self.joints_name.index('L_Pinky_4') + 1), + 'rhand': + range( + self.joints_name.index('R_Thumb_1'), + self.joints_name.index('R_Pinky_4') + 1), + 'hand': + range( + self.joints_name.index('L_Thumb_1'), + self.joints_name.index('R_Pinky_4') + 1), + 'face': + range( + self.joints_name.index('Face_1'), + self.joints_name.index('Face_72') + 1) + } + + +def read_annotation_file(annotation_file: str) -> List[Dict]: + with open(annotation_file, 'r') as f: + annotations = json.load(f) + return annotations + + +def cam2pixel(cam_coord, f, c): + x = 
cam_coord[:, 0] / cam_coord[:, 2] * f[0] + c[0] + y = cam_coord[:, 1] / cam_coord[:, 2] * f[1] + c[1] + z = cam_coord[:, 2] + return np.stack((x, y, z), 1) + + +def process_scene_anno(scene: str, annotation_root: str, splits: np.array, + human_model: SMPLX): + annos = read_annotation_file( + osp.join(annotation_root, scene, 'smplx_annotation.json')) + keypoint_annos = COCO( + osp.join(annotation_root, scene, 'keypoint_annotation.json')) + + train_annos = [] + val_annos = [] + train_imgs = [] + val_imgs = [] + + progress_bar = mmengine.ProgressBar(len(keypoint_annos.anns.keys())) + for aid in keypoint_annos.anns.keys(): + ann = keypoint_annos.anns[aid] + img = keypoint_annos.loadImgs(ann['image_id'])[0] + if img['file_name'].startswith('/'): + file_name = img['file_name'][1:] + else: + file_name = img['file_name'] + + video_name = file_name.split('/')[-2] + if 'Trim' in video_name: + video_name = video_name.split('_Trim')[0] + + img_path = os.path.join( + annotation_root.replace('annotations', 'images'), scene, file_name) + if not os.path.exists(img_path): + progress_bar.update() + continue + + smplx_param = annos[str(aid)] + human_model_param = smplx_param['smplx_param'] + cam_param = smplx_param['cam_param'] + if 'lhand_valid' not in human_model_param: + human_model_param['lhand_valid'] = ann['lefthand_valid'] + human_model_param['rhand_valid'] = ann['righthand_valid'] + human_model_param['face_valid'] = ann['face_valid'] + + rotation_valid = np.ones((human_model.orig_joint_num), + dtype=np.float32) + coord_valid = np.ones((human_model.joint_num), dtype=np.float32) + + root_pose, body_pose, shape, trans = (human_model_param['root_pose'], + human_model_param['body_pose'], + human_model_param['shape'], + human_model_param['trans']) + + if 'lhand_pose' in human_model_param and human_model_param.get( + 'lhand_valid', False): + lhand_pose = human_model_param['lhand_pose'] + else: + lhand_pose = np.zeros( + (3 * len(human_model.orig_joint_part['lhand'])), + dtype=np.float32) + rotation_valid[human_model.orig_joint_part['lhand']] = 0 + coord_valid[human_model.orig_joint_part['lhand']] = 0 + + if 'rhand_pose' in human_model_param and human_model_param.get( + 'rhand_valid', False): + rhand_pose = human_model_param['rhand_pose'] + else: + rhand_pose = np.zeros( + (3 * len(human_model.orig_joint_part['rhand'])), + dtype=np.float32) + rotation_valid[human_model.orig_joint_part['rhand']] = 0 + coord_valid[human_model.orig_joint_part['rhand']] = 0 + + if 'jaw_pose' in human_model_param and \ + 'expr' in human_model_param and \ + human_model_param.get('face_valid', False): + jaw_pose = human_model_param['jaw_pose'] + expr = human_model_param['expr'] + else: + jaw_pose = np.zeros((3), dtype=np.float32) + expr = np.zeros((human_model.expr_code_dim), dtype=np.float32) + rotation_valid[human_model.orig_joint_part['face']] = 0 + coord_valid[human_model.orig_joint_part['face']] = 0 + + # init human model inputs + root_pose = torch.FloatTensor(root_pose).view(1, 3) + body_pose = torch.FloatTensor(body_pose).view(-1, 3) + lhand_pose = torch.FloatTensor(lhand_pose).view(-1, 3) + rhand_pose = torch.FloatTensor(rhand_pose).view(-1, 3) + jaw_pose = torch.FloatTensor(jaw_pose).view(-1, 3) + shape = torch.FloatTensor(shape).view(1, -1) + expr = torch.FloatTensor(expr).view(1, -1) + trans = torch.FloatTensor(trans).view(1, -1) + zero_pose = torch.zeros((1, 3), dtype=torch.float32) + with torch.no_grad(): + output = human_model.neutral_model( + betas=shape, + body_pose=body_pose.view(1, -1), + global_orient=root_pose, 
+ transl=trans, + left_hand_pose=lhand_pose.view(1, -1), + right_hand_pose=rhand_pose.view(1, -1), + jaw_pose=jaw_pose.view(1, -1), + leye_pose=zero_pose, + reye_pose=zero_pose, + expression=expr) + + joint_cam = output.joints[0].numpy()[human_model.joint_idx, :] + joint_img = cam2pixel(joint_cam, cam_param['focal'], + cam_param['princpt']) + + joint_cam = (joint_cam - joint_cam[human_model.root_joint_idx, None, :] + ) # root-relative + joint_cam[human_model.joint_part['lhand'], :] = ( + joint_cam[human_model.joint_part['lhand'], :] - + joint_cam[human_model.lwrist_idx, None, :] + ) # left hand root-relative + joint_cam[human_model.joint_part['rhand'], :] = ( + joint_cam[human_model.joint_part['rhand'], :] - + joint_cam[human_model.rwrist_idx, None, :] + ) # right hand root-relative + joint_cam[human_model.joint_part['face'], :] = ( + joint_cam[human_model.joint_part['face'], :] - + joint_cam[human_model.neck_idx, None, :]) # face root-relative + + body_3d_size = 2 + output_hm_shape = (16, 16, 12) + joint_img[human_model.joint_part['body'], + 2] = ((joint_cam[human_model.joint_part['body'], 2].copy() / + (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + joint_img[human_model.joint_part['lhand'], + 2] = ((joint_cam[human_model.joint_part['lhand'], 2].copy() / + (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + joint_img[human_model.joint_part['rhand'], + 2] = ((joint_cam[human_model.joint_part['rhand'], 2].copy() / + (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + joint_img[human_model.joint_part['face'], + 2] = ((joint_cam[human_model.joint_part['face'], 2].copy() / + (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + + keypoints_2d = joint_img[:, :2].copy() + ann_3d = { + **ann, + 'keypoints_3d': joint_cam.tolist(), + } + ann_3d['keypoints'] = keypoints_2d.tolist() + + img['file_name'] = os.path.join(scene, file_name) + if video_name in splits: + val_annos.append(ann_3d) + val_imgs.append(img) + else: + train_annos.append(ann_3d) + train_imgs.append(img) + progress_bar.update() + + categoreis = [{ + 'supercategory': 'person', + 'id': 1, + 'name': 'person', + 'keypoints': human_model.joints_name, + 'skeleton': human_model.flip_pairs + }] + train_data = { + 'images': train_imgs, + 'annotations': train_annos, + 'categories': categoreis + } + val_data = { + 'images': val_imgs, + 'annotations': val_annos, + 'categories': categoreis + } + + mmengine.dump( + train_data, + osp.join(annotation_root, scene, 'train_3dkeypoint_annotation.json')) + mmengine.dump( + val_data, + osp.join(annotation_root, scene, 'val_3dkeypoint_annotation.json')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data-root', type=str, default='data/UBody') + parser.add_argument('--human-model-path', type=str, default='data/SMPLX') + parser.add_argument( + '--nproc', default=8, type=int, help='number of process') + args = parser.parse_args() + + split_path = f'{args.data_root}/splits/intra_scene_test_list.npy' + annotation_path = f'{args.data_root}/annotations' + + folders = os.listdir(annotation_path) + human_model = SMPLX(args.human_model_path) + splits = np.load(split_path) + + if args.nproc > 1: + mmengine.track_parallel_progress( + partial( + process_scene_anno, + annotation_root=annotation_path, + splits=splits, + human_model=human_model), folders, args.nproc) + else: + mmengine.track_progress( + partial( + process_scene_anno, + annotation_root=annotation_path, + splits=splits, + human_model=human_model), folders) From 
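
For reference, the conversion script above combines a standard pinhole projection (`cam2pixel`) with a rescaling of root-relative depth into heatmap bins. Below is a minimal sketch of those two steps; the focal length, principal point and joint coordinates are made-up toy values, not taken from UBody.

```python
import numpy as np

# Toy camera intrinsics and one camera-space joint (metres); illustrative only.
focal = (1500.0, 1500.0)       # (fx, fy)
princpt = (960.0, 540.0)       # (cx, cy)
joint_cam = np.array([[0.2, -0.1, 3.0]])  # (x, y, z) in camera space

# Pinhole projection, as in cam2pixel(): u = x / z * fx + cx, v = y / z * fy + cy
u = joint_cam[:, 0] / joint_cam[:, 2] * focal[0] + princpt[0]
v = joint_cam[:, 1] / joint_cam[:, 2] * focal[1] + princpt[1]
print(u, v)  # [1060.] [490.]

# Depth rescaling used for the third coordinate of joint_img:
# a root-relative depth in roughly [-1, 1] m is mapped to [0, output_hm_shape[0]].
body_3d_size = 2
output_hm_shape = (16, 16, 12)
z_rel = 0.25  # root-relative depth in metres (toy value)
z_bin = (z_rel / (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]
print(z_bin)  # 10.0
```
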
8bb1b8f6b74805caa3465e8af39add258dbdd48d Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Aug 2023 15:42:41 +0800 Subject: [PATCH 02/21] --other=add dataset to registry --- configs/_base_/datasets/ubody3d.py | 12 + ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 168 ++++++++++ mmpose/datasets/datasets/body3d/__init__.py | 3 +- .../datasets/body3d/ubody3d_dataset.py | 302 ++++++++++++++++++ 4 files changed, 484 insertions(+), 1 deletion(-) create mode 100644 configs/_base_/datasets/ubody3d.py create mode 100644 configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py create mode 100644 mmpose/datasets/datasets/body3d/ubody3d_dataset.py diff --git a/configs/_base_/datasets/ubody3d.py b/configs/_base_/datasets/ubody3d.py new file mode 100644 index 0000000000..8a40e9ffcf --- /dev/null +++ b/configs/_base_/datasets/ubody3d.py @@ -0,0 +1,12 @@ +dataset_info = dict( + dataset_name='ubody3d', + paper_info=dict( + author='Jing Lin, Ailing Zeng, Haoqian Wang, Lei Zhang, Yu Li', + title='One-Stage 3D Whole-Body Mesh Recovery with Component Aware' + 'Transformer', + container='IEEE Computer Society Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2023', + homepage='https://github.com/IDEA-Research/OSX', + ), + keypoint_info={}) diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py new file mode 100644 index 0000000000..b3c1c2db80 --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py @@ -0,0 +1,168 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=200, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) + +# learning policy +param_scheduler = [ + dict(type='StepLR', step_size=100000, gamma=0.96, end=80, by_epoch=False) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1)) + +# codec settings +# 3D keypoint normalization parameters +# From file: '{data_root}/annotation_body3d/fps50/joint3d_rel_stats.pkl' +target_mean = [[-2.55652589e-04, -7.11960570e-03, -9.81433052e-04], + [-5.65463051e-03, 3.19636009e-01, 7.19329269e-02], + [-1.01705840e-02, 6.91147892e-01, 1.55352986e-01], + [2.55651315e-04, 7.11954606e-03, 9.81423866e-04], + [-5.09729780e-03, 3.27040413e-01, 7.22258095e-02], + [-9.99656606e-03, 7.08277383e-01, 1.58016408e-01], + [2.90583676e-03, -2.11363307e-01, -4.74210915e-02], + [5.67537804e-03, -4.35088906e-01, -9.76974016e-02], + [5.93884964e-03, -4.91891970e-01, -1.10666618e-01], + [7.37352083e-03, -5.83948619e-01, -1.31171400e-01], + [5.41920653e-03, -3.83931702e-01, -8.68145417e-02], + [2.95964662e-03, -1.87567488e-01, -4.34536934e-02], + [1.26585822e-03, -1.20170579e-01, -2.82526049e-02], + [4.67186639e-03, -3.83644089e-01, -8.55125784e-02], + [1.67648571e-03, -1.97007177e-01, -4.31368364e-02], + [8.70569015e-04, -1.68664569e-01, -3.73902498e-02]], +target_std = [[0.11072244, 0.02238818, 0.07246294], + [0.15856311, 0.18933832, 0.20880479], + [0.19179935, 0.24320062, 0.24756193], + [0.11072181, 0.02238805, 0.07246253], + [0.15880454, 0.19977188, 0.2147063], + [0.18001944, 0.25052739, 
0.24853247], + [0.05210694, 0.05211406, 0.06908241], + [0.09515367, 0.10133032, 0.12899733], + [0.11742458, 0.12648469, 0.16465091], + [0.12360297, 0.13085539, 0.16433336], + [0.14602232, 0.09707956, 0.13952731], + [0.24347532, 0.12982249, 0.20230181], + [0.2446877, 0.21501816, 0.23938235], + [0.13876084, 0.1008926, 0.1424411], + [0.23687529, 0.14491219, 0.20980829], + [0.24400695, 0.23975028, 0.25520584]] +# 2D keypoint normalization parameters +# From file: '{data_root}/annotation_body3d/fps50/joint2d_stats.pkl' +keypoints_mean = [[532.08351635, 419.74137558], [531.80953144, 418.2607141], + [530.68456967, 493.54259285], [529.36968722, 575.96448516], + [532.29767646, 421.28483336], [531.93946631, 494.72186795], + [529.71984447, 578.96110365], [532.93699382, 370.65225054], + [534.1101856, 317.90342311], [534.55416813, 304.24143901], + [534.86955004, 282.31030885], [534.11308566, 330.11296796], + [533.53637525, 376.2742511], [533.49380107, 391.72324565], + [533.52579142, 330.09494668], [532.50804964, 374.190479], + [532.72786934, 380.61615716]], +keypoints_std = [[107.73640054, 63.35908715], [119.00836213, 64.1215443], + [119.12412107, 50.53806215], [120.61688045, 56.38444891], + [101.95735275, 62.89636486], [106.24832897, 48.41178119], + [108.46734966, 54.58177071], [109.07369806, 68.70443672], + [111.20130351, 74.87287863], [111.63203838, 77.80542514], + [113.22330788, 79.90670556], [105.7145833, 73.27049436], + [107.05804267, 73.93175781], [107.97449418, 83.30391802], + [121.60675105, 74.25691526], [134.34378973, 77.48125087], + [131.79990652, 89.86721124]] +codec = dict( + type='ImagePoseLifting', + num_keypoints=17, + root_index=0, + remove_root=True, + target_mean=target_mean, + target_std=target_std, + keypoints_mean=keypoints_mean, + keypoints_std=keypoints_std) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(1, 1, 1), + dropout=0.5, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=16, + loss=dict(type='MSELoss'), + decoder=codec, + )) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root', 'target_root_index', 'target_mean', + 'target_std')) +] +val_pipeline = train_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + causal=True, + keypoint_2d_src='gt', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + causal=True, + keypoint_2d_src='gt', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/mmpose/datasets/datasets/body3d/__init__.py 
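
The long mean/std tables above are the Human3.6M statistics this baseline config starts from. Conceptually, the `ImagePoseLifting` codec uses them to standardise the 2D inputs and the root-relative 3D targets; the snippet below is only a rough NumPy sketch of that normalisation under those assumptions, not the actual codec implementation.

```python
import numpy as np

def normalize_sample(kpts_2d, kpts_3d, kpt_mean, kpt_std,
                     tgt_mean, tgt_std, root_index=0):
    """Rough sketch of the codec-style normalisation (illustrative only).

    kpts_2d: (K, 2) pixel coordinates; kpts_3d: (K, 3) camera-space joints.
    """
    # standardise the 2D input keypoints
    inputs = (kpts_2d - kpt_mean) / kpt_std
    # make the 3D target root-relative, drop the root joint, then standardise
    root = kpts_3d[root_index:root_index + 1]
    target = np.delete(kpts_3d - root, root_index, axis=0)
    target = (target - tgt_mean) / tgt_std
    return inputs, target
```
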
b/mmpose/datasets/datasets/body3d/__init__.py index d5afeca578..2b52caeadd 100644 --- a/mmpose/datasets/datasets/body3d/__init__.py +++ b/mmpose/datasets/datasets/body3d/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .h36m_dataset import Human36mDataset +from .ubody3d_dataset import UBody3dDataset -__all__ = ['Human36mDataset'] +__all__ = ['Human36mDataset', 'UBody3dDataset'] diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py new file mode 100644 index 0000000000..a7aca146ca --- /dev/null +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections import defaultdict +from typing import Callable, List, Optional, Sequence, Tuple, Union + +import numpy as np +from mmengine.fileio import exists, get_local_path +from mmengine.utils import is_abs + +from mmpose.datasets.datasets import BaseMocapDataset +from mmpose.registry import DATASETS + + +@DATASETS.register_module() +class UBody3dDataset(BaseMocapDataset): + """Ubody3d dataset for 3D human pose estimation. + + "One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer", + CVPR'2023. More details can be found in the `paper + `__ . + + Ubody3D keypoints:: + + 0-24: 25 body keypoints, + 25-64: 40 hand keypoints, + 65-136: 72 face keypoints, + + In total, we have 137 keypoints for wholebody 3D pose estimation. + + Args: + ann_file (str): Annotation file path. Default: ''. + seq_len (int): Number of frames in a sequence. Default: 1. + seq_step (int): The interval for extracting frames from the video. + Default: 1. + multiple_target (int): If larger than 0, merge every + ``multiple_target`` sequence together. Default: 0. + multiple_target_step (int): The interval for merging sequence. Only + valid when ``multiple_target`` is larger than 0. Default: 0. + pad_video_seq (bool): Whether to pad the video so that poses will be + predicted for every frame in the video. Default: ``False``. + causal (bool): If set to ``True``, the rightmost input frame will be + the target frame. Otherwise, the middle input frame will be the + target frame. Default: ``True``. + subset_frac (float): The fraction to reduce dataset size. If set to 1, + the dataset size is not reduced. Default: 1. + keypoint_2d_src (str): Specifies 2D keypoint information options, which + should be one of the following options: + + - ``'gt'``: load from the annotation file + - ``'detection'``: load from a detection + result file of 2D keypoint + - 'pipeline': the information will be generated by the pipeline + + Default: ``'gt'``. + keypoint_2d_det_file (str, optional): The 2D keypoint detection file. + If set, 2d keypoint loaded from this file will be used instead of + ground-truth keypoints. This setting is only when + ``keypoint_2d_src`` is ``'detection'``. Default: ``None``. + factor_file (str, optional): The projection factors' file. If set, + factor loaded from this file will be used instead of calculated + factors. Default: ``None``. + camera_param_file (str): Cameras' parameters file. Default: ``None``. + data_mode (str): Specifies the mode of data samples: ``'topdown'`` or + ``'bottomup'``. In ``'topdown'`` mode, each data sample contains + one instance; while in ``'bottomup'`` mode, each data sample + contains all instances in a image. Default: ``'topdown'`` + metainfo (dict, optional): Meta information for dataset, such as class + information. Default: ``None``. 
+ data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Default: ``None``. + data_prefix (dict, optional): Prefix for training data. + Default: ``dict(img='')``. + filter_cfg (dict, optional): Config for filter data. Default: `None`. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Default: ``None`` which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. + Default: ``True``. + pipeline (list, optional): Processing pipeline. Default: []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Default: ``False``. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=False``. Default: ``False``. + max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Default: 1000. + """ + + METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py') + SUPPORTED_keypoint_2d_src = {'gt', 'detection', 'pipeline'} + + def __init__(self, + ann_file: str = '', + seq_len: int = 1, + seq_step: int = 1, + multiple_target: int = 0, + multiple_target_step: int = 0, + pad_video_seq: bool = False, + causal: bool = True, + subset_frac: float = 1.0, + keypoint_2d_src: str = 'gt', + keypoint_2d_det_file: Optional[str] = None, + factor_file: Optional[str] = None, + camera_param_file: Optional[str] = None, + data_mode: str = 'topdown', + metainfo: Optional[dict] = None, + data_root: Optional[str] = None, + data_prefix: dict = dict(img=''), + filter_cfg: Optional[dict] = None, + indices: Optional[Union[int, Sequence[int]]] = None, + serialize_data: bool = True, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + lazy_init: bool = False, + max_refetch: int = 1000): + # check keypoint_2d_src + self.keypoint_2d_src = keypoint_2d_src + if self.keypoint_2d_src not in self.SUPPORTED_keypoint_2d_src: + raise ValueError( + f'Unsupported `keypoint_2d_src` "{self.keypoint_2d_src}". 
' + f'Supported options are {self.SUPPORTED_keypoint_2d_src}') + + if keypoint_2d_det_file: + if not is_abs(keypoint_2d_det_file): + self.keypoint_2d_det_file = osp.join(data_root, + keypoint_2d_det_file) + else: + self.keypoint_2d_det_file = keypoint_2d_det_file + + self.seq_step = seq_step + self.pad_video_seq = pad_video_seq + + if factor_file: + if not is_abs(factor_file): + factor_file = osp.join(data_root, factor_file) + assert exists(factor_file), (f'`factor_file`: {factor_file}' + 'does not exist.') + self.factor_file = factor_file + + if multiple_target > 0 and multiple_target_step == 0: + multiple_target_step = multiple_target + self.multiple_target_step = multiple_target_step + + super().__init__( + ann_file=ann_file, + seq_len=seq_len, + multiple_target=multiple_target, + causal=causal, + subset_frac=subset_frac, + camera_param_file=camera_param_file, + data_mode=data_mode, + metainfo=metainfo, + data_root=data_root, + data_prefix=data_prefix, + filter_cfg=filter_cfg, + indices=indices, + serialize_data=serialize_data, + pipeline=pipeline, + test_mode=test_mode, + lazy_init=lazy_init, + max_refetch=max_refetch) + + def get_sequence_indices(self) -> List[List[int]]: + """Split original videos into sequences and build frame indices. + + This method overrides the default one in the base class. + """ + imgnames = self.ann_data['imgname'] + video_frames = defaultdict(list) + for idx, imgname in enumerate(imgnames): + subj, action, camera = self._parse_h36m_imgname(imgname) + video_frames[(subj, action, camera)].append(idx) + + # build sample indices + sequence_indices = [] + _len = (self.seq_len - 1) * self.seq_step + 1 + _step = self.seq_step + + if self.multiple_target: + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + seqs_from_video = [ + _indices[i:(i + self.multiple_target):_step] + for i in range(0, n_frame, self.multiple_target_step) + ][:(n_frame + self.multiple_target_step - + self.multiple_target) // self.multiple_target_step] + sequence_indices.extend(seqs_from_video) + + else: + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + + if self.pad_video_seq: + # Pad the sequence so that every frame in the sequence will + # be predicted. 
+ if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max( + 0, frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sequence_indices.append([_indices[0]] * pad_left + + _indices[start:end:_step] + + [_indices[-1]] * pad_right) + else: + seqs_from_video = [ + _indices[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1) + ] + sequence_indices.extend(seqs_from_video) + + # reduce dataset size if needed + subset_size = int(len(sequence_indices) * self.subset_frac) + start = np.random.randint(0, len(sequence_indices) - subset_size + 1) + end = start + subset_size + + sequence_indices = sequence_indices[start:end] + + return sequence_indices + + def _load_annotations(self) -> Tuple[List[dict], List[dict]]: + instance_list, image_list = super()._load_annotations() + + h36m_data = self.ann_data + kpts_3d = h36m_data['S'] + + if self.keypoint_2d_src == 'detection': + assert exists(self.keypoint_2d_det_file), ( + f'`keypoint_2d_det_file`: `{self.keypoint_2d_det_file}`' + 'does not exist.') + kpts_2d = self._load_keypoint_2d_detection( + self.keypoint_2d_det_file) + assert kpts_2d.shape[0] == kpts_3d.shape[0], ( + f'Number of `kpts_2d` ({kpts_2d.shape[0]}) does not match ' + f'number of `kpts_3d` ({kpts_3d.shape[0]}).') + + assert kpts_2d.shape[2] == 3, ( + f'Expect `kpts_2d.shape[2]` == 3, but got ' + f'{kpts_2d.shape[2]}. Please check the format of ' + f'{self.keypoint_2d_det_file}') + + for idx, frame_ids in enumerate(self.sequence_indices): + kpt_2d = kpts_2d[frame_ids].astype(np.float32) + keypoints = kpt_2d[..., :2] + keypoints_visible = kpt_2d[..., 2] + instance_list[idx].update({ + 'keypoints': + keypoints, + 'keypoints_visible': + keypoints_visible + }) + if self.factor_file: + with get_local_path(self.factor_file) as local_path: + factors = np.load(local_path).astype(np.float32) + else: + factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) + assert factors.shape[0] == kpts_3d.shape[0], ( + f'Number of `factors` ({factors.shape[0]}) does not match ' + f'number of `kpts_3d` ({kpts_3d.shape[0]}).') + + for idx, frame_ids in enumerate(self.sequence_indices): + factor = factors[frame_ids].astype(np.float32) + instance_list[idx].update({'factor': factor}) + + return instance_list, image_list + + @staticmethod + def _parse_h36m_imgname(imgname) -> Tuple[str, str, str]: + """Parse imgname to get information of subject, action and camera. 
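
The padding branch above repeats the boundary frames so that a prediction window exists for every frame of a video. The following standalone sketch reproduces that index construction for the causal case with `seq_step=1` (a simplifying assumption) and shows its output on a toy 5-frame sequence.

```python
def pad_causal_indices(frame_ids, seq_len):
    """Sketch of the causal padding above with seq_step=1 (illustrative only)."""
    n = len(frame_ids)
    left = seq_len - 1
    seqs = []
    for i in range(n):
        pad = max(0, left - i)          # repeat the first frame when near the start
        start = max(0, i - left)
        seqs.append([frame_ids[0]] * pad + frame_ids[start:i + 1])
    return seqs

print(pad_causal_indices([10, 11, 12, 13, 14], seq_len=3))
# [[10, 10, 10], [10, 10, 11], [10, 11, 12], [11, 12, 13], [12, 13, 14]]
```
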
+ + A typical h36m image filename is like: + S1_Directions_1.54138969_000001.jpg + """ + subj, rest = osp.basename(imgname).split('_', 1) + action, rest = rest.split('.', 1) + camera, rest = rest.split('_', 1) + return subj, action, camera + + def get_camera_param(self, imgname) -> dict: + """Get camera parameters of a frame by its image name.""" + assert hasattr(self, 'camera_param') + subj, _, camera = self._parse_h36m_imgname(imgname) + return self.camera_param[(subj, camera)] + + def _load_keypoint_2d_detection(self, det_file): + """"Load 2D joint detection results from file.""" + with get_local_path(det_file) as local_path: + kpts_2d = np.load(local_path).astype(np.float32) + + return kpts_2d From 225fded8270f19b6cc25246d8e55cc2762e4cca5 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Aug 2023 16:22:40 +0800 Subject: [PATCH 03/21] --ohter=update config --- configs/wholebody_3d_keypoint/README.md | 0 ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 62 +++++++++++++------ 2 files changed, 43 insertions(+), 19 deletions(-) create mode 100644 configs/wholebody_3d_keypoint/README.md rename configs/{body_3d_keypoint/pose_lift => wholebody_3d_keypoint}/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py (83%) diff --git a/configs/wholebody_3d_keypoint/README.md b/configs/wholebody_3d_keypoint/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py similarity index 83% rename from configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py rename to configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py index b3c1c2db80..132038ba93 100644 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py +++ b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py @@ -112,8 +112,39 @@ )) # base dataset settings -dataset_type = 'Human36mDataset' -data_root = 'data/h36m/' +dataset_type = 'UBody3dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [] +val_datasets = [] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_3dkeypoint_annotation.json', + seq_len=1, + causal=True, + keypoint_2d_src='gt', + data_prefix=dict(img='images/'), + pipeline=[]) + val_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', + data_prefix=dict(img='images/'), + pipeline=[]) + train_datasets.append(train_dataset) + val_datasets.append(val_dataset) # pipelines train_pipeline = [ @@ -133,30 +164,23 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, - ann_file='annotation_body3d/fps50/h36m_train.npz', - seq_len=1, - causal=True, - keypoint_2d_src='gt', - data_root=data_root, - data_prefix=dict(img='images/'), + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), + datasets=train_datasets, pipeline=train_pipeline, + test_mode=False, )) val_dataloader = dict( batch_size=64, 
num_workers=2, persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, - ann_file='annotation_body3d/fps50/h36m_test.npz', - seq_len=1, - causal=True, - keypoint_2d_src='gt', - data_root=data_root, - data_prefix=dict(img='images/'), - pipeline=train_pipeline, + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), + datasets=val_datasets, + pipeline=val_pipeline, + test_mode=True, )) test_dataloader = val_dataloader From 828799da3ba3e26d6c24993b8c3bd0a1dec186e2 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Aug 2023 17:00:08 +0800 Subject: [PATCH 04/21] --other=add metainfo --- configs/_base_/datasets/ubody3d.py | 930 ++++++++++++++++++++++++++++- 1 file changed, 929 insertions(+), 1 deletion(-) diff --git a/configs/_base_/datasets/ubody3d.py b/configs/_base_/datasets/ubody3d.py index 8a40e9ffcf..a971ef4614 100644 --- a/configs/_base_/datasets/ubody3d.py +++ b/configs/_base_/datasets/ubody3d.py @@ -9,4 +9,932 @@ year='2023', homepage='https://github.com/IDEA-Research/OSX', ), - keypoint_info={}) + keypoint_info={ + 0: + dict(name='Pelvis', id=0, color=[0, 255, 0], type='', swap=''), + 1: + dict( + name='L_Hip', id=1, color=[0, 255, 0], type='lower', swap='R_Hip'), + 2: + dict( + name='R_Hip', id=2, color=[0, 255, 0], type='lower', swap='L_Hip'), + 3: + dict( + name='L_Knee', + id=3, + color=[0, 255, 0], + type='lower', + swap='R_Knee'), + 4: + dict( + name='R_Knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 5: + dict( + name='L_Ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='R_Ankle'), + 6: + dict( + name='R_Ankle', + id=6, + color=[0, 255, 0], + type='lower', + swap='L_Ankle'), + 7: + dict(name='Neck', id=7, color=[0, 255, 0], type='upper', swap=''), + 8: + dict( + name='L_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='R_Shoulder'), + 9: + dict( + name='R_Shoulder', + id=9, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 10: + dict( + name='L_Elbow', + id=10, + color=[0, 255, 0], + type='upper', + swap='R_Elbow'), + 11: + dict( + name='R_Elbow', + id=11, + color=[0, 255, 0], + type='upper', + swap='L_Elbow'), + 12: + dict( + name='L_Wrist', + id=12, + color=[0, 255, 0], + type='upper', + swap='R_Wrist'), + 13: + dict( + name='R_Wrist', + id=13, + color=[0, 255, 0], + type='upper', + swap='L_Wrist'), + 14: + dict( + name='L_Big_toe', + id=14, + color=[0, 255, 0], + type='lower', + swap='R_Big_toe'), + 15: + dict( + name='L_Small_toe', + id=15, + color=[0, 255, 0], + type='lower', + swap='R_Small_toe'), + 16: + dict( + name='L_Heel', + id=16, + color=[0, 255, 0], + type='lower', + swap='R_Heel'), + 17: + dict( + name='R_Big_toe', + id=17, + color=[0, 255, 0], + type='lower', + swap='L_Big_toe'), + 18: + dict( + name='R_Small_toe', + id=18, + color=[0, 255, 0], + type='lower', + swap='L_Small_toe'), + 19: + dict( + name='R_Heel', + id=19, + color=[0, 255, 0], + type='lower', + swap='L_Heel'), + 20: + dict( + name='L_Ear', id=20, color=[0, 255, 0], type='upper', + swap='R_Ear'), + 21: + dict( + name='R_Ear', id=21, color=[0, 255, 0], type='upper', + swap='L_Ear'), + 22: + dict(name='L_Eye', id=22, color=[0, 255, 0], type='', swap='R_Eye'), + 23: + dict(name='R_Eye', id=23, color=[0, 255, 0], type='', swap='L_Eye'), + 24: + dict(name='Nose', id=24, color=[0, 255, 0], type='upper', swap=''), + 25: + dict( + name='L_Thumb_1', + id=25, + 
color=[255, 128, 0], + type='', + swap='R_Thumb_1'), + 26: + dict( + name='L_Thumb_2', + id=26, + color=[255, 128, 0], + type='', + swap='R_Thumb_2'), + 27: + dict( + name='L_Thumb_3', + id=27, + color=[255, 128, 0], + type='', + swap='R_Thumb_3'), + 28: + dict( + name='L_Thumb4', + id=28, + color=[255, 128, 0], + type='', + swap='R_Thumb_4'), + 29: + dict( + name='L_Index_1', + id=29, + color=[255, 128, 0], + type='', + swap='R_Index_1'), + 30: + dict( + name='L_Index_2', + id=30, + color=[255, 128, 0], + type='', + swap='R_Index_2'), + 31: + dict( + name='L_Index_3', + id=31, + color=[255, 128, 0], + type='', + swap='R_Index_3'), + 32: + dict( + name='L_Index_4', + id=32, + color=[255, 128, 0], + type='', + swap='R_Index_4'), + 33: + dict( + name='L_Middle_1', + id=33, + color=[255, 128, 0], + type='', + swap='R_Middle_1'), + 34: + dict( + name='L_Middle_2', + id=34, + color=[255, 128, 0], + type='', + swap='R_Middle_2'), + 35: + dict( + name='L_Middle_3', + id=35, + color=[255, 128, 0], + type='', + swap='R_Middle_3'), + 36: + dict( + name='L_Middle_4', + id=36, + color=[255, 128, 0], + type='', + swap='R_Middle_4'), + 37: + dict( + name='L_Ring_1', + id=37, + color=[255, 128, 0], + type='', + swap='R_Ring_1'), + 38: + dict( + name='L_Ring_2', + id=38, + color=[255, 128, 0], + type='', + swap='R_Ring_2'), + 39: + dict( + name='L_Ring_3', + id=39, + color=[255, 128, 0], + type='', + swap='R_Ring_3'), + 40: + dict( + name='L_Ring_4', + id=40, + color=[255, 128, 0], + type='', + swap='R_Ring_4'), + 41: + dict( + name='L_Pinky_1', + id=41, + color=[255, 128, 0], + type='', + swap='R_Pinky_1'), + 42: + dict( + name='L_Pinky_2', + id=42, + color=[255, 128, 0], + type='', + swap='R_Pinky_2'), + 43: + dict( + name='L_Pinky_3', + id=43, + color=[255, 128, 0], + type='', + swap='R_Pinky_3'), + 44: + dict( + name='L_Pinky_4', + id=44, + color=[255, 128, 0], + type='', + swap='R_Pinky_4'), + 45: + dict( + name='R_Thumb_1', + id=45, + color=[255, 128, 0], + type='', + swap='L_Thumb_1'), + 46: + dict( + name='R_Thumb_2', + id=46, + color=[255, 128, 0], + type='', + swap='L_Thumb_2'), + 47: + dict( + name='R_Thumb_3', + id=47, + color=[255, 128, 0], + type='', + swap='L_Thumb_3'), + 48: + dict( + name='R_Thumb_4', + id=48, + color=[255, 128, 0], + type='', + swap='L_Thumb4'), + 49: + dict( + name='R_Index_1', + id=49, + color=[255, 128, 0], + type='', + swap='L_Index_1'), + 50: + dict( + name='R_Index_2', + id=50, + color=[255, 128, 0], + type='', + swap='L_Index_2'), + 51: + dict( + name='R_Index_3', + id=51, + color=[255, 128, 0], + type='', + swap='L_Index_3'), + 52: + dict( + name='R_Index_4', + id=52, + color=[255, 128, 0], + type='', + swap='L_Index_4'), + 53: + dict( + name='R_Middle_1', + id=53, + color=[255, 128, 0], + type='', + swap='L_Middle_1'), + 54: + dict( + name='R_Middle_2', + id=54, + color=[255, 128, 0], + type='', + swap='L_Middle_2'), + 55: + dict( + name='R_Middle_3', + id=55, + color=[255, 128, 0], + type='', + swap='L_Middle_3'), + 56: + dict( + name='R_Middle_4', + id=56, + color=[255, 128, 0], + type='', + swap='L_Middle_4'), + 57: + dict( + name='R_Ring_1', + id=57, + color=[255, 128, 0], + type='', + swap='L_Ring_1'), + 58: + dict( + name='R_Ring_2', + id=58, + color=[255, 128, 0], + type='', + swap='L_Ring_2'), + 59: + dict( + name='R_Ring_3', + id=59, + color=[255, 128, 0], + type='', + swap='L_Ring_3'), + 60: + dict( + name='R_Ring_4', + id=60, + color=[255, 128, 0], + type='', + swap='L_Ring_4'), + 61: + dict( + name='R_Pinky_1', + id=61, + color=[255, 128, 0], + 
type='', + swap='L_Pinky_1'), + 62: + dict( + name='R_Pinky_2', + id=62, + color=[255, 128, 0], + type='', + swap='L_Pinky_2'), + 63: + dict( + name='R_Pinky_3', + id=63, + color=[255, 128, 0], + type='', + swap='L_Pinky_3'), + 64: + dict( + name='R_Pinky_4', + id=64, + color=[255, 128, 0], + type='', + swap='L_Pinky_4'), + 65: + dict(name='Face_1', id=65, color=[255, 255, 255], type='', swap=''), + 66: + dict(name='Face_2', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='Face_3', + id=67, + color=[255, 255, 255], + type='', + swap='Face_4'), + 68: + dict( + name='Face_4', + id=68, + color=[255, 255, 255], + type='', + swap='Face_3'), + 69: + dict( + name='Face_5', + id=69, + color=[255, 255, 255], + type='', + swap='Face_14'), + 70: + dict( + name='Face_6', + id=70, + color=[255, 255, 255], + type='', + swap='Face_13'), + 71: + dict( + name='Face_7', + id=71, + color=[255, 255, 255], + type='', + swap='Face_12'), + 72: + dict( + name='Face_8', + id=72, + color=[255, 255, 255], + type='', + swap='Face_11'), + 73: + dict( + name='Face_9', + id=73, + color=[255, 255, 255], + type='', + swap='Face_10'), + 74: + dict( + name='Face_10', + id=74, + color=[255, 255, 255], + type='', + swap='Face_9'), + 75: + dict( + name='Face_11', + id=75, + color=[255, 255, 255], + type='', + swap='Face_8'), + 76: + dict( + name='Face_12', + id=76, + color=[255, 255, 255], + type='', + swap='Face_7'), + 77: + dict( + name='Face_13', + id=77, + color=[255, 255, 255], + type='', + swap='Face_6'), + 78: + dict( + name='Face_14', + id=78, + color=[255, 255, 255], + type='', + swap='Face_5'), + 79: + dict(name='Face_15', id=79, color=[255, 255, 255], type='', swap=''), + 80: + dict(name='Face_16', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict(name='Face_17', id=81, color=[255, 255, 255], type='', swap=''), + 82: + dict(name='Face_18', id=82, color=[255, 255, 255], type='', swap=''), + 83: + dict( + name='Face_19', + id=83, + color=[255, 255, 255], + type='', + swap='Face_23'), + 84: + dict( + name='Face_20', + id=84, + color=[255, 255, 255], + type='', + swap='Face_22'), + 85: + dict(name='Face_21', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='Face_22', + id=86, + color=[255, 255, 255], + type='', + swap='Face_20'), + 87: + dict( + name='Face_23', + id=87, + color=[255, 255, 255], + type='', + swap='Face_19'), + 88: + dict( + name='Face_24', + id=88, + color=[255, 255, 255], + type='', + swap='Face_33'), + 89: + dict( + name='Face_25', + id=89, + color=[255, 255, 255], + type='', + swap='Face_32'), + 90: + dict( + name='Face_26', + id=90, + color=[255, 255, 255], + type='', + swap='Face_31'), + 91: + dict( + name='Face_27', + id=91, + color=[255, 255, 255], + type='', + swap='Face_30'), + 92: + dict( + name='Face_28', + id=92, + color=[255, 255, 255], + type='', + swap='Face_35'), + 93: + dict( + name='Face_29', + id=93, + color=[255, 255, 255], + type='', + swap='Face_34'), + 94: + dict( + name='Face_30', + id=94, + color=[255, 255, 255], + type='', + swap='Face_27'), + 95: + dict( + name='Face_31', + id=95, + color=[255, 255, 255], + type='', + swap='Face_26'), + 96: + dict( + name='Face_32', + id=96, + color=[255, 255, 255], + type='', + swap='Face_25'), + 97: + dict( + name='Face_33', + id=97, + color=[255, 255, 255], + type='', + swap='Face_24'), + 98: + dict( + name='Face_34', + id=98, + color=[255, 255, 255], + type='', + swap='Face_29'), + 99: + dict( + name='Face_35', + id=99, + color=[255, 255, 255], + type='', + swap='Face_28'), + 100: + 
dict( + name='Face_36', + id=100, + color=[255, 255, 255], + type='', + swap='Face_42'), + 101: + dict( + name='Face_37', + id=101, + color=[255, 255, 255], + type='', + swap='Face_41'), + 102: + dict( + name='Face_38', + id=102, + color=[255, 255, 255], + type='', + swap='Face_40'), + 103: + dict(name='Face_39', id=103, color=[255, 255, 255], type='', swap=''), + 104: + dict( + name='Face_40', + id=104, + color=[255, 255, 255], + type='', + swap='Face_38'), + 105: + dict( + name='Face_41', + id=105, + color=[255, 255, 255], + type='', + swap='Face_37'), + 106: + dict( + name='Face_42', + id=106, + color=[255, 255, 255], + type='', + swap='Face_36'), + 107: + dict( + name='Face_43', + id=107, + color=[255, 255, 255], + type='', + swap='Face_47'), + 108: + dict( + name='Face_44', + id=108, + color=[255, 255, 255], + type='', + swap='Face_46'), + 109: + dict(name='Face_45', id=109, color=[255, 255, 255], type='', swap=''), + 110: + dict( + name='Face_46', + id=110, + color=[255, 255, 255], + type='', + swap='Face_44'), + 111: + dict( + name='Face_47', + id=111, + color=[255, 255, 255], + type='', + swap='Face_43'), + 112: + dict( + name='Face_48', + id=112, + color=[255, 255, 255], + type='', + swap='Face_52'), + 113: + dict( + name='Face_49', + id=113, + color=[255, 255, 255], + type='', + swap='Face_51'), + 114: + dict(name='Face_50', id=114, color=[255, 255, 255], type='', swap=''), + 115: + dict( + name='Face_51', + id=115, + color=[255, 255, 255], + type='', + swap='Face_49'), + 116: + dict( + name='Face_52', + id=116, + color=[255, 255, 255], + type='', + swap='Face_48'), + 117: + dict( + name='Face_53', + id=117, + color=[255, 255, 255], + type='', + swap='Face_55'), + 118: + dict(name='Face_54', id=118, color=[255, 255, 255], type='', swap=''), + 119: + dict( + name='Face_55', + id=119, + color=[255, 255, 255], + type='', + swap='Face_53'), + 120: + dict( + name='Face_56', + id=120, + color=[255, 255, 255], + type='', + swap='Face_72'), + 121: + dict( + name='Face_57', + id=121, + color=[255, 255, 255], + type='', + swap='Face_71'), + 122: + dict( + name='Face_58', + id=122, + color=[255, 255, 255], + type='', + swap='Face_70'), + 123: + dict( + name='Face_59', + id=123, + color=[255, 255, 255], + type='', + swap='Face_69'), + 124: + dict( + name='Face_60', + id=124, + color=[255, 255, 255], + type='', + swap='Face_68'), + 125: + dict( + name='Face_61', + id=125, + color=[255, 255, 255], + type='', + swap='Face_67'), + 126: + dict( + name='Face_62', + id=126, + color=[255, 255, 255], + type='', + swap='Face_66'), + 127: + dict( + name='Face_63', + id=127, + color=[255, 255, 255], + type='', + swap='Face_65'), + 128: + dict(name='Face_64', id=128, color=[255, 255, 255], type='', swap=''), + 129: + dict( + name='Face_65', + id=129, + color=[255, 255, 255], + type='', + swap='Face_63'), + 130: + dict( + name='Face_66', + id=130, + color=[255, 255, 255], + type='', + swap='Face_62'), + 131: + dict( + name='Face_67', + id=131, + color=[255, 255, 255], + type='', + swap='Face_61'), + 132: + dict( + name='Face_68', + id=132, + color=[255, 255, 255], + type='', + swap='Face_60'), + 133: + dict( + name='Face_69', + id=133, + color=[255, 255, 255], + type='', + swap='Face_59'), + 134: + dict( + name='Face_70', + id=134, + color=[255, 255, 255], + type='', + swap='Face_58'), + 135: + dict( + name='Face_71', + id=135, + color=[255, 255, 255], + type='', + swap='Face_57'), + 136: + dict( + name='Face_72', + id=136, + color=[255, 255, 255], + type='', + swap='Face_56'), + }, + skeleton_info={ + 0: 
dict(link=('L_Hip', 'R_Hip'), id=0, color=[0, 255, 0]), + 1: dict(link=('L_Knee', 'R_Knee'), id=1, color=[0, 255, 0]), + 2: dict(link=('L_Ankle', 'R_Ankle'), id=2, color=[0, 255, 0]), + 3: dict(link=('L_Shoulder', 'R_Shoulder'), id=3, color=[0, 255, 0]), + 4: dict(link=('L_Elbow', 'R_Elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('L_Wrist', 'R_Wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('L_Big_toe', 'R_Big_toe'), id=6, color=[0, 255, 0]), + 7: dict(link=('L_Small_toe', 'R_Small_toe'), id=7, color=[0, 255, 0]), + 8: dict(link=('L_Heel', 'R_Heel'), id=8, color=[0, 255, 0]), + 9: dict(link=('L_Ear', 'R_Ear'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_Eye', 'R_Eye'), id=10, color=[0, 255, 0]), + 11: dict(link=('L_Thumb_1', 'R_Thumb_1'), id=11, color=[255, 128, 0]), + 12: dict(link=('L_Thumb_2', 'R_Thumb_2'), id=12, color=[255, 128, 0]), + 13: dict(link=('L_Thumb_3', 'R_Thumb_3'), id=13, color=[255, 128, 0]), + 14: dict(link=('L_Thumb4', 'R_Thumb_4'), id=14, color=[255, 128, 0]), + 15: dict(link=('L_Index_1', 'R_Index_1'), id=15, color=[255, 128, 0]), + 16: dict(link=('L_Index_2', 'R_Index_2'), id=16, color=[255, 128, 0]), + 17: dict(link=('L_Index_3', 'R_Index_3'), id=17, color=[255, 128, 0]), + 18: dict(link=('L_Index_4', 'R_Index_4'), id=18, color=[255, 128, 0]), + 19: + dict(link=('L_Middle_1', 'R_Middle_1'), id=19, color=[255, 128, 0]), + 20: + dict(link=('L_Middle_2', 'R_Middle_2'), id=20, color=[255, 128, 0]), + 21: + dict(link=('L_Middle_3', 'R_Middle_3'), id=21, color=[255, 128, 0]), + 22: + dict(link=('L_Middle_4', 'R_Middle_4'), id=22, color=[255, 128, 0]), + 23: dict(link=('L_Ring_1', 'R_Ring_1'), id=23, color=[255, 128, 0]), + 24: dict(link=('L_Ring_2', 'R_Ring_2'), id=24, color=[255, 128, 0]), + 25: dict(link=('L_Ring_3', 'R_Ring_3'), id=25, color=[255, 128, 0]), + 26: dict(link=('L_Ring_4', 'R_Ring_4'), id=26, color=[255, 128, 0]), + 27: dict(link=('L_Pinky_1', 'R_Pinky_1'), id=27, color=[255, 128, 0]), + 28: dict(link=('L_Pinky_2', 'R_Pinky_2'), id=28, color=[255, 128, 0]), + 29: dict(link=('L_Pinky_3', 'R_Pinky_3'), id=29, color=[255, 128, 0]), + 30: dict(link=('L_Pinky_4', 'R_Pinky_4'), id=30, color=[255, 128, 0]), + 31: dict(link=('Face_3', 'Face_4'), id=31, color=[255, 255, 255]), + 32: dict(link=('Face_5', 'Face_14'), id=32, color=[255, 255, 255]), + 33: dict(link=('Face_6', 'Face_13'), id=33, color=[255, 255, 255]), + 34: dict(link=('Face_7', 'Face_12'), id=34, color=[255, 255, 255]), + 35: dict(link=('Face_8', 'Face_11'), id=35, color=[255, 255, 255]), + 36: dict(link=('Face_9', 'Face_10'), id=36, color=[255, 255, 255]), + 37: dict(link=('Face_19', 'Face_23'), id=37, color=[255, 255, 255]), + 38: dict(link=('Face_20', 'Face_22'), id=38, color=[255, 255, 255]), + 39: dict(link=('Face_24', 'Face_33'), id=39, color=[255, 255, 255]), + 40: dict(link=('Face_25', 'Face_32'), id=40, color=[255, 255, 255]), + 41: dict(link=('Face_26', 'Face_31'), id=41, color=[255, 255, 255]), + 42: dict(link=('Face_27', 'Face_30'), id=42, color=[255, 255, 255]), + 43: dict(link=('Face_28', 'Face_35'), id=43, color=[255, 255, 255]), + 44: dict(link=('Face_29', 'Face_34'), id=44, color=[255, 255, 255]), + 45: dict(link=('Face_36', 'Face_42'), id=45, color=[255, 255, 255]), + 46: dict(link=('Face_37', 'Face_41'), id=46, color=[255, 255, 255]), + 47: dict(link=('Face_38', 'Face_40'), id=47, color=[255, 255, 255]), + 48: dict(link=('Face_43', 'Face_47'), id=48, color=[255, 255, 255]), + 49: dict(link=('Face_44', 'Face_46'), id=49, color=[255, 255, 255]), + 50: dict(link=('Face_48', 
'Face_52'), id=50, color=[255, 255, 255]), + 51: dict(link=('Face_49', 'Face_51'), id=51, color=[255, 255, 255]), + 52: dict(link=('Face_53', 'Face_55'), id=52, color=[255, 255, 255]), + 53: dict(link=('Face_56', 'Face_72'), id=53, color=[255, 255, 255]), + 54: dict(link=('Face_57', 'Face_71'), id=54, color=[255, 255, 255]), + 55: dict(link=('Face_58', 'Face_70'), id=55, color=[255, 255, 255]), + 56: dict(link=('Face_59', 'Face_69'), id=56, color=[255, 255, 255]), + 57: dict(link=('Face_60', 'Face_68'), id=57, color=[255, 255, 255]), + 58: dict(link=('Face_61', 'Face_67'), id=58, color=[255, 255, 255]), + 59: dict(link=('Face_62', 'Face_66'), id=59, color=[255, 255, 255]), + 60: dict(link=('Face_63', 'Face_65'), id=60, color=[255, 255, 255]), + }) From 4dbdbe17581bc750ea57140ccd964cf32e313e50 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 25 Aug 2023 16:13:20 +0800 Subject: [PATCH 05/21] --other=update dataset --- .../datasets/body3d/ubody3d_dataset.py | 334 ++++++------------ .../dataset_converters/ubody_smplx_to_coco.py | 53 +-- 2 files changed, 133 insertions(+), 254 deletions(-) diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index a7aca146ca..dca81ffc26 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -1,18 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp -from collections import defaultdict -from typing import Callable, List, Optional, Sequence, Tuple, Union +from typing import Optional import numpy as np from mmengine.fileio import exists, get_local_path -from mmengine.utils import is_abs +from xtcocotools.coco import COCO -from mmpose.datasets.datasets import BaseMocapDataset +from mmpose.datasets.datasets import BaseCocoStyleDataset from mmpose.registry import DATASETS @DATASETS.register_module() -class UBody3dDataset(BaseMocapDataset): +class UBody3dDataset(BaseCocoStyleDataset): """Ubody3d dataset for 3D human pose estimation. "One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer", @@ -29,37 +28,11 @@ class UBody3dDataset(BaseMocapDataset): Args: ann_file (str): Annotation file path. Default: ''. - seq_len (int): Number of frames in a sequence. Default: 1. - seq_step (int): The interval for extracting frames from the video. - Default: 1. - multiple_target (int): If larger than 0, merge every - ``multiple_target`` sequence together. Default: 0. - multiple_target_step (int): The interval for merging sequence. Only - valid when ``multiple_target`` is larger than 0. Default: 0. - pad_video_seq (bool): Whether to pad the video so that poses will be - predicted for every frame in the video. Default: ``False``. - causal (bool): If set to ``True``, the rightmost input frame will be - the target frame. Otherwise, the middle input frame will be the - target frame. Default: ``True``. - subset_frac (float): The fraction to reduce dataset size. If set to 1, - the dataset size is not reduced. Default: 1. - keypoint_2d_src (str): Specifies 2D keypoint information options, which - should be one of the following options: - - - ``'gt'``: load from the annotation file - - ``'detection'``: load from a detection - result file of 2D keypoint - - 'pipeline': the information will be generated by the pipeline - - Default: ``'gt'``. - keypoint_2d_det_file (str, optional): The 2D keypoint detection file. - If set, 2d keypoint loaded from this file will be used instead of - ground-truth keypoints. 
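
As a sanity check on the metainfo defined above, the symmetric `swap` entries can be turned into flip indices with a small helper. `build_flip_indices` is a hypothetical illustration, not part of the MMPose API; `keypoint_info` is the dict from `configs/_base_/datasets/ubody3d.py`.

```python
def build_flip_indices(keypoint_info):
    """Map each keypoint id to the id of its horizontal-flip counterpart.

    Keypoints with an empty ``swap`` field map to themselves. Illustrative
    helper only, not the MMPose parsing code.
    """
    name_to_id = {info['name']: kid for kid, info in keypoint_info.items()}
    flip_indices = []
    for kid in sorted(keypoint_info):
        swap_name = keypoint_info[kid]['swap']
        flip_indices.append(name_to_id[swap_name] if swap_name else kid)
    return flip_indices

# e.g. build_flip_indices(keypoint_info)[1] == 2   (L_Hip <-> R_Hip)
```
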
This setting is only when - ``keypoint_2d_src`` is ``'detection'``. Default: ``None``. - factor_file (str, optional): The projection factors' file. If set, - factor loaded from this file will be used instead of calculated - factors. Default: ``None``. - camera_param_file (str): Cameras' parameters file. Default: ``None``. + bbox_file (str, optional): Detection result file path. If + ``bbox_file`` is set, detected bboxes loaded from this file will + be used instead of ground-truth bboxes. This setting is only for + evaluation, i.e., ignored when ``test_mode`` is ``False``. + Default: ``None``. data_mode (str): Specifies the mode of data samples: ``'topdown'`` or ``'bottomup'``. In ``'topdown'`` mode, each data sample contains one instance; while in ``'bottomup'`` mode, each data sample @@ -89,214 +62,117 @@ class UBody3dDataset(BaseMocapDataset): max_refetch (int, optional): If ``Basedataset.prepare_data`` get a None img. The maximum extra number of cycles to get a valid image. Default: 1000. + sample_interval (int, optional): The sample interval of the dataset. + Default: 1. """ METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py') - SUPPORTED_keypoint_2d_src = {'gt', 'detection', 'pipeline'} - - def __init__(self, - ann_file: str = '', - seq_len: int = 1, - seq_step: int = 1, - multiple_target: int = 0, - multiple_target_step: int = 0, - pad_video_seq: bool = False, - causal: bool = True, - subset_frac: float = 1.0, - keypoint_2d_src: str = 'gt', - keypoint_2d_det_file: Optional[str] = None, - factor_file: Optional[str] = None, - camera_param_file: Optional[str] = None, - data_mode: str = 'topdown', - metainfo: Optional[dict] = None, - data_root: Optional[str] = None, - data_prefix: dict = dict(img=''), - filter_cfg: Optional[dict] = None, - indices: Optional[Union[int, Sequence[int]]] = None, - serialize_data: bool = True, - pipeline: List[Union[dict, Callable]] = [], - test_mode: bool = False, - lazy_init: bool = False, - max_refetch: int = 1000): - # check keypoint_2d_src - self.keypoint_2d_src = keypoint_2d_src - if self.keypoint_2d_src not in self.SUPPORTED_keypoint_2d_src: - raise ValueError( - f'Unsupported `keypoint_2d_src` "{self.keypoint_2d_src}". ' - f'Supported options are {self.SUPPORTED_keypoint_2d_src}') - - if keypoint_2d_det_file: - if not is_abs(keypoint_2d_det_file): - self.keypoint_2d_det_file = osp.join(data_root, - keypoint_2d_det_file) - else: - self.keypoint_2d_det_file = keypoint_2d_det_file - self.seq_step = seq_step - self.pad_video_seq = pad_video_seq + def _load_annotations(self): + """Load data from annotations in COCO format.""" + + assert exists(self.ann_file), ( + f'Annotation file `{self.ann_file}`does not exist') + + with get_local_path(self.ann_file) as local_path: + self.coco = COCO(local_path) + # set the metainfo about categories, which is a list of dict + # and each dict contains the 'id', 'name', etc. 
about this category + self._metainfo['CLASSES'] = self.coco.loadCats(self.coco.getCatIds()) + + instance_list = [] + image_list = [] + + for img_id in self.coco.getImgIds(): + if img_id % self.sample_interval != 0: + continue + img = self.coco_loadImgs(img_id)[0] + img.update({ + 'img_id': + img_id, + 'img_path': + osp.join(self.data_prefix['img'], img['file_name']), + }) + image_list.append(img) + + ann_ids = self.coco.getAnnIds(imgIds=img_id) + for ann in self.coco.loadAnns(ann_ids): + if instance_info := self.parse_data_info( + dict(raw_ann_info=ann, raw_img_info=img)): + instance_list.append(instance_info) - if factor_file: - if not is_abs(factor_file): - factor_file = osp.join(data_root, factor_file) - assert exists(factor_file), (f'`factor_file`: {factor_file}' - 'does not exist.') - self.factor_file = factor_file + return instance_list, image_list - if multiple_target > 0 and multiple_target_step == 0: - multiple_target_step = multiple_target - self.multiple_target_step = multiple_target_step + def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: + """Parse raw COCO annotation of an instance. - super().__init__( - ann_file=ann_file, - seq_len=seq_len, - multiple_target=multiple_target, - causal=causal, - subset_frac=subset_frac, - camera_param_file=camera_param_file, - data_mode=data_mode, - metainfo=metainfo, - data_root=data_root, - data_prefix=data_prefix, - filter_cfg=filter_cfg, - indices=indices, - serialize_data=serialize_data, - pipeline=pipeline, - test_mode=test_mode, - lazy_init=lazy_init, - max_refetch=max_refetch) + Args: + raw_data_info (dict): Raw data information loaded from + ``ann_file``. It should have following contents: - def get_sequence_indices(self) -> List[List[int]]: - """Split original videos into sequences and build frame indices. + - ``'raw_ann_info'``: Raw annotation of an instance + - ``'raw_img_info'``: Raw information of the image that + contains the instance - This method overrides the default one in the base class. + Returns: + dict | None: Parsed instance annotation """ - imgnames = self.ann_data['imgname'] - video_frames = defaultdict(list) - for idx, imgname in enumerate(imgnames): - subj, action, camera = self._parse_h36m_imgname(imgname) - video_frames[(subj, action, camera)].append(idx) - - # build sample indices - sequence_indices = [] - _len = (self.seq_len - 1) * self.seq_step + 1 - _step = self.seq_step - - if self.multiple_target: - for _, _indices in sorted(video_frames.items()): - n_frame = len(_indices) - seqs_from_video = [ - _indices[i:(i + self.multiple_target):_step] - for i in range(0, n_frame, self.multiple_target_step) - ][:(n_frame + self.multiple_target_step - - self.multiple_target) // self.multiple_target_step] - sequence_indices.extend(seqs_from_video) - - else: - for _, _indices in sorted(video_frames.items()): - n_frame = len(_indices) - if self.pad_video_seq: - # Pad the sequence so that every frame in the sequence will - # be predicted. 
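
A minimal standalone sketch (not part of the patch) of the ``pad_video_seq`` branch below: with a non-causal window, ``seq_len=3`` and ``seq_step=1``, every frame becomes the centre of a padded sequence, with the first/last frame repeated at the borders.

def padded_windows(indices, seq_len=3, step=1, causal=False):
    # mirrors the padding logic of get_sequence_indices()
    n = len(indices)
    left = seq_len - 1 if causal else (seq_len - 1) // 2
    right = 0 if causal else left
    windows = []
    for i in range(n):
        pad_l = max(0, left - i // step)
        pad_r = max(0, right - (n - 1 - i) // step)
        start = max(i % step, i - left * step)
        end = min(n - (n - 1 - i) % step, i + right * step + 1)
        windows.append([indices[0]] * pad_l
                       + indices[start:end:step]
                       + [indices[-1]] * pad_r)
    return windows

# padded_windows([0, 1, 2, 3, 4])
# -> [[0, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 4]]
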
- if self.causal: - frames_left = self.seq_len - 1 - frames_right = 0 - else: - frames_left = (self.seq_len - 1) // 2 - frames_right = frames_left - for i in range(n_frame): - pad_left = max(0, frames_left - i // _step) - pad_right = max( - 0, frames_right - (n_frame - 1 - i) // _step) - start = max(i % _step, i - frames_left * _step) - end = min(n_frame - (n_frame - 1 - i) % _step, - i + frames_right * _step + 1) - sequence_indices.append([_indices[0]] * pad_left + - _indices[start:end:_step] + - [_indices[-1]] * pad_right) - else: - seqs_from_video = [ - _indices[i:(i + _len):_step] - for i in range(0, n_frame - _len + 1) - ] - sequence_indices.extend(seqs_from_video) + ann = raw_data_info['raw_ann_info'] + if 'bbox' not in ann or 'keypoints3d' not in ann: + return None - # reduce dataset size if needed - subset_size = int(len(sequence_indices) * self.subset_frac) - start = np.random.randint(0, len(sequence_indices) - subset_size + 1) - end = start + subset_size + img = raw_data_info['raw_img_info'] + img_w, img_h = img['width'], img['height'] - sequence_indices = sequence_indices[start:end] + # get bbox in shape [1, 4], formatted as xywh + x, y, w, h = ann['bbox'] + x1 = np.clip(x, 0, img_w - 1) + y1 = np.clip(y, 0, img_h - 1) + x2 = np.clip(x + w, 0, img_w - 1) + y2 = np.clip(y + h, 0, img_h - 1) - return sequence_indices + bbox = np.array([x1, y1, x2, y2], dtype=np.float32).reshape(1, 4) - def _load_annotations(self) -> Tuple[List[dict], List[dict]]: - instance_list, image_list = super()._load_annotations() + # keypoints in shape [1, K, 2] and keypoints_visible in [1, K] + _keypoints = np.array( + ann['keypoints'], dtype=np.float32).reshape(1, -1, 3) + keypoints = _keypoints[..., :2] + keypoints_visible = np.minimum(1, _keypoints[..., 2]) - h36m_data = self.ann_data - kpts_3d = h36m_data['S'] + _keypoints_3d = np.array( + ann['keypoints3d'], dtype=np.float32).reshape(1, -1, 4) + keypoints_3d = _keypoints_3d[..., :3] + keypoints_3d_visible = keypoints_visible - if self.keypoint_2d_src == 'detection': - assert exists(self.keypoint_2d_det_file), ( - f'`keypoint_2d_det_file`: `{self.keypoint_2d_det_file}`' - 'does not exist.') - kpts_2d = self._load_keypoint_2d_detection( - self.keypoint_2d_det_file) - assert kpts_2d.shape[0] == kpts_3d.shape[0], ( - f'Number of `kpts_2d` ({kpts_2d.shape[0]}) does not match ' - f'number of `kpts_3d` ({kpts_3d.shape[0]}).') - - assert kpts_2d.shape[2] == 3, ( - f'Expect `kpts_2d.shape[2]` == 3, but got ' - f'{kpts_2d.shape[2]}. 
Please check the format of ' - f'{self.keypoint_2d_det_file}') - - for idx, frame_ids in enumerate(self.sequence_indices): - kpt_2d = kpts_2d[frame_ids].astype(np.float32) - keypoints = kpt_2d[..., :2] - keypoints_visible = kpt_2d[..., 2] - instance_list[idx].update({ - 'keypoints': - keypoints, - 'keypoints_visible': - keypoints_visible - }) - if self.factor_file: - with get_local_path(self.factor_file) as local_path: - factors = np.load(local_path).astype(np.float32) + if 'num_keypoints' in ann: + num_keypoints = ann['num_keypoints'] else: - factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) - assert factors.shape[0] == kpts_3d.shape[0], ( - f'Number of `factors` ({factors.shape[0]}) does not match ' - f'number of `kpts_3d` ({kpts_3d.shape[0]}).') - - for idx, frame_ids in enumerate(self.sequence_indices): - factor = factors[frame_ids].astype(np.float32) - instance_list[idx].update({'factor': factor}) - - return instance_list, image_list - - @staticmethod - def _parse_h36m_imgname(imgname) -> Tuple[str, str, str]: - """Parse imgname to get information of subject, action and camera. - - A typical h36m image filename is like: - S1_Directions_1.54138969_000001.jpg - """ - subj, rest = osp.basename(imgname).split('_', 1) - action, rest = rest.split('.', 1) - camera, rest = rest.split('_', 1) - return subj, action, camera - - def get_camera_param(self, imgname) -> dict: - """Get camera parameters of a frame by its image name.""" - assert hasattr(self, 'camera_param') - subj, _, camera = self._parse_h36m_imgname(imgname) - return self.camera_param[(subj, camera)] - - def _load_keypoint_2d_detection(self, det_file): - """"Load 2D joint detection results from file.""" - with get_local_path(det_file) as local_path: - kpts_2d = np.load(local_path).astype(np.float32) - - return kpts_2d + num_keypoints = np.count_nonzero(keypoints.max(axis=2)) + + scale = ann.get('scale', 0.0) + center = ann.get('center', np.array([0.0, 0.0])) + + instance_info = { + 'num_keypoints': num_keypoints, + 'keypoints': keypoints, + 'keypoints_visible': keypoints_visible, + 'keypoints_3d': keypoints_3d, + 'keypoints_3d_visible': keypoints_3d_visible, + 'bbox': bbox, + 'bbox_score': np.ones(1, dtype=np.float32), + 'scale': scale, + 'center': center, + 'id': ann['id'], + 'category_id': 1, + 'iscrowd': ann.get('iscrowd', 0), + 'segmentation': ann.get('segmentation', None), + 'img_path': img['img_path'], + 'img_id': ann['image_id'], + 'lifting_target': keypoints_3d[[-1]], + 'lifting_target_visible': keypoints_3d_visible[[-1]], + 'target_img_path': img['img_path'], + } + if 'crowdIndex' in img: + instance_info['crowd_index'] = img['crowdIndex'] + return instance_info diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index 8bb796377f..ac8b55683f 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -37,6 +37,8 @@ def __init__(self, human_model_path): use_pca=False, use_face_contour=True, **self.layer_args) + if torch.cuda.is_available(): + self.neutral_model = self.neutral_model.to('cuda:0') self.vertex_num = 10475 self.face = self.neutral_model.faces @@ -254,10 +256,10 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, dtype=np.float32) coord_valid = np.ones((human_model.joint_num), dtype=np.float32) - root_pose, body_pose, shape, trans = (human_model_param['root_pose'], - human_model_param['body_pose'], - human_model_param['shape'], - human_model_param['trans']) + 
root_pose = human_model_param['root_pose'] + body_pose = human_model_param['body_pose'] + shape = human_model_param['shape'] + trans = human_model_param['trans'] if 'lhand_pose' in human_model_param and human_model_param.get( 'lhand_valid', False): @@ -291,15 +293,18 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, coord_valid[human_model.orig_joint_part['face']] = 0 # init human model inputs - root_pose = torch.FloatTensor(root_pose).view(1, 3) - body_pose = torch.FloatTensor(body_pose).view(-1, 3) - lhand_pose = torch.FloatTensor(lhand_pose).view(-1, 3) - rhand_pose = torch.FloatTensor(rhand_pose).view(-1, 3) - jaw_pose = torch.FloatTensor(jaw_pose).view(-1, 3) - shape = torch.FloatTensor(shape).view(1, -1) - expr = torch.FloatTensor(expr).view(1, -1) - trans = torch.FloatTensor(trans).view(1, -1) - zero_pose = torch.zeros((1, 3), dtype=torch.float32) + device = torch.device( + 'cuda') if torch.cuda.is_available() else torch.device('cpu') + root_pose = torch.FloatTensor(root_pose, device=device).view(1, 3) + body_pose = torch.FloatTensor(body_pose, device=device).view(-1, 3) + lhand_pose = torch.FloatTensor(lhand_pose, device=device).view(-1, 3) + rhand_pose = torch.FloatTensor(rhand_pose, device=device).view(-1, 3) + jaw_pose = torch.FloatTensor(jaw_pose, device=device).view(-1, 3) + shape = torch.FloatTensor(shape, device=device).view(1, -1) + expr = torch.FloatTensor(expr, device=device).view(1, -1) + trans = torch.FloatTensor(trans, device=device).view(1, -1) + zero_pose = torch.zeros((1, 3), dtype=torch.float32, device=device) + with torch.no_grad(): output = human_model.neutral_model( betas=shape, @@ -346,23 +351,21 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, 2] = ((joint_cam[human_model.joint_part['face'], 2].copy() / (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) - keypoints_2d = joint_img[:, :2].copy() - ann_3d = { - **ann, - 'keypoints_3d': joint_cam.tolist(), - } - ann_3d['keypoints'] = keypoints_2d.tolist() - + keypoints_2d = np.concatenate([joint_img[:, :2].copy(), coord_valid], + axis=1) + keypoints_3d = np.concatenate([joint_img, coord_valid], axis=1) + ann['keypoints'] = keypoints_2d.tolist() + ann['keypoints_3d'] = keypoints_3d.tolist() img['file_name'] = os.path.join(scene, file_name) if video_name in splits: - val_annos.append(ann_3d) + val_annos.append(ann) val_imgs.append(img) else: - train_annos.append(ann_3d) + train_annos.append(ann) train_imgs.append(img) progress_bar.update() - categoreis = [{ + categories = [{ 'supercategory': 'person', 'id': 1, 'name': 'person', @@ -372,12 +375,12 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, train_data = { 'images': train_imgs, 'annotations': train_annos, - 'categories': categoreis + 'categories': categories } val_data = { 'images': val_imgs, 'annotations': val_annos, - 'categories': categoreis + 'categories': categories } mmengine.dump( From c04c835119f28d4947fc90607fe26fd3f54a90d2 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 25 Aug 2023 16:21:53 +0800 Subject: [PATCH 06/21] --other=update config --- ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 72 ++----------------- 1 file changed, 5 insertions(+), 67 deletions(-) diff --git a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py index 132038ba93..0b418c6607 100644 --- a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py 
+++ b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py @@ -1,4 +1,4 @@ -_base_ = ['../../../_base_/default_runtime.py'] +_base_ = ['../../_base_/default_runtime.py'] vis_backends = [ dict(type='LocalVisBackend'), @@ -28,76 +28,15 @@ max_keep_ckpts=1)) # codec settings -# 3D keypoint normalization parameters -# From file: '{data_root}/annotation_body3d/fps50/joint3d_rel_stats.pkl' -target_mean = [[-2.55652589e-04, -7.11960570e-03, -9.81433052e-04], - [-5.65463051e-03, 3.19636009e-01, 7.19329269e-02], - [-1.01705840e-02, 6.91147892e-01, 1.55352986e-01], - [2.55651315e-04, 7.11954606e-03, 9.81423866e-04], - [-5.09729780e-03, 3.27040413e-01, 7.22258095e-02], - [-9.99656606e-03, 7.08277383e-01, 1.58016408e-01], - [2.90583676e-03, -2.11363307e-01, -4.74210915e-02], - [5.67537804e-03, -4.35088906e-01, -9.76974016e-02], - [5.93884964e-03, -4.91891970e-01, -1.10666618e-01], - [7.37352083e-03, -5.83948619e-01, -1.31171400e-01], - [5.41920653e-03, -3.83931702e-01, -8.68145417e-02], - [2.95964662e-03, -1.87567488e-01, -4.34536934e-02], - [1.26585822e-03, -1.20170579e-01, -2.82526049e-02], - [4.67186639e-03, -3.83644089e-01, -8.55125784e-02], - [1.67648571e-03, -1.97007177e-01, -4.31368364e-02], - [8.70569015e-04, -1.68664569e-01, -3.73902498e-02]], -target_std = [[0.11072244, 0.02238818, 0.07246294], - [0.15856311, 0.18933832, 0.20880479], - [0.19179935, 0.24320062, 0.24756193], - [0.11072181, 0.02238805, 0.07246253], - [0.15880454, 0.19977188, 0.2147063], - [0.18001944, 0.25052739, 0.24853247], - [0.05210694, 0.05211406, 0.06908241], - [0.09515367, 0.10133032, 0.12899733], - [0.11742458, 0.12648469, 0.16465091], - [0.12360297, 0.13085539, 0.16433336], - [0.14602232, 0.09707956, 0.13952731], - [0.24347532, 0.12982249, 0.20230181], - [0.2446877, 0.21501816, 0.23938235], - [0.13876084, 0.1008926, 0.1424411], - [0.23687529, 0.14491219, 0.20980829], - [0.24400695, 0.23975028, 0.25520584]] -# 2D keypoint normalization parameters -# From file: '{data_root}/annotation_body3d/fps50/joint2d_stats.pkl' -keypoints_mean = [[532.08351635, 419.74137558], [531.80953144, 418.2607141], - [530.68456967, 493.54259285], [529.36968722, 575.96448516], - [532.29767646, 421.28483336], [531.93946631, 494.72186795], - [529.71984447, 578.96110365], [532.93699382, 370.65225054], - [534.1101856, 317.90342311], [534.55416813, 304.24143901], - [534.86955004, 282.31030885], [534.11308566, 330.11296796], - [533.53637525, 376.2742511], [533.49380107, 391.72324565], - [533.52579142, 330.09494668], [532.50804964, 374.190479], - [532.72786934, 380.61615716]], -keypoints_std = [[107.73640054, 63.35908715], [119.00836213, 64.1215443], - [119.12412107, 50.53806215], [120.61688045, 56.38444891], - [101.95735275, 62.89636486], [106.24832897, 48.41178119], - [108.46734966, 54.58177071], [109.07369806, 68.70443672], - [111.20130351, 74.87287863], [111.63203838, 77.80542514], - [113.22330788, 79.90670556], [105.7145833, 73.27049436], - [107.05804267, 73.93175781], [107.97449418, 83.30391802], - [121.60675105, 74.25691526], [134.34378973, 77.48125087], - [131.79990652, 89.86721124]] codec = dict( - type='ImagePoseLifting', - num_keypoints=17, - root_index=0, - remove_root=True, - target_mean=target_mean, - target_std=target_std, - keypoints_mean=keypoints_mean, - keypoints_std=keypoints_std) + type='ImagePoseLifting', num_keypoints=137, root_index=0, remove_root=True) # model settings model = dict( type='PoseLifter', backbone=dict( type='TCN', - in_channels=2 * 17, + in_channels=2 * 137, stem_channels=1024, 
num_blocks=2, kernel_sizes=(1, 1, 1), @@ -106,7 +45,7 @@ head=dict( type='TemporalRegressionHead', in_channels=1024, - num_joints=16, + num_joints=137, loss=dict(type='MSELoss'), decoder=codec, )) @@ -152,8 +91,7 @@ dict( type='PackPoseInputs', meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', - 'target_root', 'target_root_index', 'target_mean', - 'target_std')) + 'target_root', 'target_root_index')) ] val_pipeline = train_pipeline From 5796f9c323e932028b390616bec75cac21b442cb Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 25 Aug 2023 16:38:57 +0800 Subject: [PATCH 07/21] --fix=fix script --- .../dataset_converters/ubody_smplx_to_coco.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index ac8b55683f..2598b9a469 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -214,11 +214,12 @@ def cam2pixel(cam_coord, f, c): def process_scene_anno(scene: str, annotation_root: str, splits: np.array, - human_model: SMPLX): + human_model_path: str): annos = read_annotation_file( osp.join(annotation_root, scene, 'smplx_annotation.json')) keypoint_annos = COCO( osp.join(annotation_root, scene, 'keypoint_annotation.json')) + human_model = SMPLX(human_model_path) train_annos = [] val_annos = [] @@ -294,15 +295,15 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, # init human model inputs device = torch.device( - 'cuda') if torch.cuda.is_available() else torch.device('cpu') - root_pose = torch.FloatTensor(root_pose, device=device).view(1, 3) - body_pose = torch.FloatTensor(body_pose, device=device).view(-1, 3) - lhand_pose = torch.FloatTensor(lhand_pose, device=device).view(-1, 3) - rhand_pose = torch.FloatTensor(rhand_pose, device=device).view(-1, 3) - jaw_pose = torch.FloatTensor(jaw_pose, device=device).view(-1, 3) - shape = torch.FloatTensor(shape, device=device).view(1, -1) - expr = torch.FloatTensor(expr, device=device).view(1, -1) - trans = torch.FloatTensor(trans, device=device).view(1, -1) + 'cuda:0') if torch.cuda.is_available() else torch.device('cpu') + root_pose = torch.FloatTensor(root_pose).to(device).view(1, 3) + body_pose = torch.FloatTensor(body_pose).to(device).view(-1, 3) + lhand_pose = torch.FloatTensor(lhand_pose).to(device).view(-1, 3) + rhand_pose = torch.FloatTensor(rhand_pose).to(device).view(-1, 3) + jaw_pose = torch.FloatTensor(jaw_pose).to(device).view(-1, 3) + shape = torch.FloatTensor(shape).to(device).view(1, -1) + expr = torch.FloatTensor(expr).to(device).view(1, -1) + trans = torch.FloatTensor(trans).to(device).view(1, -1) zero_pose = torch.zeros((1, 3), dtype=torch.float32, device=device) with torch.no_grad(): @@ -318,7 +319,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, reye_pose=zero_pose, expression=expr) - joint_cam = output.joints[0].numpy()[human_model.joint_idx, :] + joint_cam = output.joints[0].cpu().numpy()[human_model.joint_idx, :] joint_img = cam2pixel(joint_cam, cam_param['focal'], cam_param['princpt']) @@ -351,6 +352,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, 2] = ((joint_cam[human_model.joint_part['face'], 2].copy() / (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + coord_valid = coord_valid.reshape((-1, 1)) keypoints_2d = np.concatenate([joint_img[:, :2].copy(), coord_valid], axis=1) keypoints_3d = np.concatenate([joint_img, 
coord_valid], axis=1) @@ -403,7 +405,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, annotation_path = f'{args.data_root}/annotations' folders = os.listdir(annotation_path) - human_model = SMPLX(args.human_model_path) + human_model_path = args.human_model_path splits = np.load(split_path) if args.nproc > 1: @@ -412,11 +414,11 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, process_scene_anno, annotation_root=annotation_path, splits=splits, - human_model=human_model), folders, args.nproc) + human_model_path=human_model_path), folders, args.nproc) else: mmengine.track_progress( partial( process_scene_anno, annotation_root=annotation_path, splits=splits, - human_model=human_model), folders) + human_model_path=human_model_path), folders) From 273ef8446079bca9659c5a56d0c0996d788ccde0 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 25 Aug 2023 17:09:34 +0800 Subject: [PATCH 08/21] --other=fix --- tools/dataset_converters/ubody_smplx_to_coco.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index 2598b9a469..99bbfa3fda 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -244,6 +244,9 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, if not os.path.exists(img_path): progress_bar.update() continue + if aid not in annos: + progress_bar.update() + continue smplx_param = annos[str(aid)] human_model_param = smplx_param['smplx_param'] From d78287c934e66bcbe19046cec3221052bc973d40 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 29 Aug 2023 15:22:56 +0800 Subject: [PATCH 09/21] --feat=add simple mpjpe metric --- configs/_base_/datasets/ubody3d.py | 4 +- ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 6 +- .../datasets/body3d/ubody3d_dataset.py | 6 +- mmpose/evaluation/metrics/__init__.py | 3 +- .../metrics/simple_keypoint_3d_metrics.py | 119 ++++++++++++++++++ 5 files changed, 130 insertions(+), 8 deletions(-) create mode 100644 mmpose/evaluation/metrics/simple_keypoint_3d_metrics.py diff --git a/configs/_base_/datasets/ubody3d.py b/configs/_base_/datasets/ubody3d.py index a971ef4614..e2dfe0c570 100644 --- a/configs/_base_/datasets/ubody3d.py +++ b/configs/_base_/datasets/ubody3d.py @@ -937,4 +937,6 @@ 58: dict(link=('Face_61', 'Face_67'), id=58, color=[255, 255, 255]), 59: dict(link=('Face_62', 'Face_66'), id=59, color=[255, 255, 255]), 60: dict(link=('Face_63', 'Face_65'), id=60, color=[255, 255, 255]), - }) + }, + joint_weights=[1.] 
* 137, + sigmas=[]) diff --git a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py index 0b418c6607..751905b79e 100644 --- a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py +++ b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py @@ -45,7 +45,7 @@ head=dict( type='TemporalRegressionHead', in_channels=1024, - num_joints=137, + num_joints=136, loss=dict(type='MSELoss'), decoder=codec, )) @@ -124,7 +124,7 @@ # evaluators val_evaluator = [ - dict(type='MPJPE', mode='mpjpe'), - dict(type='MPJPE', mode='p-mpjpe') + dict(type='SimpleMPJPE', mode='mpjpe'), + dict(type='SimpleMPJPE', mode='p-mpjpe') ] test_evaluator = val_evaluator diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index dca81ffc26..468553abf5 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -86,7 +86,7 @@ def _load_annotations(self): for img_id in self.coco.getImgIds(): if img_id % self.sample_interval != 0: continue - img = self.coco_loadImgs(img_id)[0] + img = self.coco.loadImgs(img_id)[0] img.update({ 'img_id': img_id, @@ -119,7 +119,7 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: """ ann = raw_data_info['raw_ann_info'] - if 'bbox' not in ann or 'keypoints3d' not in ann: + if 'bbox' not in ann or 'keypoints_3d' not in ann: return None img = raw_data_info['raw_img_info'] @@ -141,7 +141,7 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: keypoints_visible = np.minimum(1, _keypoints[..., 2]) _keypoints_3d = np.array( - ann['keypoints3d'], dtype=np.float32).reshape(1, -1, 4) + ann['keypoints_3d'], dtype=np.float32).reshape(1, -1, 4) keypoints_3d = _keypoints_3d[..., :3] keypoints_3d_visible = keypoints_visible diff --git a/mmpose/evaluation/metrics/__init__.py b/mmpose/evaluation/metrics/__init__.py index ac7e21b5cc..2c2843cebe 100644 --- a/mmpose/evaluation/metrics/__init__.py +++ b/mmpose/evaluation/metrics/__init__.py @@ -6,9 +6,10 @@ from .keypoint_3d_metrics import MPJPE from .keypoint_partition_metric import KeypointPartitionMetric from .posetrack18_metric import PoseTrack18Metric +from .simple_keypoint_3d_metrics import SimpleMPJPE __all__ = [ 'CocoMetric', 'PCKAccuracy', 'MpiiPCKAccuracy', 'JhmdbPCKAccuracy', 'AUC', 'EPE', 'NME', 'PoseTrack18Metric', 'CocoWholeBodyMetric', - 'KeypointPartitionMetric', 'MPJPE' + 'KeypointPartitionMetric', 'MPJPE', 'SimpleMPJPE' ] diff --git a/mmpose/evaluation/metrics/simple_keypoint_3d_metrics.py b/mmpose/evaluation/metrics/simple_keypoint_3d_metrics.py new file mode 100644 index 0000000000..dc0065d5b9 --- /dev/null +++ b/mmpose/evaluation/metrics/simple_keypoint_3d_metrics.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger + +from mmpose.registry import METRICS +from ..functional import keypoint_mpjpe + + +@METRICS.register_module() +class SimpleMPJPE(BaseMetric): + """MPJPE evaluation metric. + + Calculate the mean per-joint position error (MPJPE) of keypoints. + + Note: + - length of dataset: N + - num_keypoints: K + - number of keypoint dimensions: D (typically D = 2) + + Args: + mode (str): Method to align the prediction with the + ground truth. 
Supported options are: + + - ``'mpjpe'``: no alignment will be applied + - ``'p-mpjpe'``: align in the least-square sense in scale + - ``'n-mpjpe'``: align in the least-square sense in + scale, rotation, and translation. + + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be ``'cpu'`` or + ``'gpu'``. Default: ``'cpu'``. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, ``self.default_prefix`` + will be used instead. Default: ``None``. + skip_list (list, optional): The list of subject and action combinations + to be skipped. Default: []. + """ + + ALIGNMENT = {'mpjpe': 'none', 'p-mpjpe': 'procrustes', 'n-mpjpe': 'scale'} + + def __init__(self, + mode: str = 'mpjpe', + collect_device: str = 'cpu', + prefix: Optional[str] = None, + skip_list: List[str] = []) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + allowed_modes = self.ALIGNMENT.keys() + if mode not in allowed_modes: + raise KeyError("`mode` should be 'mpjpe', 'p-mpjpe', or " + f"'n-mpjpe', but got '{mode}'.") + + self.mode = mode + self.skip_list = skip_list + + def process(self, data_batch: Sequence[dict], + data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (Sequence[dict]): A batch of data + from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from + the model. + """ + for data_sample in data_samples: + # predicted keypoints coordinates, [T, K, D] + pred_coords = data_sample['pred_instances']['keypoints'] + if pred_coords.ndim == 4: + pred_coords = np.squeeze(pred_coords, axis=0) + # ground truth data_info + gt = data_sample['gt_instances'] + # ground truth keypoints coordinates, [T, K, D] + gt_coords = gt['lifting_target'] + # ground truth keypoints_visible, [T, K, 1] + mask = gt['lifting_target_visible'].astype(bool).reshape( + gt_coords.shape[0], -1) + + result = { + 'pred_coords': pred_coords, + 'gt_coords': gt_coords, + 'mask': mask, + } + + self.results.append(result) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are the corresponding results. 
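
For reference, a rough standalone equivalent of the unaligned ``'mpjpe'`` mode (the ``'p-mpjpe'`` and ``'n-mpjpe'`` modes additionally align the prediction to the ground truth before measuring); this is an illustrative sketch, not the ``keypoint_mpjpe`` implementation itself.

import numpy as np

def naive_mpjpe(pred, gt, mask):
    # pred, gt: [N, K, 3] keypoint coordinates; mask: [N, K] boolean visibility
    # mean Euclidean distance over the visible joints only
    return np.linalg.norm(pred - gt, axis=-1)[mask].mean()
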
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + # pred_coords: [N, K, D] + pred_coords = np.concatenate( + [result['pred_coords'] for result in results]) + # gt_coords: [N, K, D] + gt_coords = np.concatenate([result['gt_coords'] for result in results]) + # mask: [N, K] + mask = np.concatenate([result['mask'] for result in results]) + + error_name = self.mode.upper() + + logger.info(f'Evaluating {self.mode.upper()}...') + return { + error_name: + keypoint_mpjpe(pred_coords, gt_coords, mask, + self.ALIGNMENT[self.mode]) + } From 9f9b9937a4439717168e9b83f3dd5e6bff8611d1 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 5 Sep 2023 14:58:01 +0800 Subject: [PATCH 10/21] --other=convert to coco and h36m --- .../dataset_converters/ubody_smplx_to_coco.py | 66 +++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index 99bbfa3fda..a3af141c4d 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -355,12 +355,70 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, 2] = ((joint_cam[human_model.joint_part['face'], 2].copy() / (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) - coord_valid = coord_valid.reshape((-1, 1)) - keypoints_2d = np.concatenate([joint_img[:, :2].copy(), coord_valid], - axis=1) - keypoints_3d = np.concatenate([joint_img, coord_valid], axis=1) + keypoints_2d = joint_img[:, :2].copy() + keypoints_3d = joint_img.copy() + keypoints_valid = coord_valid.reshape((-1, 1)) + + # map to COCO keypoint order + coco_keypoint_names = [ + 'Nose', 'L_Eye', 'R_Eye', 'L_Ear', 'R_Ear', 'L_Shoulder', + 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Hip', + 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle', 'R_Ankle' + ] + coco_keypoint_idx = [ + human_model.joints_name.index(name) for name in coco_keypoint_names + ] + coco_3d_keypoints = keypoints_3d[coco_keypoint_idx, :] + coco_2d_keypoints = keypoints_2d[coco_keypoint_idx, :] + coco_keypoint_valid = keypoints_valid[coco_keypoint_idx, :] + + # map to H36M keypoint order + h36m_keypoint_names = [ + 'Pelvis', 'R_Hip', 'R_Knee', 'R_Ankle', 'L_Hip', 'L_Knee', + 'L_Ankle', 'Neck', 'L_Shoulder', 'L_Elbow', 'L_Wrist', + 'R_Shoulder', 'R_Elbow', 'R_Wrist' + ] + # interplate 'spine' and 'neck_base' + pelvis_id = human_model.joints_name.index('Pelvis') + neck_id = human_model.joints_name.index('Neck') + nose_id = human_model.joints_name.index('Nose') + spine_3d = (keypoints_3d[pelvis_id, :] + keypoints_3d[neck_id, :]) / 2 + neck_base_3d = (keypoints_3d[neck_id, :] + + keypoints_3d[nose_id, :]) / 2 + spine_valid = keypoints_valid[pelvis_id] * keypoints_valid[neck_id] + neck_base_valid = keypoints_valid[neck_id] * keypoints_valid[nose_id] + + h36m_keypoint_idx = [ + human_model.joints_name.index(name) for name in h36m_keypoint_names + ] + h36m_3d_keypoints = keypoints_3d[h36m_keypoint_idx, :] + h36m_2d_keypoints = keypoints_2d[h36m_keypoint_idx, :] + h36m_3d_keypoints = np.concatenate([ + h36m_3d_keypoints[:7, :], spine_3d, h36m_3d_keypoints[7, :], + neck_base_3d, h36m_3d_keypoints[8:] + ], + axis=0) + h36m_2d_keypoints = np.concatenate([ + h36m_2d_keypoints[:7, :], spine_3d, h36m_2d_keypoints[7, :], + neck_base_3d, h36m_3d_keypoints[8:] + ], + axis=0) + h36m_keypoint_valid = keypoints_valid[h36m_keypoint_idx] + h36m_keypoint_valid = np.concatenate([ + h36m_keypoint_valid[:7], spine_valid, h36m_keypoint_valid[7], + 
neck_base_valid, h36m_keypoint_valid[8:] + ], + axis=0) + ann['keypoints'] = keypoints_2d.tolist() ann['keypoints_3d'] = keypoints_3d.tolist() + ann['keypoints_valid'] = keypoints_valid.tolist() + ann['coco_keypoints'] = coco_2d_keypoints.tolist() + ann['coco_keypoints_3d'] = coco_3d_keypoints.tolist() + ann['coco_keypoints_valid'] = coco_keypoint_valid.tolist() + ann['h36m_keypoints'] = h36m_2d_keypoints.tolist() + ann['h36m_keypoints_3d'] = h36m_3d_keypoints.tolist() + ann['h36m_keypoints_valid'] = h36m_keypoint_valid.tolist() img['file_name'] = os.path.join(scene, file_name) if video_name in splits: val_annos.append(ann) From 7f9a414aac79f2b62429bdc917f8dd18721c649d Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 11:29:00 +0800 Subject: [PATCH 11/21] --other=refactor dataset --- ...ose-lift_motionbert-ft_8xb32-120e_ubody.py | 192 ++++++++++++++++++ ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 0 configs/wholebody_3d_keypoint/README.md | 0 .../datasets/base/base_mocap_dataset.py | 16 +- .../datasets/body3d/ubody3d_dataset.py | 190 ++++++++++++++--- .../dataset_converters/ubody_smplx_to_coco.py | 59 +----- 6 files changed, 368 insertions(+), 89 deletions(-) create mode 100644 configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py rename configs/{wholebody_3d_keypoint => body_3d_keypoint/pose_lift}/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py (100%) delete mode 100644 configs/wholebody_3d_keypoint/README.md diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py new file mode 100644 index 0000000000..900922b87c --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py @@ -0,0 +1,192 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=120, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=120, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/' + 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'), +) + +# base dataset settings +dataset_type = 'UBody3dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +# mapping +ubody_h36m = [ + (0, 0), + (2, 1), + (4, 2), + (6, 3), + (1, 4), + (3, 5), + 
(5, 6), + ((0, 7), 7), + (7, 8), + ((7, 24), 9), + (24, 10), + (8, 11), + (10, 12), + (12, 13), + (9, 14), + (11, 15), + (13, 16), +] + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [] +val_datasets = [] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file=f'annotations/{scene}/train_3dkeypoint_annotation.json', + seq_len=1, + multiple_target=120, + multiple_target_step=60, + data_prefix=dict(img='images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m) + ], + ) + if scene in ['Speech', 'Movie']: + continue + val_dataset = dict( + type=dataset_type, + ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', + seq_len=1, + seq_step=1, + multiple_target=243, + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m) + ], + test_mode=True, + ) + train_datasets.append(train_dataset) + val_datasets.append(val_dataset) + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + datasets=train_datasets, + metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), + pipeline=train_pipeline, + test_mode=False)) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), + datasets=val_datasets, + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='SimpleMPJPE', mode='mpjpe'), + dict(type='SimpleMPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py similarity index 100% rename from configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py rename to configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py diff --git a/configs/wholebody_3d_keypoint/README.md b/configs/wholebody_3d_keypoint/README.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mmpose/datasets/datasets/base/base_mocap_dataset.py b/mmpose/datasets/datasets/base/base_mocap_dataset.py index 290edafed0..b06d934ac5 100644 --- a/mmpose/datasets/datasets/base/base_mocap_dataset.py +++ 
b/mmpose/datasets/datasets/base/base_mocap_dataset.py @@ -96,8 +96,7 @@ def __init__(self, assert exists(_ann_file), ( f'Annotation file `{_ann_file}` does not exist.') - with get_local_path(_ann_file) as local_path: - self.ann_data = np.load(local_path) + self._load_ann_file(_ann_file) self.camera_param_file = camera_param_file if self.camera_param_file: @@ -137,6 +136,19 @@ def __init__(self, lazy_init=lazy_init, max_refetch=max_refetch) + def _load_ann_file(self, ann_file: str) -> dict: + """Load annotation file to get image information. + + Args: + ann_file (str): Annotation file path. + + Returns: + dict: Annotation information. + """ + + with get_local_path(ann_file) as local_path: + self.ann_data = np.load(local_path) + @classmethod def _load_metainfo(cls, metainfo: dict = None) -> dict: """Collect meta information from the dictionary of meta. diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index 468553abf5..236bc380ad 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -1,17 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp -from typing import Optional +from collections import defaultdict +from typing import List, Optional, Tuple import numpy as np -from mmengine.fileio import exists, get_local_path +from mmengine.fileio import get_local_path from xtcocotools.coco import COCO -from mmpose.datasets.datasets import BaseCocoStyleDataset +from mmpose.datasets.datasets import BaseMocapDataset from mmpose.registry import DATASETS @DATASETS.register_module() -class UBody3dDataset(BaseCocoStyleDataset): +class UBody3dDataset(BaseMocapDataset): """Ubody3d dataset for 3D human pose estimation. "One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer", @@ -28,11 +29,15 @@ class UBody3dDataset(BaseCocoStyleDataset): Args: ann_file (str): Annotation file path. Default: ''. - bbox_file (str, optional): Detection result file path. If - ``bbox_file`` is set, detected bboxes loaded from this file will - be used instead of ground-truth bboxes. This setting is only for - evaluation, i.e., ignored when ``test_mode`` is ``False``. - Default: ``None``. + seq_len (int): Number of frames in a sequence. Default: 1. + multiple_target (int): If larger than 0, merge every + ``multiple_target`` sequence together. Default: 0. + causal (bool): If set to ``True``, the rightmost input frame will be + the target frame. Otherwise, the middle input frame will be the + target frame. Default: ``True``. + subset_frac (float): The fraction to reduce dataset size. If set to 1, + the dataset size is not reduced. Default: 1. + camera_param_file (str): Cameras' parameters file. Default: ``None``. data_mode (str): Specifies the mode of data samples: ``'topdown'`` or ``'bottomup'``. In ``'topdown'`` mode, each data sample contains one instance; while in ``'bottomup'`` mode, each data sample @@ -62,31 +67,162 @@ class UBody3dDataset(BaseCocoStyleDataset): max_refetch (int, optional): If ``Basedataset.prepare_data`` get a None img. The maximum extra number of cycles to get a valid image. Default: 1000. - sample_interval (int, optional): The sample interval of the dataset. - Default: 1. 
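
A hedged usage sketch of the refactored dataset, mirroring the MotionBERT config added in this series; the scene name and annotation path are illustrative, not guaranteed to exist.

from mmpose.datasets.datasets.body3d.ubody3d_dataset import UBody3dDataset

# illustrative paths; multiple_target_step defaults to multiple_target when 0
dataset = UBody3dDataset(
    data_root='data/UBody/',
    ann_file='annotations/Magic_show/train_3dkeypoint_annotation.json',
    data_prefix=dict(img='images/'),
    seq_len=1,
    multiple_target=120,
    multiple_target_step=60,
    pipeline=[])
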
""" + def __init__(self, + multiple_target: int = 0, + multiple_target_step: int = 0, + seq_step: int = 1, + pad_video_seq: bool = False, + **kwargs): + self.seq_step = seq_step + self.pad_video_seq = pad_video_seq + + if multiple_target > 0 and multiple_target_step == 0: + multiple_target_step = multiple_target + self.multiple_target_step = multiple_target_step + + super().__init__(multiple_target=multiple_target, **kwargs) + METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py') - def _load_annotations(self): - """Load data from annotations in COCO format.""" + def _load_ann_file(self, ann_file: str) -> dict: + """Load annotation file.""" + with get_local_path(ann_file) as local_path: + self.ann_data = COCO(local_path) + + def get_sequence_indices(self) -> List[List[int]]: + video_frames = defaultdict(list) + img_ids = self.ann_data.getImgIds() + for img_id in img_ids: + img_info = self.ann_data.loadImgs(img_id)[0] + subj, _, _ = self._parse_image_name(img_info['file_name']) + video_frames[subj].append(img_id) + + sequence_indices = [] + _len = (self.seq_len - 1) * self.seq_step + 1 + _step = self.seq_step - assert exists(self.ann_file), ( - f'Annotation file `{self.ann_file}`does not exist') + if self.multiple_target: + for _, _img_ids in sorted(video_frames.items()): + n_frame = len(_img_ids) + _ann_ids = self.ann_data.getAnnIds(imgIds=_img_ids) + seqs_from_video = [ + _ann_ids[i:(i + self.multiple_target):_step] + for i in range(0, n_frame, self.multiple_target_step) + ][:(n_frame + self.multiple_target_step - + self.multiple_target) // self.multiple_target_step] + sequence_indices.extend(seqs_from_video) + else: + for _, _img_ids in sorted(video_frames.items()): + n_frame = len(_img_ids) + _ann_ids = self.ann_data.getAnnIds(imgIds=_img_ids) + if self.pad_video_seq: + # Pad the sequence so that every frame in the sequence will + # be predicted. + if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max( + 0, frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sequence_indices.append([_ann_ids[0]] * pad_left + + _ann_ids[start:end:_step] + + [_ann_ids[-1]] * pad_right) + else: + seqs_from_video = [ + _ann_ids[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1, _step) + ] + sequence_indices.extend(seqs_from_video) + + # reduce dataset size if needed + subset_size = int(len(sequence_indices) * self.subset_frac) + start = np.random.randint(0, len(sequence_indices) - subset_size + 1) + end = start + subset_size + + sequence_indices = sequence_indices[start:end] + + return sequence_indices - with get_local_path(self.ann_file) as local_path: - self.coco = COCO(local_path) - # set the metainfo about categories, which is a list of dict - # and each dict contains the 'id', 'name', etc. about this category - self._metainfo['CLASSES'] = self.coco.loadCats(self.coco.getCatIds()) + def _parse_image_name(self, image_path: str) -> Tuple[str, int]: + """Parse image name to get video name and frame index. + + Args: + image_name (str): Image name. + + Returns: + tuple[str, int]: Video name and frame index. 
+ """ + trim, file_name = image_path.split('/')[2:] + frame_id, suffix = file_name.split('.') + return trim, frame_id, suffix + + def _load_annotations(self): + """Load data from annotations in COCO format.""" + num_keypoints = self.metainfo['num_keypoints'] + self._metainfo['CLASSES'] = self.ann_data.loadCats( + self.ann_data.getCatIds()) instance_list = [] image_list = [] - for img_id in self.coco.getImgIds(): - if img_id % self.sample_interval != 0: - continue - img = self.coco.loadImgs(img_id)[0] + for i, _ann_ids in enumerate(self.sequence_indices): + expected_num_frames = self.seq_len + if self.multiple_target: + expected_num_frames = self.multiple_target + + assert len(_ann_ids) == (expected_num_frames), ( + f'Expected `frame_ids` == {expected_num_frames}, but ' + f'got {len(_ann_ids)} ') + + anns = self.ann_data.loadAnns(_ann_ids) + imgs = self.ann_data.loadImgs([ann['image_id'] for ann in anns]) + + _kpts = np.array([ann['keypoints'] for ann in anns], + dtype=np.float32) + _kpts_3d = np.array([ann['keypoints_3d'] for ann in anns], + dtype=np.float32) + _keypoints_visible = np.array( + [ann['keypoints_valid'] for ann in anns], dtype=np.float32) + _camera_params = np.array([ann['camera_param'] for ann in anns]) + _scales = np.zeros(len(imgs), dtype=np.float32) + _centers = np.zeros((len(imgs), 2), dtype=np.float32) + _img_paths = np.array([img['file_name'] for img in imgs]) + + target_idx = [-1] if self.causal else [int(self.seq_len // 2)] + if self.multiple_target: + target_idx = list(range(self.multiple_target)) + + instance_info = { + 'num_keypoints': num_keypoints, + 'keypoints': _kpts, + 'keypoints_3d': _kpts_3d, + 'keypoints_visible': _keypoints_visible, + 'scale': _scales, + 'center': _centers, + 'id': i, + 'category_id': 1, + 'iscrowd': 0, + 'img_paths': list(_img_paths), + 'img_ids': [img['id'] for img in imgs], + 'lifting_target': _kpts_3d[target_idx], + 'lifting_target_visible': _keypoints_visible[target_idx], + 'target_img_paths': _img_paths[target_idx], + 'camera_param': _camera_params, + } + + instance_list.append(instance_info) + + for img_id in self.ann_data.getImgIds(): + img = self.ann_data.loadImgs(img_id)[0] img.update({ 'img_id': img_id, @@ -95,12 +231,6 @@ def _load_annotations(self): }) image_list.append(img) - ann_ids = self.coco.getAnnIds(imgIds=img_id) - for ann in self.coco.loadAnns(ann_ids): - if instance_info := self.parse_data_info( - dict(raw_ann_info=ann, raw_img_info=img)): - instance_list.append(instance_info) - return instance_list, image_list def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index a3af141c4d..5b96ee3983 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -359,66 +359,10 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, keypoints_3d = joint_img.copy() keypoints_valid = coord_valid.reshape((-1, 1)) - # map to COCO keypoint order - coco_keypoint_names = [ - 'Nose', 'L_Eye', 'R_Eye', 'L_Ear', 'R_Ear', 'L_Shoulder', - 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Hip', - 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle', 'R_Ankle' - ] - coco_keypoint_idx = [ - human_model.joints_name.index(name) for name in coco_keypoint_names - ] - coco_3d_keypoints = keypoints_3d[coco_keypoint_idx, :] - coco_2d_keypoints = keypoints_2d[coco_keypoint_idx, :] - coco_keypoint_valid = keypoints_valid[coco_keypoint_idx, :] - - # map 
to H36M keypoint order - h36m_keypoint_names = [ - 'Pelvis', 'R_Hip', 'R_Knee', 'R_Ankle', 'L_Hip', 'L_Knee', - 'L_Ankle', 'Neck', 'L_Shoulder', 'L_Elbow', 'L_Wrist', - 'R_Shoulder', 'R_Elbow', 'R_Wrist' - ] - # interplate 'spine' and 'neck_base' - pelvis_id = human_model.joints_name.index('Pelvis') - neck_id = human_model.joints_name.index('Neck') - nose_id = human_model.joints_name.index('Nose') - spine_3d = (keypoints_3d[pelvis_id, :] + keypoints_3d[neck_id, :]) / 2 - neck_base_3d = (keypoints_3d[neck_id, :] + - keypoints_3d[nose_id, :]) / 2 - spine_valid = keypoints_valid[pelvis_id] * keypoints_valid[neck_id] - neck_base_valid = keypoints_valid[neck_id] * keypoints_valid[nose_id] - - h36m_keypoint_idx = [ - human_model.joints_name.index(name) for name in h36m_keypoint_names - ] - h36m_3d_keypoints = keypoints_3d[h36m_keypoint_idx, :] - h36m_2d_keypoints = keypoints_2d[h36m_keypoint_idx, :] - h36m_3d_keypoints = np.concatenate([ - h36m_3d_keypoints[:7, :], spine_3d, h36m_3d_keypoints[7, :], - neck_base_3d, h36m_3d_keypoints[8:] - ], - axis=0) - h36m_2d_keypoints = np.concatenate([ - h36m_2d_keypoints[:7, :], spine_3d, h36m_2d_keypoints[7, :], - neck_base_3d, h36m_3d_keypoints[8:] - ], - axis=0) - h36m_keypoint_valid = keypoints_valid[h36m_keypoint_idx] - h36m_keypoint_valid = np.concatenate([ - h36m_keypoint_valid[:7], spine_valid, h36m_keypoint_valid[7], - neck_base_valid, h36m_keypoint_valid[8:] - ], - axis=0) - ann['keypoints'] = keypoints_2d.tolist() ann['keypoints_3d'] = keypoints_3d.tolist() ann['keypoints_valid'] = keypoints_valid.tolist() - ann['coco_keypoints'] = coco_2d_keypoints.tolist() - ann['coco_keypoints_3d'] = coco_3d_keypoints.tolist() - ann['coco_keypoints_valid'] = coco_keypoint_valid.tolist() - ann['h36m_keypoints'] = h36m_2d_keypoints.tolist() - ann['h36m_keypoints_3d'] = h36m_3d_keypoints.tolist() - ann['h36m_keypoints_valid'] = h36m_keypoint_valid.tolist() + ann['camera_param'] = cam_param img['file_name'] = os.path.join(scene, file_name) if video_name in splits: val_annos.append(ann) @@ -466,6 +410,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, annotation_path = f'{args.data_root}/annotations' folders = os.listdir(annotation_path) + folders = [f for f in folders if osp.isdir(osp.join(annotation_path, f))] human_model_path = args.human_model_path splits = np.load(split_path) From fcca1545c92cf845072b1e1fe5494641425159eb Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 11:30:51 +0800 Subject: [PATCH 12/21] minor change --- tools/dataset_converters/ubody_smplx_to_coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index 5b96ee3983..16f827fce1 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -244,7 +244,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, if not os.path.exists(img_path): progress_bar.update() continue - if aid not in annos: + if str(aid) not in annos: progress_bar.update() continue From 1edad2c4b36d2e247357a6ff5e5ac93c25397d62 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 15:39:03 +0800 Subject: [PATCH 13/21] --other=fix transform --- ...ose-lift_motionbert-ft_8xb32-120e_ubody.py | 20 ++- .../datasets/body3d/ubody3d_dataset.py | 134 +++++------------- mmpose/datasets/transforms/converting.py | 26 ++++ 3 files changed, 71 insertions(+), 109 deletions(-) diff --git 
a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py index 900922b87c..fbf27004ba 100644 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py @@ -91,9 +91,11 @@ ] scenes = [ - 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', - 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', - 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' + 'Magic_show', + 'Entertainment', + # 'ConductMusic', 'Online_class', 'TalkShow', + # 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + # 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' ] train_datasets = [] @@ -108,10 +110,7 @@ multiple_target=120, multiple_target_step=60, data_prefix=dict(img='images/'), - pipeline=[ - dict( - type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m) - ], + pipeline=[], ) if scene in ['Speech', 'Movie']: continue @@ -123,10 +122,7 @@ multiple_target=243, data_root=data_root, data_prefix=dict(img='images/'), - pipeline=[ - dict( - type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m) - ], + pipeline=[], test_mode=True, ) train_datasets.append(train_dataset) @@ -134,6 +130,7 @@ # pipelines train_pipeline = [ + dict(type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m), dict(type='GenerateTarget', encoder=train_codec), dict( type='RandomFlipAroundRoot', @@ -146,6 +143,7 @@ 'factor', 'camera_param')) ] val_pipeline = [ + dict(type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m), dict(type='GenerateTarget', encoder=val_codec), dict( type='PackPoseInputs', diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index 236bc380ad..903f639c6e 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp from collections import defaultdict -from typing import List, Optional, Tuple +from typing import List, Tuple import numpy as np from mmengine.fileio import get_local_path @@ -161,7 +161,7 @@ def _parse_image_name(self, image_path: str) -> Tuple[str, int]: Returns: tuple[str, int]: Video name and frame index. 
""" - trim, file_name = image_path.split('/')[2:] + trim, file_name = image_path.split('/')[-2:] frame_id, suffix = file_name.split('.') return trim, frame_id, suffix @@ -184,39 +184,51 @@ def _load_annotations(self): f'got {len(_ann_ids)} ') anns = self.ann_data.loadAnns(_ann_ids) - imgs = self.ann_data.loadImgs([ann['image_id'] for ann in anns]) - - _kpts = np.array([ann['keypoints'] for ann in anns], - dtype=np.float32) - _kpts_3d = np.array([ann['keypoints_3d'] for ann in anns], - dtype=np.float32) - _keypoints_visible = np.array( - [ann['keypoints_valid'] for ann in anns], dtype=np.float32) - _camera_params = np.array([ann['camera_param'] for ann in anns]) - _scales = np.zeros(len(imgs), dtype=np.float32) - _centers = np.zeros((len(imgs), 2), dtype=np.float32) - _img_paths = np.array([img['file_name'] for img in imgs]) + img_ids = [] + kpts = np.zeros((len(anns), num_keypoints, 2), dtype=np.float32) + kpts_3d = np.zeros((len(anns), num_keypoints, 3), dtype=np.float32) + keypoints_visible = np.zeros((len(anns), num_keypoints, 1), + dtype=np.float32) + for j, ann in enumerate(anns): + img_ids.append(ann['image_id']) + kpts[j] = np.array(ann['keypoints'], dtype=np.float32) + kpts_3d[j] = np.array(ann['keypoints_3d'], dtype=np.float32) + keypoints_visible[j] = np.array( + ann['keypoints_valid'], dtype=np.float32) + imgs = self.ann_data.loadImgs(img_ids) + keypoints_visible = keypoints_visible.squeeze(-1) + + scales = np.zeros(len(imgs), dtype=np.float32) + centers = np.zeros((len(imgs), 2), dtype=np.float32) + img_paths = np.array([img['file_name'] for img in imgs]) + factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) target_idx = [-1] if self.causal else [int(self.seq_len // 2)] if self.multiple_target: target_idx = list(range(self.multiple_target)) + cam_param = anns[-1]['camera_param'] + if 'w' not in cam_param or 'h' not in cam_param: + cam_param['w'] = 1000 + cam_param['h'] = 1000 + instance_info = { 'num_keypoints': num_keypoints, - 'keypoints': _kpts, - 'keypoints_3d': _kpts_3d, - 'keypoints_visible': _keypoints_visible, - 'scale': _scales, - 'center': _centers, + 'keypoints': kpts, + 'keypoints_3d': kpts_3d, + 'keypoints_visible': keypoints_visible, + 'scale': scales, + 'center': centers, 'id': i, 'category_id': 1, 'iscrowd': 0, - 'img_paths': list(_img_paths), + 'img_paths': list(img_paths), 'img_ids': [img['id'] for img in imgs], - 'lifting_target': _kpts_3d[target_idx], - 'lifting_target_visible': _keypoints_visible[target_idx], - 'target_img_paths': _img_paths[target_idx], - 'camera_param': _camera_params, + 'lifting_target': kpts_3d[target_idx], + 'lifting_target_visible': keypoints_visible[target_idx], + 'target_img_paths': img_paths[target_idx], + 'camera_param': cam_param, + 'factor': factors } instance_list.append(instance_info) @@ -232,77 +244,3 @@ def _load_annotations(self): image_list.append(img) return instance_list, image_list - - def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: - """Parse raw COCO annotation of an instance. - - Args: - raw_data_info (dict): Raw data information loaded from - ``ann_file``. 
It should have following contents: - - - ``'raw_ann_info'``: Raw annotation of an instance - - ``'raw_img_info'``: Raw information of the image that - contains the instance - - Returns: - dict | None: Parsed instance annotation - """ - - ann = raw_data_info['raw_ann_info'] - if 'bbox' not in ann or 'keypoints_3d' not in ann: - return None - - img = raw_data_info['raw_img_info'] - img_w, img_h = img['width'], img['height'] - - # get bbox in shape [1, 4], formatted as xywh - x, y, w, h = ann['bbox'] - x1 = np.clip(x, 0, img_w - 1) - y1 = np.clip(y, 0, img_h - 1) - x2 = np.clip(x + w, 0, img_w - 1) - y2 = np.clip(y + h, 0, img_h - 1) - - bbox = np.array([x1, y1, x2, y2], dtype=np.float32).reshape(1, 4) - - # keypoints in shape [1, K, 2] and keypoints_visible in [1, K] - _keypoints = np.array( - ann['keypoints'], dtype=np.float32).reshape(1, -1, 3) - keypoints = _keypoints[..., :2] - keypoints_visible = np.minimum(1, _keypoints[..., 2]) - - _keypoints_3d = np.array( - ann['keypoints_3d'], dtype=np.float32).reshape(1, -1, 4) - keypoints_3d = _keypoints_3d[..., :3] - keypoints_3d_visible = keypoints_visible - - if 'num_keypoints' in ann: - num_keypoints = ann['num_keypoints'] - else: - num_keypoints = np.count_nonzero(keypoints.max(axis=2)) - - scale = ann.get('scale', 0.0) - center = ann.get('center', np.array([0.0, 0.0])) - - instance_info = { - 'num_keypoints': num_keypoints, - 'keypoints': keypoints, - 'keypoints_visible': keypoints_visible, - 'keypoints_3d': keypoints_3d, - 'keypoints_3d_visible': keypoints_3d_visible, - 'bbox': bbox, - 'bbox_score': np.ones(1, dtype=np.float32), - 'scale': scale, - 'center': center, - 'id': ann['id'], - 'category_id': 1, - 'iscrowd': ann.get('iscrowd', 0), - 'segmentation': ann.get('segmentation', None), - 'img_path': img['img_path'], - 'img_id': ann['image_id'], - 'lifting_target': keypoints_3d[[-1]], - 'lifting_target_visible': keypoints_3d_visible[[-1]], - 'target_img_path': img['img_path'], - } - if 'crowdIndex' in img: - instance_info['crowd_index'] = img['crowdIndex'] - return instance_info diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index c8204ac7ef..3d201b87e2 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -93,6 +93,9 @@ def transform(self, results: dict) -> dict: # Initialize output arrays keypoints = np.zeros((num_instances, self.num_keypoints, 2)) keypoints_visible = np.zeros((num_instances, self.num_keypoints)) + keypoints_3d = None + if 'keypoints_3d' in results: + keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3)) # Create a mask to weight visibility loss keypoints_visible_weights = keypoints_visible.copy() @@ -108,6 +111,11 @@ def transform(self, results: dict) -> dict: 'keypoints_visible'][:, self.source_index] * \ results['keypoints_visible'][:, self.source_index2] + if 'keypoints_3d' in results: + keypoints_3d[:, self.target_index] = 0.5 * ( + results['keypoints_3d'][:, self.source_index] + + results['keypoints_3d'][:, self.source_index2]) + # Otherwise just copy from the source index else: keypoints[:, @@ -115,11 +123,29 @@ def transform(self, results: dict) -> dict: source_index] keypoints_visible[:, self.target_index] = results[ 'keypoints_visible'][:, self.source_index] + if 'keypoints_3d' in results: + keypoints_3d[:, self.target_index] = results[ + 'keypoints_3d'][:, self.source_index] # Update the results dict results['keypoints'] = keypoints results['keypoints_visible'] = np.stack( [keypoints_visible, 
keypoints_visible_weights], axis=2) + if 'keypoints_3d' in results: + results['keypoints_3d'] = keypoints_3d + + # Updatae flip pairs + if 'flip_indices' in results: + flip_indices = [] + for i in range(len(self.target_index)): + x1, x2 = self.source_index[i], self.source_index2[i] + if x1 == x2: + flip_id = results['flip_indices'][x1] + flip_id = flip_id if flip_id < self.num_keypoints else i + flip_indices.append(flip_id) + else: + flip_indices.append(i) + results['flip_indices'] = flip_indices return results def transform_sigmas(self, sigmas: Union[List, np.ndarray]): From eba2fa79aef42b9c2b7be050e1b67347fe68c327 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 16:48:24 +0800 Subject: [PATCH 14/21] --fix=fix converting --- .../pose-lift_motionbert-ft_8xb32-120e_ubody.py | 8 +++----- mmpose/datasets/datasets/body3d/ubody3d_dataset.py | 3 ++- mmpose/datasets/transforms/converting.py | 12 +++++++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py index fbf27004ba..af5c9b85ab 100644 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py @@ -91,11 +91,9 @@ ] scenes = [ - 'Magic_show', - 'Entertainment', - # 'ConductMusic', 'Online_class', 'TalkShow', - # 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', - # 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' ] train_datasets = [] diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index 903f639c6e..85b8d893e7 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -228,7 +228,8 @@ def _load_annotations(self): 'lifting_target_visible': keypoints_visible[target_idx], 'target_img_paths': img_paths[target_idx], 'camera_param': cam_param, - 'factor': factors + 'factor': factors, + 'target_idx': target_idx, } instance_list.append(instance_info) diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index 3d201b87e2..457234c4ee 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -91,11 +91,14 @@ def transform(self, results: dict) -> dict: num_instances = results['keypoints'].shape[0] # Initialize output arrays - keypoints = np.zeros((num_instances, self.num_keypoints, 2)) - keypoints_visible = np.zeros((num_instances, self.num_keypoints)) + keypoints = np.zeros((num_instances, self.num_keypoints, 2), + dtype=np.float32) + keypoints_visible = np.zeros((num_instances, self.num_keypoints), + dtype=np.float32) keypoints_3d = None if 'keypoints_3d' in results: - keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3)) + keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3), + dtype=np.float32) # Create a mask to weight visibility loss keypoints_visible_weights = keypoints_visible.copy() @@ -133,6 +136,9 @@ def transform(self, results: dict) -> dict: [keypoints_visible, keypoints_visible_weights], axis=2) if 'keypoints_3d' in results: results['keypoints_3d'] = keypoints_3d + 
results['lifting_target'] = keypoints_3d[results['target_idx']] + results['lifting_target_visible'] = keypoints_visible[ + results['target_idx']] # Updatae flip pairs if 'flip_indices' in results: From 79108a607c3dd765023634cbd1bfbfaf33012d70 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 16:54:52 +0800 Subject: [PATCH 15/21] minor change --- .../pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py index af5c9b85ab..16546693c9 100644 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py @@ -117,7 +117,7 @@ ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', seq_len=1, seq_step=1, - multiple_target=243, + multiple_target=120, data_root=data_root, data_prefix=dict(img='images/'), pipeline=[], From 3121d862b5c3009179e98bc16a074f2db1f210bd Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 19:47:27 +0800 Subject: [PATCH 16/21] --other=rm simplebaseline config --- ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 130 ------------------ 1 file changed, 130 deletions(-) delete mode 100644 configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py deleted file mode 100644 index 751905b79e..0000000000 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py +++ /dev/null @@ -1,130 +0,0 @@ -_base_ = ['../../_base_/default_runtime.py'] - -vis_backends = [ - dict(type='LocalVisBackend'), -] -visualizer = dict( - type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -# runtime -train_cfg = dict(max_epochs=200, val_interval=10) - -# optimizer -optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) - -# learning policy -param_scheduler = [ - dict(type='StepLR', step_size=100000, gamma=0.96, end=80, by_epoch=False) -] - -auto_scale_lr = dict(base_batch_size=512) - -# hooks -default_hooks = dict( - checkpoint=dict( - type='CheckpointHook', - save_best='MPJPE', - rule='less', - max_keep_ckpts=1)) - -# codec settings -codec = dict( - type='ImagePoseLifting', num_keypoints=137, root_index=0, remove_root=True) - -# model settings -model = dict( - type='PoseLifter', - backbone=dict( - type='TCN', - in_channels=2 * 137, - stem_channels=1024, - num_blocks=2, - kernel_sizes=(1, 1, 1), - dropout=0.5, - ), - head=dict( - type='TemporalRegressionHead', - in_channels=1024, - num_joints=136, - loss=dict(type='MSELoss'), - decoder=codec, - )) - -# base dataset settings -dataset_type = 'UBody3dDataset' -data_mode = 'topdown' -data_root = 'data/UBody/' - -scenes = [ - 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', - 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', - 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' -] - -train_datasets = [] -val_datasets = [] - -for scene in scenes: - train_dataset = dict( - type=dataset_type, - data_root=data_root, - data_mode=data_mode, - ann_file=f'annotations/{scene}/train_3dkeypoint_annotation.json', - seq_len=1, - causal=True, - keypoint_2d_src='gt', 
- data_prefix=dict(img='images/'), - pipeline=[]) - val_dataset = dict( - type=dataset_type, - data_root=data_root, - data_mode=data_mode, - ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', - data_prefix=dict(img='images/'), - pipeline=[]) - train_datasets.append(train_dataset) - val_datasets.append(val_dataset) - -# pipelines -train_pipeline = [ - dict(type='GenerateTarget', encoder=codec), - dict( - type='PackPoseInputs', - meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', - 'target_root', 'target_root_index')) -] -val_pipeline = train_pipeline - -# data loaders -train_dataloader = dict( - batch_size=64, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), - datasets=train_datasets, - pipeline=train_pipeline, - test_mode=False, - )) -val_dataloader = dict( - batch_size=64, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), - datasets=val_datasets, - pipeline=val_pipeline, - test_mode=True, - )) -test_dataloader = val_dataloader - -# evaluators -val_evaluator = [ - dict(type='SimpleMPJPE', mode='mpjpe'), - dict(type='SimpleMPJPE', mode='p-mpjpe') -] -test_evaluator = val_evaluator From 84ae4cd34ffe6e05e2559619da5dcf2a0151d8b8 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 20:23:41 +0800 Subject: [PATCH 17/21] --test=add ut --- tests/data/ubody3d/ubody3d_train.json | 1 + .../test_body_datasets/test_ubody_dataset.py | 77 +++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/data/ubody3d/ubody3d_train.json create mode 100644 tests/test_datasets/test_datasets/test_body_datasets/test_ubody_dataset.py diff --git a/tests/data/ubody3d/ubody3d_train.json b/tests/data/ubody3d/ubody3d_train.json new file mode 100644 index 0000000000..55a4ac5226 --- /dev/null +++ b/tests/data/ubody3d/ubody3d_train.json @@ -0,0 +1 @@ +{"images": [{"id": 15, "height": 720, "width": 1280, "file_name": "Magic_show/Magic_show_S1_Trim1/Magic_show_S1_Trim1/000016.png"}], "annotations": [{"id": 0, "image_id": 15, "bbox": [74.55498504638672, 8.571063995361328, 1062.4967727661133, 701.8491630554199], "segmentation": [[]], "area": 0, "iscrowd": 0, "category_id": 1, "score": 1, "person_id": 0, "hand_box1": [336.4236145019531, 321.40362548828125, 473.6637268066406, 452.62567138671875], "hand_box2": [699.218994140625, 50.335018157958984, 533.58251953125, 621.6577186584473], "keypoints": [[585.656005859375, 1398.5216064453125], [699.9061889648438, 1586.966064453125], [450.14288330078125, 1596.144775390625], [878.3228149414062, 2171.27783203125], [252.16543579101562, 2132.398681640625], [793.895263671875, 2988.90771484375], [232.56475830078125, 2939.503173828125], [588.2872314453125, 570.474365234375], [862.1456298828125, 514.33837890625], [373.89849853515625, 519.60888671875], [1073.739990234375, 765.0070190429688], [89.8785400390625, 775.919921875], [1000.2418212890625, 635.8955688476562], [189.44015502929688, 567.993408203125], [891.81298828125, 2948.2041015625], [1013.4824829101562, 3015.250732421875], [819.24658203125, 3122.821533203125], [172.14041137695312, 2868.272705078125], [31.46063232421875, 2937.01025390625], [244.37692260742188, 3111.135009765625], [760.2764282226562, 235.35623168945312], [469.04644775390625, 237.359130859375], [672.689453125, 
216.68638610839844], [536.8645629882812, 215.08010864257812], [594.4747924804688, 302.86590576171875], [937.543212890625, 563.2012939453125], [877.2040405273438, 564.7064819335938], [826.8228759765625, 548.8115234375], [768.3922729492188, 532.2924194335938], [945.0330810546875, 433.25579833984375], [887.2977905273438, 411.39129638671875], [854.9716796875, 409.1885986328125], [812.5216064453125, 409.8503112792969], [993.1986083984375, 415.13519287109375], [983.431640625, 352.09503173828125], [976.8125610351562, 306.58990478515625], [967.6991577148438, 251.8966064453125], [1042.6788330078125, 439.2115783691406], [1061.695068359375, 382.62310791015625], [1078.3428955078125, 336.8554382324219], [1089.8707275390625, 288.113037109375], [1077.3145751953125, 467.8497009277344], [1113.5694580078125, 449.51904296875], [1147.91796875, 434.2681884765625], [1184.372314453125, 406.7205505371094], [262.0787048339844, 512.4108276367188], [314.8291320800781, 495.84429931640625], [355.2375183105469, 463.73870849609375], [400.5841064453125, 429.6348876953125], [290.11627197265625, 385.6371765136719], [334.016357421875, 356.7796325683594], [352.326904296875, 347.6751403808594], [379.92449951171875, 336.6559143066406], [248.99337768554688, 355.2509460449219], [270.441162109375, 294.56085205078125], [283.58990478515625, 247.07943725585938], [298.6072692871094, 191.95077514648438], [194.588623046875, 364.1822509765625], [197.89288330078125, 304.9277038574219], [198.94699096679688, 255.0223846435547], [207.83172607421875, 206.8009490966797], [152.69793701171875, 380.91925048828125], [126.07894897460938, 349.861083984375], [99.02603149414062, 320.67138671875], [75.35498046875, 280.7127380371094], [605.5189819335938, 258.36474609375], [636.6569213867188, 261.03448486328125], [672.689453125, 216.68638610839844], [536.8645629882812, 215.08010864257812], [480.609130859375, 193.2221221923828], [498.7352294921875, 169.0961151123047], [527.0252075195312, 168.48736572265625], [556.564453125, 174.32501220703125], [582.2213134765625, 183.7449188232422], [619.771728515625, 185.09783935546875], [646.1015625, 177.27572631835938], [678.3016357421875, 172.73214721679688], [709.5665283203125, 174.52818298339844], [730.6221313476562, 199.52928161621094], [600.2632446289062, 215.79234313964844], [598.0828247070312, 240.45635986328125], [596.2218627929688, 264.4862976074219], [594.4674072265625, 287.62481689453125], [572.7188110351562, 305.8975830078125], [583.9725341796875, 311.3199157714844], [596.401123046875, 315.5985107421875], [609.6165771484375, 311.5094909667969], [622.2186279296875, 306.6711120605469], [512.6423950195312, 211.75982666015625], [528.5633544921875, 204.07089233398438], [548.4610595703125, 205.9830780029297], [565.9568481445312, 217.66900634765625], [548.8089599609375, 222.94613647460938], [530.2134399414062, 222.75762939453125], [639.6070556640625, 219.82444763183594], [655.8860473632812, 209.6044158935547], [676.3201904296875, 208.3985595703125], [694.9487915039062, 217.1615753173828], [674.3418579101562, 226.85595703125], [655.4156494140625, 225.6745147705078], [551.7490234375, 353.2354736328125], [564.1500244140625, 346.4883728027344], [583.2034912109375, 344.99609375], [595.4065551757812, 347.21868896484375], [607.8397216796875, 345.721435546875], [629.6182250976562, 348.2886047363281], [648.6402587890625, 353.0809631347656], [634.0433349609375, 361.12738037109375], [612.543212890625, 365.1044921875], [598.9017333984375, 366.5699768066406], [585.4385375976562, 366.0231018066406], [566.12353515625, 
362.2437744140625], [553.4495239257812, 352.7164001464844], [583.9151000976562, 355.8670654296875], [596.3876342773438, 356.340576171875], [608.99560546875, 356.22100830078125], [648.081787109375, 352.85076904296875], [612.7412719726562, 351.5333251953125], [598.9871215820312, 351.8242492675781], [585.3312377929688, 352.4969482421875], [464.1539001464844, 202.29954528808594], [465.8164978027344, 244.8143768310547], [469.96026611328125, 282.73333740234375], [474.998779296875, 318.5062255859375], [485.900390625, 354.82257080078125], [503.9440002441406, 389.1557922363281], [533.9607543945312, 420.1808776855469], [569.1990356445312, 439.69488525390625], [604.7715454101562, 445.1242370605469], [641.609130859375, 438.5807189941406], [677.1731567382812, 419.1774597167969], [709.558349609375, 390.3476867675781], [728.9358520507812, 358.6229553222656], [743.6824951171875, 323.7010192871094], [752.355224609375, 286.009033203125], [756.031494140625, 248.0742645263672], [756.6275634765625, 206.8378448486328]], "foot_kpts": [1166.72314453125, 38.096336364746094, 0, 1002.4937744140625, 109.48077392578125, 0, 1049.140869140625, 663.1453857421875, 0, 317.3815002441406, 32.0361328125, 0, 402.523681640625, 303.2774963378906, 0, 177.21731567382812, 665.190673828125, 0], "face_kpts": [482.1813659667969, 206.51531982421875, 0, 474.4501037597656, 248.23251342773438, 1, 482.5657043457031, 282.5651550292969, 1, 490.3671569824219, 326.8166198730469, 1, 498.9546813964844, 355.2204895019531, 1, 519.25634765625, 390.5085754394531, 1, 543.9222412109375, 417.4048156738281, 1, 574.4150390625, 437.6228332519531, 1, 614.6944580078125, 442.5209045410156, 1, 648.99267578125, 436.2539978027344, 1, 682.6341552734375, 416.4512023925781, 1, 702.5023193359375, 392.0824279785156, 1, 725.9093017578125, 358.3260803222656, 1, 739.4346923828125, 328.9374084472656, 1, 746.7598876953125, 285.0207824707031, 1, 748.8603515625, 251.59585571289062, 1, 755.915771484375, 212.4534149169922, 0, 496.4743957519531, 188.47494506835938, 1, 514.8231201171875, 177.99856567382812, 1, 535.214111328125, 176.0469970703125, 1, 556.4619140625, 177.9375, 1, 576.8843994140625, 183.35317993164062, 1, 631.4595947265625, 183.65673828125, 1, 652.4815673828125, 180.27340698242188, 1, 676.221923828125, 180.07711791992188, 1, 698.4794921875, 184.41073608398438, 1, 718.5443115234375, 196.21084594726562, 1, 604.396484375, 218.71194458007812, 1, 602.6702880859375, 245.68115234375, 1, 600.9422607421875, 271.4402770996094, 1, 599.4947509765625, 297.5359802246094, 1, 571.33203125, 313.3100891113281, 1, 586.1724853515625, 317.1542663574219, 1, 601.4893798828125, 320.0868835449219, 1, 617.738525390625, 316.9916687011719, 1, 632.822509765625, 313.9440002441406, 1, 524.906005859375, 216.0177001953125, 1, 542.880859375, 206.15841674804688, 1, 563.9365234375, 208.03213500976562, 1, 578.5321044921875, 222.44454956054688, 1, 559.7491455078125, 226.11843872070312, 1, 541.22607421875, 225.11203002929688, 1, 636.491943359375, 223.62353515625, 1, 652.7271728515625, 210.68789672851562, 1, 674.761474609375, 209.86370849609375, 1, 692.972900390625, 221.53323364257812, 1, 674.9864501953125, 228.75543212890625, 1, 656.0750732421875, 229.04306030273438, 1, 560.0743408203125, 351.4398498535156, 1, 577.081787109375, 347.0306091308594, 1, 594.04638671875, 345.2702941894531, 1, 604.1793212890625, 346.1555480957031, 1, 614.151611328125, 344.8525695800781, 1, 634.447509765625, 345.7118225097656, 1, 656.1597900390625, 347.9260559082031, 1, 640.6773681640625, 358.7562561035156, 1, 
624.00732421875, 366.7438049316406, 1, 605.445556640625, 369.8896789550781, 1, 588.646484375, 367.5843811035156, 1, 573.5023193359375, 360.9281921386719, 1, 565.385498046875, 352.2278137207031, 1, 585.1085205078125, 353.1212463378906, 1, 604.616943359375, 355.0426330566406, 1, 626.8272705078125, 351.8833312988281, 1, 650.2919921875, 349.2644958496094, 1, 627.5924072265625, 353.0104675292969, 1, 604.7803955078125, 355.8074645996094, 1, 584.6986083984375, 354.2829284667969, 1], "lefthand_kpts": [942.7679443359375, 607.469482421875, 1, 888.291259765625, 539.277587890625, 1, 832.873291015625, 483.5708923339844, 1, 787.126953125, 436.6972351074219, 1, 710.735107421875, 413.7229309082031, 1, 888.9903564453125, 319.5710754394531, 1, 868.0140380859375, 280.7148742675781, 1, 830.3096923828125, 266.0387268066406, 1, 778.9337158203125, 271.2351379394531, 1, 962.7294921875, 272.7072448730469, 1, 955.781005859375, 187.65567016601562, 1, 953.9222412109375, 103.62838745117188, 1, 959.151611328125, 29.267608642578125, 1, 1047.009033203125, 294.3193664550781, 1, 1056.5989990234375, 215.84146118164062, 1, 1066.36865234375, 147.68014526367188, 1, 1081.0699462890625, 65.11972045898438, 1, 1107.0172119140625, 358.7002258300781, 1, 1159.4434814453125, 319.2156677246094, 1, 1206.9718017578125, 272.8797912597656, 1, 1261.1082763671875, 224.43637084960938, 1], "righthand_kpts": [233.142822265625, 582.3209228515625, 1, 300.6414794921875, 508.47479248046875, 1, 362.43896484375, 455.85186767578125, 1, 377.3603515625, 404.19744873046875, 1, 446.76416015625, 377.29241943359375, 1, 342.8802490234375, 310.6497802734375, 1, 368.6904296875, 284.673095703125, 1, 381.802734375, 251.73486328125, 1, 421.5467529296875, 225.363525390625, 1, 283.64288330078125, 254.122802734375, 1, 304.9996337890625, 170.8004150390625, 1, 320.6651611328125, 98.6851806640625, 1, 335.6553955078125, 28.2318115234375, 1, 199.05755615234375, 256.80859375, 1, 206.0360107421875, 177.01025390625, 1, 215.68804931640625, 106.7457275390625, 1, 224.53521728515625, 32.276611328125, 1, 128.827392578125, 294.99359130859375, 1, 99.0606689453125, 239.12982177734375, 1, 65.53125, 189.2431640625, 1, 37.63360595703125, 116.657958984375, 1], "center": [605.8033447265625, 359.4956359863281], "scale": [6.6406049728393555, 8.854140281677246], "keypoints_score": [0.9791078567504883, 0.9932481050491333, 1.0011144876480103, 0.973096489906311, 0.972457766532898, 0.866172194480896, 0.8760361671447754, 0.3526427149772644, 0.3903506398200989, 0.921836793422699, 0.9433825016021729, 0.20496317744255066, 0.2460474669933319, 0.20729553699493408, 0.17142903804779053, 0.18208564817905426, 0.22269707918167114], "face_kpts_score": [0.3680439293384552, 0.5355573892593384, 0.6418813467025757, 0.6644495725631714, 0.7590401768684387, 0.5538617372512817, 0.5907169580459595, 0.5878690481185913, 0.6348617076873779, 0.7361799478530884, 0.6556291580200195, 0.618322491645813, 0.6537319421768188, 0.5892513394355774, 0.7059171199798584, 0.645734429359436, 0.4574907422065735, 0.9639992713928223, 0.9263820648193359, 0.8876979351043701, 0.9284569621086121, 0.9739065170288086, 0.9502178430557251, 0.9174821376800537, 0.918608546257019, 0.9061530232429504, 0.862210750579834, 0.9776759147644043, 0.973875105381012, 0.974762499332428, 0.9565852880477905, 0.9716235399246216, 1.0059518814086914, 0.946382999420166, 0.9594531059265137, 0.9658107757568359, 1.0158061981201172, 0.9708306789398193, 0.9969902634620667, 0.9845597743988037, 0.9349627494812012, 0.9380444288253784, 0.9717998504638672, 
0.9871775507926941, 0.9774664640426636, 0.9537898898124695, 0.9465979933738708, 0.9661000967025757, 0.9713011980056763, 0.9717509746551514, 0.956028938293457, 1.000832438468933, 0.9808722734451294, 0.9960898160934448, 0.9364079236984253, 1.0011546611785889, 0.9167187213897705, 0.9541155099868774, 0.9244742393493652, 0.988551139831543, 0.9954862594604492, 0.9832127094268799, 0.978826642036438, 0.9751479625701904, 0.956895112991333, 0.9974040985107422, 0.9864891767501831, 0.9898920655250549], "foot_kpts_score": [0.24755269289016724, 0.1599443256855011, 0.25949808955192566, 0.2688680589199066, 0.14811083674430847, 0.23364056646823883], "lefthand_kpts_score": [0.603957986831665, 0.46176729202270506, 0.5001004695892334, 0.6286116600036621, 0.7983541250228882, 0.7467568874359131, 0.7094749569892883, 0.7889106035232544, 0.8908322811126709, 0.8638974189758301, 1.0441084861755372, 0.9282500505447387, 0.9102095127105713, 0.7738837957382202, 0.94963458776474, 0.8981462478637695, 0.9926700949668884, 0.7828058958053589, 0.9498528003692627, 0.9387582302093506, 0.8471795082092285], "righthand_kpts_score": [0.6722876787185669, 0.60037282705307, 0.5398626983165741, 0.7077780723571777, 0.7050052642822265, 0.6411999225616455, 0.725990629196167, 0.758279001712799, 0.8829087972640991, 0.889958119392395, 0.9569337129592895, 0.9145335912704468, 0.9213766813278198, 0.8925279140472412, 0.9955486416816711, 1.0033048152923585, 1.0014301896095277, 0.9033888339996338, 0.9002806305885315, 0.8902452945709228, 0.888652241230011], "face_box": [445.3220458984375, 145.05938720703125, 348.63178710937495, 332.0302734375], "face_valid": true, "leftfoot_valid": false, "rightfoot_valid": false, "lefthand_valid": true, "righthand_valid": true, "lefthand_box": [699.218994140625, 50.335018157958984, 533.58251953125, 621.6577186584473], "righthand_box": [81.47227172851564, -7.12115478515625, 398.4362548828125, 664.060546875], "lefthand_update": true, "righthand_update": true, "lefthand_kpts_vitposehand": [942.7679443359375, 607.469482421875, 1, 888.291259765625, 539.277587890625, 1, 832.873291015625, 483.5708923339844, 1, 787.126953125, 436.6972351074219, 1, 710.735107421875, 413.7229309082031, 1, 888.9903564453125, 319.5710754394531, 1, 868.0140380859375, 280.7148742675781, 1, 830.3096923828125, 266.0387268066406, 1, 778.9337158203125, 271.2351379394531, 1, 962.7294921875, 272.7072448730469, 1, 955.781005859375, 187.65567016601562, 1, 953.9222412109375, 103.62838745117188, 1, 959.151611328125, 29.267608642578125, 1, 1047.009033203125, 294.3193664550781, 1, 1056.5989990234375, 215.84146118164062, 1, 1066.36865234375, 147.68014526367188, 1, 1081.0699462890625, 65.11972045898438, 1, 1107.0172119140625, 358.7002258300781, 1, 1159.4434814453125, 319.2156677246094, 1, 1206.9718017578125, 272.8797912597656, 1, 1261.1082763671875, 224.43637084960938, 1], "righthand_kpts_vitposehand": [233.142822265625, 582.3209228515625, 1, 300.6414794921875, 508.47479248046875, 1, 362.43896484375, 455.85186767578125, 1, 377.3603515625, 404.19744873046875, 1, 446.76416015625, 377.29241943359375, 1, 342.8802490234375, 310.6497802734375, 1, 368.6904296875, 284.673095703125, 1, 381.802734375, 251.73486328125, 1, 421.5467529296875, 225.363525390625, 1, 283.64288330078125, 254.122802734375, 1, 304.9996337890625, 170.8004150390625, 1, 320.6651611328125, 98.6851806640625, 1, 335.6553955078125, 28.2318115234375, 1, 199.05755615234375, 256.80859375, 1, 206.0360107421875, 177.01025390625, 1, 215.68804931640625, 106.7457275390625, 1, 224.53521728515625, 
32.276611328125, 1, 128.827392578125, 294.99359130859375, 1, 99.0606689453125, 239.12982177734375, 1, 65.53125, 189.2431640625, 1, 37.63360595703125, 116.657958984375, 1], "num_keypoints": 9, "full_body": false, "valid_label": 2, "keypoints_3d": [[585.656005859375, 1398.5216064453125, 8.0], [699.9061889648438, 1586.966064453125, 7.7132415771484375], [450.14288330078125, 1596.144775390625, 7.6570892333984375], [878.3228149414062, 2171.27783203125, 5.664215087890625], [252.16543579101562, 2132.398681640625, 5.6501007080078125], [793.895263671875, 2988.90771484375, 4.6084747314453125], [232.56475830078125, 2939.503173828125, 4.28839111328125], [588.2872314453125, 570.474365234375, 9.544265747070312], [862.1456298828125, 514.33837890625, 8.8726806640625], [373.89849853515625, 519.60888671875, 9.171127319335938], [1073.739990234375, 765.0070190429688, 7.1384735107421875], [89.8785400390625, 775.919921875, 7.5379791259765625], [1000.2418212890625, 635.8955688476562, 5.19927978515625], [189.44015502929688, 567.993408203125, 5.757049560546875], [891.81298828125, 2948.2041015625, 3.0384368896484375], [1013.4824829101562, 3015.250732421875, 3.43035888671875], [819.24658203125, 3122.821533203125, 4.943603515625], [172.14041137695312, 2868.272705078125, 2.809112548828125], [31.46063232421875, 2937.01025390625, 3.1867828369140625], [244.37692260742188, 3111.135009765625, 4.5428619384765625], [760.2764282226562, 235.35623168945312, 9.170547485351562], [469.04644775390625, 237.359130859375, 9.270904541015625], [672.689453125, 216.68638610839844, 8.436477661132812], [536.8645629882812, 215.08010864257812, 8.477508544921875], [594.4747924804688, 302.86590576171875, 8.231826782226562], [937.543212890625, 563.2012939453125, 7.81884765625], [877.2040405273438, 564.7064819335938, 7.746490478515625], [826.8228759765625, 548.8115234375, 7.6898651123046875], [768.3922729492188, 532.2924194335938, 7.540069580078125], [945.0330810546875, 433.25579833984375, 7.78143310546875], [887.2977905273438, 411.39129638671875, 7.68023681640625], [854.9716796875, 409.1885986328125, 7.548248291015625], [812.5216064453125, 409.8503112792969, 7.41748046875], [993.1986083984375, 415.13519287109375, 7.762298583984375], [983.431640625, 352.09503173828125, 7.7212677001953125], [976.8125610351562, 306.58990478515625, 7.644317626953125], [967.6991577148438, 251.8966064453125, 7.58074951171875], [1042.6788330078125, 439.2115783691406, 7.7346954345703125], [1061.695068359375, 382.62310791015625, 7.7144622802734375], [1078.3428955078125, 336.8554382324219, 7.6671142578125], [1089.8707275390625, 288.113037109375, 7.64324951171875], [1077.3145751953125, 467.8497009277344, 7.6988525390625], [1113.5694580078125, 449.51904296875, 7.6714019775390625], [1147.91796875, 434.2681884765625, 7.6133880615234375], [1184.372314453125, 406.7205505371094, 7.566802978515625], [262.0787048339844, 512.4108276367188, 7.7939453125], [314.8291320800781, 495.84429931640625, 7.6787109375], [355.2375183105469, 463.73870849609375, 7.6097564697265625], [400.5841064453125, 429.6348876953125, 7.4446563720703125], [290.11627197265625, 385.6371765136719, 7.82208251953125], [334.016357421875, 356.7796325683594, 7.663116455078125], [352.326904296875, 347.6751403808594, 7.499725341796875], [379.92449951171875, 336.6559143066406, 7.330535888671875], [248.99337768554688, 355.2509460449219, 7.84161376953125], [270.441162109375, 294.56085205078125, 7.848602294921875], [283.58990478515625, 247.07943725585938, 7.8173370361328125], [298.6072692871094, 191.95077514648438, 
7.8151092529296875], [194.588623046875, 364.1822509765625, 7.8341217041015625], [197.89288330078125, 304.9277038574219, 7.8556976318359375], [198.94699096679688, 255.0223846435547, 7.8529815673828125], [207.83172607421875, 206.8009490966797, 7.8715667724609375], [152.69793701171875, 380.91925048828125, 7.8072052001953125], [126.07894897460938, 349.861083984375, 7.8142547607421875], [99.02603149414062, 320.67138671875, 7.79296875], [75.35498046875, 280.7127380371094, 7.79833984375], [605.5189819335938, 258.36474609375, 7.6539459228515625], [636.6569213867188, 261.03448486328125, 7.6003265380859375], [672.689453125, 216.68638610839844, 6.8922119140625], [536.8645629882812, 215.08010864257812, 6.9332427978515625], [480.609130859375, 193.2221221923828, 7.156890869140625], [498.7352294921875, 169.0961151123047, 7.0008087158203125], [527.0252075195312, 168.48736572265625, 6.879364013671875], [556.564453125, 174.32501220703125, 6.8116912841796875], [582.2213134765625, 183.7449188232422, 6.796417236328125], [619.771728515625, 185.09783935546875, 6.7884368896484375], [646.1015625, 177.27572631835938, 6.788299560546875], [678.3016357421875, 172.73214721679688, 6.8334197998046875], [709.5665283203125, 174.52818298339844, 6.94036865234375], [730.6221313476562, 199.52928161621094, 7.08001708984375], [600.2632446289062, 215.79234313964844, 6.797698974609375], [598.0828247070312, 240.45635986328125, 6.753753662109375], [596.2218627929688, 264.4862976074219, 6.70782470703125], [594.4674072265625, 287.62481689453125, 6.66571044921875], [572.7188110351562, 305.8975830078125, 6.8535308837890625], [583.9725341796875, 311.3199157714844, 6.8229217529296875], [596.401123046875, 315.5985107421875, 6.804962158203125], [609.6165771484375, 311.5094909667969, 6.8159027099609375], [622.2186279296875, 306.6711120605469, 6.8405303955078125], [512.6423950195312, 211.75982666015625, 7.02471923828125], [528.5633544921875, 204.07089233398438, 6.9400634765625], [548.4610595703125, 205.9830780029297, 6.92816162109375], [565.9568481445312, 217.66900634765625, 6.9529266357421875], [548.8089599609375, 222.94613647460938, 6.9491424560546875], [530.2134399414062, 222.75762939453125, 6.9624176025390625], [639.6070556640625, 219.82444763183594, 6.930755615234375], [655.8860473632812, 209.6044158935547, 6.8970184326171875], [676.3201904296875, 208.3985595703125, 6.8957061767578125], [694.9487915039062, 217.1615753173828, 6.9696502685546875], [674.3418579101562, 226.85595703125, 6.9189300537109375], [655.4156494140625, 225.6745147705078, 6.91705322265625], [551.7490234375, 353.2354736328125, 6.971923828125], [564.1500244140625, 346.4883728027344, 6.88177490234375], [583.2034912109375, 344.99609375, 6.8333587646484375], [595.4065551757812, 347.21868896484375, 6.8253173828125], [607.8397216796875, 345.721435546875, 6.82666015625], [629.6182250976562, 348.2886047363281, 6.8668060302734375], [648.6402587890625, 353.0809631347656, 6.940582275390625], [634.0433349609375, 361.12738037109375, 6.8939056396484375], [612.543212890625, 365.1044921875, 6.8557891845703125], [598.9017333984375, 366.5699768066406, 6.8533477783203125], [585.4385375976562, 366.0231018066406, 6.8624725341796875], [566.12353515625, 362.2437744140625, 6.9132232666015625], [553.4495239257812, 352.7164001464844, 6.97503662109375], [583.9151000976562, 355.8670654296875, 6.8811187744140625], [596.3876342773438, 356.340576171875, 6.8712615966796875], [608.99560546875, 356.22100830078125, 6.8746795654296875], [648.081787109375, 352.85076904296875, 6.94110107421875], 
[612.7412719726562, 351.5333251953125, 6.865570068359375], [598.9871215820312, 351.8242492675781, 6.8616485595703125], [585.3312377929688, 352.4969482421875, 6.87408447265625], [464.1539001464844, 202.29954528808594, 7.4058380126953125], [465.8164978027344, 244.8143768310547, 7.313018798828125], [469.96026611328125, 282.73333740234375, 7.331451416015625], [474.998779296875, 318.5062255859375, 7.377685546875], [485.900390625, 354.82257080078125, 7.34814453125], [503.9440002441406, 389.1557922363281, 7.29644775390625], [533.9607543945312, 420.1808776855469, 7.2111968994140625], [569.1990356445312, 439.69488525390625, 7.0761260986328125], [604.7715454101562, 445.1242370605469, 7.0256805419921875], [641.609130859375, 438.5807189941406, 7.05670166015625], [677.1731567382812, 419.1774597167969, 7.1628265380859375], [709.558349609375, 390.3476867675781, 7.262908935546875], [728.9358520507812, 358.6229553222656, 7.3195648193359375], [743.6824951171875, 323.7010192871094, 7.3823699951171875], [752.355224609375, 286.009033203125, 7.3757171630859375], [756.031494140625, 248.0742645263672, 7.3575439453125], [756.6275634765625, 206.8378448486328, 7.39019775390625]], "keypoints_valid": [[1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0]], "camera_param": {"focal": [34553.93155415853, 34553.93075942993], "princpt": [605.3033752441406, 358.99560546875]}}], "categories": [{"supercategory": "person", "id": 1, "name": "person"}]} \ No newline at end of file diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_ubody_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_ubody_dataset.py new file mode 100644 index 0000000000..12f780e1a0 --- /dev/null +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_ubody_dataset.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import numpy as np + +from mmpose.datasets.datasets.body3d import UBody3dDataset + + +class TestUBody3dDataset(TestCase): + + def build_ubody3d_dataset(self, **kwargs): + + cfg = dict( + ann_file='ubody3d_train.json', + data_mode='topdown', + data_root='tests/data/ubody3d', + pipeline=[], + test_mode=False) + + cfg.update(kwargs) + return UBody3dDataset(**cfg) + + def check_data_info_keys(self, data_info: dict): + expected_keys = dict( + img_paths=list, + keypoints=np.ndarray, + keypoints_3d=np.ndarray, + scale=np.ndarray, + center=np.ndarray, + id=int) + + for key, type_ in expected_keys.items(): + self.assertIn(key, data_info) + self.assertIsInstance(data_info[key], type_, key) + + def test_metainfo(self): + dataset = self.build_ubody3d_dataset() + # test dataset_name + self.assertEqual(dataset.metainfo['dataset_name'], 'ubody3d') + + # test number of keypoints + num_keypoints = 137 + self.assertEqual(dataset.metainfo['num_keypoints'], num_keypoints) + self.assertEqual( + len(dataset.metainfo['keypoint_colors']), num_keypoints) + self.assertEqual( + len(dataset.metainfo['dataset_keypoint_weights']), num_keypoints) + + # test some extra metainfo + self.assertEqual( + len(dataset.metainfo['skeleton_links']), + len(dataset.metainfo['skeleton_link_colors'])) + + def test_topdown(self): + # test topdown training + dataset = self.build_ubody3d_dataset(data_mode='topdown') + dataset.full_init() + self.assertEqual(len(dataset), 1) + self.check_data_info_keys(dataset[0]) + + # test topdown testing + dataset = self.build_ubody3d_dataset( + data_mode='topdown', test_mode=True) + dataset.full_init() + self.assertEqual(len(dataset), 1) + self.check_data_info_keys(dataset[0]) + + # test topdown training with sequence config + dataset = self.build_ubody3d_dataset( + data_mode='topdown', + seq_len=1, + seq_step=1, + causal=False, + pad_video_seq=True) + dataset.full_init() + self.assertEqual(len(dataset), 1) + self.check_data_info_keys(dataset[0]) From cf43e2fb1722627aedcd28b3bf756f2181b87d6b Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 19 Sep 2023 11:19:34 +0800 Subject: [PATCH 18/21] fix ut --- ...ose-lift_motionbert-ft_8xb32-120e_ubody.py | 188 ------------------ mmpose/datasets/transforms/converting.py | 29 ++- 2 files changed, 12 insertions(+), 205 deletions(-) delete mode 100644 configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py deleted file mode 100644 index 16546693c9..0000000000 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py +++ /dev/null @@ -1,188 +0,0 @@ -_base_ = ['../../../_base_/default_runtime.py'] - -vis_backends = [ - dict(type='LocalVisBackend'), -] -visualizer = dict( - type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -# runtime -train_cfg = dict(max_epochs=120, val_interval=10) - -# optimizer -optim_wrapper = dict( - optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) - -# learning policy -param_scheduler = [ - dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True) -] - -auto_scale_lr = dict(base_batch_size=512) - -# hooks -default_hooks = dict( - checkpoint=dict( - type='CheckpointHook', - save_best='MPJPE', - rule='less', - max_keep_ckpts=1), - logger=dict(type='LoggerHook', interval=20), -) - -# codec settings -train_codec = dict( - 
type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') -val_codec = dict( - type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) - -# model settings -model = dict( - type='PoseLifter', - backbone=dict( - type='DSTFormer', - in_channels=3, - feat_size=512, - depth=5, - num_heads=8, - mlp_ratio=2, - seq_len=120, - att_fuse=True, - ), - head=dict( - type='MotionRegressionHead', - in_channels=512, - out_channels=3, - embedding_size=512, - loss=dict(type='MPJPEVelocityJointLoss'), - decoder=val_codec, - ), - test_cfg=dict(flip_test=True), - init_cfg=dict( - type='Pretrained', - checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/' - 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'), -) - -# base dataset settings -dataset_type = 'UBody3dDataset' -data_mode = 'topdown' -data_root = 'data/UBody/' - -# mapping -ubody_h36m = [ - (0, 0), - (2, 1), - (4, 2), - (6, 3), - (1, 4), - (3, 5), - (5, 6), - ((0, 7), 7), - (7, 8), - ((7, 24), 9), - (24, 10), - (8, 11), - (10, 12), - (12, 13), - (9, 14), - (11, 15), - (13, 16), -] - -scenes = [ - 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', - 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', - 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' -] - -train_datasets = [] -val_datasets = [] - -for scene in scenes: - train_dataset = dict( - type=dataset_type, - data_root=data_root, - ann_file=f'annotations/{scene}/train_3dkeypoint_annotation.json', - seq_len=1, - multiple_target=120, - multiple_target_step=60, - data_prefix=dict(img='images/'), - pipeline=[], - ) - if scene in ['Speech', 'Movie']: - continue - val_dataset = dict( - type=dataset_type, - ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', - seq_len=1, - seq_step=1, - multiple_target=120, - data_root=data_root, - data_prefix=dict(img='images/'), - pipeline=[], - test_mode=True, - ) - train_datasets.append(train_dataset) - val_datasets.append(val_dataset) - -# pipelines -train_pipeline = [ - dict(type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m), - dict(type='GenerateTarget', encoder=train_codec), - dict( - type='RandomFlipAroundRoot', - keypoints_flip_cfg=dict(center_mode='static', center_x=0.), - target_flip_cfg=dict(center_mode='static', center_x=0.), - flip_label=True), - dict( - type='PackPoseInputs', - meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', - 'factor', 'camera_param')) -] -val_pipeline = [ - dict(type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m), - dict(type='GenerateTarget', encoder=val_codec), - dict( - type='PackPoseInputs', - meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', - 'factor', 'camera_param')) -] - -# data loaders -train_dataloader = dict( - batch_size=32, - prefetch_factor=4, - pin_memory=True, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='CombinedDataset', - datasets=train_datasets, - metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), - pipeline=train_pipeline, - test_mode=False)) - -val_dataloader = dict( - batch_size=32, - prefetch_factor=4, - pin_memory=True, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), - dataset=dict( - type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), - datasets=val_datasets, - pipeline=val_pipeline, - test_mode=True, - )) -test_dataloader = val_dataloader - -# 
evaluators -val_evaluator = [ - dict(type='SimpleMPJPE', mode='mpjpe'), - dict(type='SimpleMPJPE', mode='p-mpjpe') -] -test_evaluator = val_evaluator diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index 457234c4ee..d47829d06e 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -91,14 +91,13 @@ def transform(self, results: dict) -> dict: num_instances = results['keypoints'].shape[0] # Initialize output arrays - keypoints = np.zeros((num_instances, self.num_keypoints, 2), - dtype=np.float32) - keypoints_visible = np.zeros((num_instances, self.num_keypoints), - dtype=np.float32) - keypoints_3d = None + keypoints = np.zeros((num_instances, self.num_keypoints, 2)) + keypoints_visible = np.zeros((num_instances, self.num_keypoints)) + if 'keypoints_3d' in results: keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3), dtype=np.float32) + flip_indices = results.get('flip_indices', None) # Create a mask to weight visibility loss keypoints_visible_weights = keypoints_visible.copy() @@ -119,6 +118,13 @@ def transform(self, results: dict) -> dict: results['keypoints_3d'][:, self.source_index] + results['keypoints_3d'][:, self.source_index2]) + # Flip keypoints if flip_indices provided + if flip_indices is not None: + for i, (x1, x2) in enumerate( + zip(self.source_index, self.source_index2)): + id = flip_indices[x1] if x1 == x2 else i + flip_indices[i] = id if id < self.num_keypoints else i + flip_indices = flip_indices[:len(self.source_index)] # Otherwise just copy from the source index else: keypoints[:, @@ -139,19 +145,8 @@ def transform(self, results: dict) -> dict: results['lifting_target'] = keypoints_3d[results['target_idx']] results['lifting_target_visible'] = keypoints_visible[ results['target_idx']] + results['flip_indices'] = flip_indices - # Updatae flip pairs - if 'flip_indices' in results: - flip_indices = [] - for i in range(len(self.target_index)): - x1, x2 = self.source_index[i], self.source_index2[i] - if x1 == x2: - flip_id = results['flip_indices'][x1] - flip_id = flip_id if flip_id < self.num_keypoints else i - flip_indices.append(flip_id) - else: - flip_indices.append(i) - results['flip_indices'] = flip_indices return results def transform_sigmas(self, sigmas: Union[List, np.ndarray]): From 748432d047b65cb7c9bffd4828e0a8e56feb0dc9 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 19 Sep 2023 15:02:53 +0800 Subject: [PATCH 19/21] add preparation doc --- configs/_base_/datasets/ubody3d.py | 150 +++++++++++++----------- docs/en/dataset_zoo/3d_body_keypoint.md | 98 ++++++++++++++++ 2 files changed, 181 insertions(+), 67 deletions(-) diff --git a/configs/_base_/datasets/ubody3d.py b/configs/_base_/datasets/ubody3d.py index e2dfe0c570..9242559ea1 100644 --- a/configs/_base_/datasets/ubody3d.py +++ b/configs/_base_/datasets/ubody3d.py @@ -169,7 +169,7 @@ swap='R_Thumb_3'), 28: dict( - name='L_Thumb4', + name='L_Thumb_4', id=28, color=[255, 128, 0], type='', @@ -313,7 +313,7 @@ id=48, color=[255, 128, 0], type='', - swap='L_Thumb4'), + swap='L_Thumb_4'), 49: dict( name='R_Index_1', @@ -872,71 +872,87 @@ swap='Face_56'), }, skeleton_info={ - 0: dict(link=('L_Hip', 'R_Hip'), id=0, color=[0, 255, 0]), - 1: dict(link=('L_Knee', 'R_Knee'), id=1, color=[0, 255, 0]), - 2: dict(link=('L_Ankle', 'R_Ankle'), id=2, color=[0, 255, 0]), - 3: dict(link=('L_Shoulder', 'R_Shoulder'), id=3, color=[0, 255, 0]), - 4: dict(link=('L_Elbow', 'R_Elbow'), id=4, color=[0, 255, 0]), - 5: 
dict(link=('L_Wrist', 'R_Wrist'), id=5, color=[0, 255, 0]), - 6: dict(link=('L_Big_toe', 'R_Big_toe'), id=6, color=[0, 255, 0]), - 7: dict(link=('L_Small_toe', 'R_Small_toe'), id=7, color=[0, 255, 0]), - 8: dict(link=('L_Heel', 'R_Heel'), id=8, color=[0, 255, 0]), - 9: dict(link=('L_Ear', 'R_Ear'), id=9, color=[0, 255, 0]), - 10: dict(link=('L_Eye', 'R_Eye'), id=10, color=[0, 255, 0]), - 11: dict(link=('L_Thumb_1', 'R_Thumb_1'), id=11, color=[255, 128, 0]), - 12: dict(link=('L_Thumb_2', 'R_Thumb_2'), id=12, color=[255, 128, 0]), - 13: dict(link=('L_Thumb_3', 'R_Thumb_3'), id=13, color=[255, 128, 0]), - 14: dict(link=('L_Thumb4', 'R_Thumb_4'), id=14, color=[255, 128, 0]), - 15: dict(link=('L_Index_1', 'R_Index_1'), id=15, color=[255, 128, 0]), - 16: dict(link=('L_Index_2', 'R_Index_2'), id=16, color=[255, 128, 0]), - 17: dict(link=('L_Index_3', 'R_Index_3'), id=17, color=[255, 128, 0]), - 18: dict(link=('L_Index_4', 'R_Index_4'), id=18, color=[255, 128, 0]), - 19: - dict(link=('L_Middle_1', 'R_Middle_1'), id=19, color=[255, 128, 0]), - 20: - dict(link=('L_Middle_2', 'R_Middle_2'), id=20, color=[255, 128, 0]), - 21: - dict(link=('L_Middle_3', 'R_Middle_3'), id=21, color=[255, 128, 0]), - 22: - dict(link=('L_Middle_4', 'R_Middle_4'), id=22, color=[255, 128, 0]), - 23: dict(link=('L_Ring_1', 'R_Ring_1'), id=23, color=[255, 128, 0]), - 24: dict(link=('L_Ring_2', 'R_Ring_2'), id=24, color=[255, 128, 0]), - 25: dict(link=('L_Ring_3', 'R_Ring_3'), id=25, color=[255, 128, 0]), - 26: dict(link=('L_Ring_4', 'R_Ring_4'), id=26, color=[255, 128, 0]), - 27: dict(link=('L_Pinky_1', 'R_Pinky_1'), id=27, color=[255, 128, 0]), - 28: dict(link=('L_Pinky_2', 'R_Pinky_2'), id=28, color=[255, 128, 0]), - 29: dict(link=('L_Pinky_3', 'R_Pinky_3'), id=29, color=[255, 128, 0]), - 30: dict(link=('L_Pinky_4', 'R_Pinky_4'), id=30, color=[255, 128, 0]), - 31: dict(link=('Face_3', 'Face_4'), id=31, color=[255, 255, 255]), - 32: dict(link=('Face_5', 'Face_14'), id=32, color=[255, 255, 255]), - 33: dict(link=('Face_6', 'Face_13'), id=33, color=[255, 255, 255]), - 34: dict(link=('Face_7', 'Face_12'), id=34, color=[255, 255, 255]), - 35: dict(link=('Face_8', 'Face_11'), id=35, color=[255, 255, 255]), - 36: dict(link=('Face_9', 'Face_10'), id=36, color=[255, 255, 255]), - 37: dict(link=('Face_19', 'Face_23'), id=37, color=[255, 255, 255]), - 38: dict(link=('Face_20', 'Face_22'), id=38, color=[255, 255, 255]), - 39: dict(link=('Face_24', 'Face_33'), id=39, color=[255, 255, 255]), - 40: dict(link=('Face_25', 'Face_32'), id=40, color=[255, 255, 255]), - 41: dict(link=('Face_26', 'Face_31'), id=41, color=[255, 255, 255]), - 42: dict(link=('Face_27', 'Face_30'), id=42, color=[255, 255, 255]), - 43: dict(link=('Face_28', 'Face_35'), id=43, color=[255, 255, 255]), - 44: dict(link=('Face_29', 'Face_34'), id=44, color=[255, 255, 255]), - 45: dict(link=('Face_36', 'Face_42'), id=45, color=[255, 255, 255]), - 46: dict(link=('Face_37', 'Face_41'), id=46, color=[255, 255, 255]), - 47: dict(link=('Face_38', 'Face_40'), id=47, color=[255, 255, 255]), - 48: dict(link=('Face_43', 'Face_47'), id=48, color=[255, 255, 255]), - 49: dict(link=('Face_44', 'Face_46'), id=49, color=[255, 255, 255]), - 50: dict(link=('Face_48', 'Face_52'), id=50, color=[255, 255, 255]), - 51: dict(link=('Face_49', 'Face_51'), id=51, color=[255, 255, 255]), - 52: dict(link=('Face_53', 'Face_55'), id=52, color=[255, 255, 255]), - 53: dict(link=('Face_56', 'Face_72'), id=53, color=[255, 255, 255]), - 54: dict(link=('Face_57', 'Face_71'), id=54, color=[255, 255, 255]), - 
55: dict(link=('Face_58', 'Face_70'), id=55, color=[255, 255, 255]), - 56: dict(link=('Face_59', 'Face_69'), id=56, color=[255, 255, 255]), - 57: dict(link=('Face_60', 'Face_68'), id=57, color=[255, 255, 255]), - 58: dict(link=('Face_61', 'Face_67'), id=58, color=[255, 255, 255]), - 59: dict(link=('Face_62', 'Face_66'), id=59, color=[255, 255, 255]), - 60: dict(link=('Face_63', 'Face_65'), id=60, color=[255, 255, 255]), + 0: dict(link=('L_Ankle', 'L_Knee'), id=0, color=[0, 255, 0]), + 1: dict(link=('L_Knee', 'L_Hip'), id=1, color=[0, 255, 0]), + 2: dict(link=('R_Ankle', 'R_Knee'), id=2, color=[0, 255, 0]), + 3: dict(link=('R_Knee', 'R_Hip'), id=3, color=[0, 255, 0]), + 4: dict(link=('L_Hip', 'R_Hip'), id=4, color=[0, 255, 0]), + 5: dict(link=('L_Shoulder', 'L_Hip'), id=5, color=[0, 255, 0]), + 6: dict(link=('R_Shoulder', 'R_Hip'), id=6, color=[0, 255, 0]), + 7: dict(link=('L_Shoulder', 'R_Shoulder'), id=7, color=[0, 255, 0]), + 8: dict(link=('L_Shoulder', 'L_Elbow'), id=8, color=[0, 255, 0]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_Elbow', 'L_Wrist'), id=10, color=[0, 255, 0]), + 11: dict(link=('R_Elbow', 'R_Wrist'), id=11, color=[255, 128, 0]), + 12: dict(link=('L_Eye', 'R_Eye'), id=12, color=[255, 128, 0]), + 13: dict(link=('Nose', 'L_Eye'), id=13, color=[255, 128, 0]), + 14: dict(link=('Nose', 'R_Eye'), id=14, color=[255, 128, 0]), + 15: dict(link=('L_Eye', 'L_Ear'), id=15, color=[255, 128, 0]), + 16: dict(link=('R_Eye', 'R_Ear'), id=16, color=[255, 128, 0]), + 17: dict(link=('L_Ear', 'L_Shoulder'), id=17, color=[255, 128, 0]), + 18: dict(link=('R_Ear', 'R_Shoulder'), id=18, color=[255, 128, 0]), + 19: dict(link=('L_Ankle', 'L_Big_toe'), id=19, color=[255, 128, 0]), + 20: dict(link=('L_Ankle', 'L_Small_toe'), id=20, color=[255, 128, 0]), + 21: dict(link=('L_Ankle', 'L_Heel'), id=21, color=[255, 128, 0]), + 22: dict(link=('R_Ankle', 'R_Big_toe'), id=22, color=[255, 128, 0]), + 23: dict(link=('R_Ankle', 'R_Small_toe'), id=23, color=[255, 128, 0]), + 24: dict(link=('R_Ankle', 'R_Heel'), id=24, color=[255, 128, 0]), + 25: dict(link=('L_Wrist', 'L_Thumb_1'), id=25, color=[255, 128, 0]), + 26: dict(link=('L_Thumb_1', 'L_Thumb_2'), id=26, color=[255, 128, 0]), + 27: dict(link=('L_Thumb_2', 'L_Thumb_3'), id=27, color=[255, 128, 0]), + 28: dict(link=('L_Thumb_3', 'L_Thumb_4'), id=28, color=[255, 128, 0]), + 29: dict(link=('L_Wrist', 'L_Index_1'), id=29, color=[255, 128, 0]), + 30: dict(link=('L_Index_1', 'L_Index_2'), id=30, color=[255, 128, 0]), + 31: + dict(link=('L_Index_2', 'L_Index_3'), id=31, color=[255, 255, 255]), + 32: + dict(link=('L_Index_3', 'L_Index_4'), id=32, color=[255, 255, 255]), + 33: dict(link=('L_Wrist', 'L_Middle_1'), id=33, color=[255, 255, 255]), + 34: + dict(link=('L_Middle_1', 'L_Middle_2'), id=34, color=[255, 255, 255]), + 35: + dict(link=('L_Middle_2', 'L_Middle_3'), id=35, color=[255, 255, 255]), + 36: + dict(link=('L_Middle_3', 'L_Middle_4'), id=36, color=[255, 255, 255]), + 37: dict(link=('L_Wrist', 'L_Ring_1'), id=37, color=[255, 255, 255]), + 38: dict(link=('L_Ring_1', 'L_Ring_2'), id=38, color=[255, 255, 255]), + 39: dict(link=('L_Ring_2', 'L_Ring_3'), id=39, color=[255, 255, 255]), + 40: dict(link=('L_Ring_3', 'L_Ring_4'), id=40, color=[255, 255, 255]), + 41: dict(link=('L_Wrist', 'L_Pinky_1'), id=41, color=[255, 255, 255]), + 42: + dict(link=('L_Pinky_1', 'L_Pinky_2'), id=42, color=[255, 255, 255]), + 43: + dict(link=('L_Pinky_2', 'L_Pinky_3'), id=43, color=[255, 255, 255]), + 44: + dict(link=('L_Pinky_3', 
'L_Pinky_4'), id=44, color=[255, 255, 255]), + 45: dict(link=('R_Wrist', 'R_Thumb_1'), id=45, color=[255, 255, 255]), + 46: + dict(link=('R_Thumb_1', 'R_Thumb_2'), id=46, color=[255, 255, 255]), + 47: + dict(link=('R_Thumb_2', 'R_Thumb_3'), id=47, color=[255, 255, 255]), + 48: + dict(link=('R_Thumb_3', 'R_Thumb_4'), id=48, color=[255, 255, 255]), + 49: dict(link=('R_Wrist', 'R_Index_1'), id=49, color=[255, 255, 255]), + 50: + dict(link=('R_Index_1', 'R_Index_2'), id=50, color=[255, 255, 255]), + 51: + dict(link=('R_Index_2', 'R_Index_3'), id=51, color=[255, 255, 255]), + 52: + dict(link=('R_Index_3', 'R_Index_4'), id=52, color=[255, 255, 255]), + 53: dict(link=('R_Wrist', 'R_Middle_1'), id=53, color=[255, 255, 255]), + 54: + dict(link=('R_Middle_1', 'R_Middle_2'), id=54, color=[255, 255, 255]), + 55: + dict(link=('R_Middle_2', 'R_Middle_3'), id=55, color=[255, 255, 255]), + 56: + dict(link=('R_Middle_3', 'R_Middle_4'), id=56, color=[255, 255, 255]), + 57: dict(link=('R_Wrist', 'R_Pinky_1'), id=57, color=[255, 255, 255]), + 58: + dict(link=('R_Pinky_1', 'R_Pinky_2'), id=58, color=[255, 255, 255]), + 59: + dict(link=('R_Pinky_2', 'R_Pinky_3'), id=59, color=[255, 255, 255]), + 60: + dict(link=('R_Pinky_3', 'R_Pinky_4'), id=60, color=[255, 255, 255]), }, joint_weights=[1.] * 137, sigmas=[]) diff --git a/docs/en/dataset_zoo/3d_body_keypoint.md b/docs/en/dataset_zoo/3d_body_keypoint.md index 82e21010fc..3a35e2443b 100644 --- a/docs/en/dataset_zoo/3d_body_keypoint.md +++ b/docs/en/dataset_zoo/3d_body_keypoint.md @@ -8,6 +8,7 @@ MMPose supported datasets: - [Human3.6M](#human36m) \[ [Homepage](http://vision.imar.ro/human3.6m/description.php) \] - [CMU Panoptic](#cmu-panoptic) \[ [Homepage](http://domedb.perception.cs.cmu.edu/) \] - [Campus/Shelf](#campus-and-shelf) \[ [Homepage](http://campar.in.tum.de/Chair/MultiHumanPose) \] +- [UBody](#ubody3d) \[ [Homepage](https://osx-ubody.github.io/) \] ## Human3.6M @@ -197,3 +198,100 @@ mmpose | ├── pred_shelf_maskrcnn_hrnet_coco.pkl | ├── actorsGT.mat ``` + +## UBody3d + +
+UBody (CVPR'2023)
+
+```bibtex
+@inproceedings{lin2023one,
+  title={One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer},
+  author={Lin, Jing and Zeng, Ailing and Wang, Haoqian and Zhang, Lei and Li, Yu},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  year={2023},
+}
+```
+
+
+
+For the [UBody](https://github.com/IDEA-Research/OSX) dataset, videos and annotations can be downloaded from the [OSX homepage](https://github.com/IDEA-Research/OSX).
+
+Download and extract them under $MMPOSE/data, and organize them as follows:
+
+```text
+mmpose
+├── mmpose
+├── docs
+├── tests
+├── tools
+├── configs
+`── data
+    │── UBody
+        ├── annotations
+        │   ├── ConductMusic
+        │   ├── Entertainment
+        │   ├── Fitness
+        │   ├── Interview
+        │   ├── LiveVlog
+        │   ├── Magic_show
+        │   ├── Movie
+        │   ├── Olympic
+        │   ├── Online_class
+        │   ├── SignLanguage
+        │   ├── Singing
+        │   ├── Speech
+        │   ├── TVShow
+        │   ├── TalkShow
+        │   └── VideoConference
+        ├── splits
+        │   ├── inter_scene_test_list.npy
+        │   └── intra_scene_test_list.npy
+        ├── videos
+        │   ├── ConductMusic
+        │   ├── Entertainment
+        │   ├── Fitness
+        │   ├── Interview
+        │   ├── LiveVlog
+        │   ├── Magic_show
+        │   ├── Movie
+        │   ├── Olympic
+        │   ├── Online_class
+        │   ├── SignLanguage
+        │   ├── Singing
+        │   ├── Speech
+        │   ├── TVShow
+        │   ├── TalkShow
+        │   └── VideoConference
+```
+
+Convert the videos to images, then split them into train/val sets:
+
+```shell
+python tools/dataset_converters/ubody_kpts_to_coco.py
+```
+
+Before generating 3D keypoints, you need to install the SMPL-X tools and download the human models. Please refer to [Github](https://github.com/vchoutas/smplx#installation) and [SMPLX](https://smpl-x.is.tue.mpg.de/download.php).
+
+```shell
+pip install smplx
+```
+
+The directory tree of the human models should look like this:
+
+```text
+human_model_path
+|── smplx
+    ├── SMPLX_NEUTRAL.npz
+    ├── SMPLX_NEUTRAL.pkl
+```
+
+After the above preparations are finished, execute the following script:
+
+```shell
+python tools/dataset_converters/ubody_smplx_to_coco.py --data-root {$MMPOSE/data/UBody} --human-model-path {$MMPOSE/data/human_model_path/}
+```

From b296a2aec8b6ba4d9a6dd80ef4cf33c376d65234 Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Tue, 19 Sep 2023 19:24:07 +0800
Subject: [PATCH 20/21] update keypoint convert

---
 mmpose/datasets/transforms/converting.py | 39 ++++++++----------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py
index d47829d06e..90536bd736 100644
--- a/mmpose/datasets/transforms/converting.py
+++ b/mmpose/datasets/transforms/converting.py
@@ -91,12 +91,10 @@ def transform(self, results: dict) -> dict:
         num_instances = results['keypoints'].shape[0]
 
         # Initialize output arrays
-        keypoints = np.zeros((num_instances, self.num_keypoints, 2))
+        keypoints = np.zeros((num_instances, self.num_keypoints, 3))
         keypoints_visible = np.zeros((num_instances, self.num_keypoints))
+        key = 'keypoints_3d' if 'keypoints_3d' in results else 'keypoints'
 
-        if 'keypoints_3d' in results:
-            keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3),
-                                    dtype=np.float32)
         flip_indices = results.get('flip_indices', None)
 
         # Create a mask to weight visibility loss
@@ -106,43 +104,32 @@ def transform(self, results: dict) -> dict:
         # Interpolate keypoints if pairs of source indexes provided
         if self.interpolation:
             keypoints[:, self.target_index] = 0.5 * (
-                results['keypoints'][:, self.source_index] +
-                results['keypoints'][:, self.source_index2])
-
+                results[key][:, self.source_index] +
+                results[key][:, self.source_index2])
             keypoints_visible[:, self.target_index] = results[
-                'keypoints_visible'][:, self.source_index] * \
-                results['keypoints_visible'][:, self.source_index2]
-
-            if 'keypoints_3d' in results:
-                keypoints_3d[:, 
self.target_index] = 0.5 * ( - results['keypoints_3d'][:, self.source_index] + - results['keypoints_3d'][:, self.source_index2]) - + 'keypoints_visible'][:, self.source_index] * results[ + 'keypoints_visible'][:, self.source_index2] # Flip keypoints if flip_indices provided if flip_indices is not None: for i, (x1, x2) in enumerate( zip(self.source_index, self.source_index2)): - id = flip_indices[x1] if x1 == x2 else i - flip_indices[i] = id if id < self.num_keypoints else i + idx = flip_indices[x1] if x1 == x2 else i + flip_indices[i] = idx if idx < self.num_keypoints else i flip_indices = flip_indices[:len(self.source_index)] # Otherwise just copy from the source index else: - keypoints[:, - self.target_index] = results['keypoints'][:, self. - source_index] + keypoints[:, self.target_index] = results[key][:, + self.source_index] keypoints_visible[:, self.target_index] = results[ 'keypoints_visible'][:, self.source_index] - if 'keypoints_3d' in results: - keypoints_3d[:, self.target_index] = results[ - 'keypoints_3d'][:, self.source_index] # Update the results dict - results['keypoints'] = keypoints + results['keypoints'] = keypoints[..., :2] results['keypoints_visible'] = np.stack( [keypoints_visible, keypoints_visible_weights], axis=2) if 'keypoints_3d' in results: - results['keypoints_3d'] = keypoints_3d - results['lifting_target'] = keypoints_3d[results['target_idx']] + results['keypoints_3d'] = keypoints + results['lifting_target'] = keypoints[results['target_idx']] results['lifting_target_visible'] = keypoints_visible[ results['target_idx']] results['flip_indices'] = flip_indices From 693f087bfb28b7b4aa0b9bbf0b2c135e358dcbce Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 20 Sep 2023 10:31:56 +0800 Subject: [PATCH 21/21] fix 2d kpts --- mmpose/datasets/transforms/converting.py | 8 +++--- .../test_transforms/test_converting.py | 27 +++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index 90536bd736..1906f16972 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -94,6 +94,7 @@ def transform(self, results: dict) -> dict: keypoints = np.zeros((num_instances, self.num_keypoints, 3)) keypoints_visible = np.zeros((num_instances, self.num_keypoints)) key = 'keypoints_3d' if 'keypoints_3d' in results else 'keypoints' + c = results[key].shape[-1] flip_indices = results.get('flip_indices', None) @@ -103,7 +104,7 @@ def transform(self, results: dict) -> dict: # Interpolate keypoints if pairs of source indexes provided if self.interpolation: - keypoints[:, self.target_index] = 0.5 * ( + keypoints[:, self.target_index, :c] = 0.5 * ( results[key][:, self.source_index] + results[key][:, self.source_index2]) keypoints_visible[:, self.target_index] = results[ @@ -118,8 +119,9 @@ def transform(self, results: dict) -> dict: flip_indices = flip_indices[:len(self.source_index)] # Otherwise just copy from the source index else: - keypoints[:, self.target_index] = results[key][:, - self.source_index] + keypoints[:, + self.target_index, :c] = results[key][:, + self.source_index] keypoints_visible[:, self.target_index] = results[ 'keypoints_visible'][:, self.source_index] diff --git a/tests/test_datasets/test_transforms/test_converting.py b/tests/test_datasets/test_transforms/test_converting.py index 5cce813b70..dc4376baf9 100644 --- a/tests/test_datasets/test_transforms/test_converting.py +++ 
b/tests/test_datasets/test_transforms/test_converting.py @@ -81,6 +81,33 @@ def test_transform(self): self.data_info['keypoints_visible'][:, source_index]).all()) + # check 3d keypoint + self.data_info['keypoints_3d'] = np.random.random((4, 17, 3)) + self.data_info['target_idx'] = [-1] + mapping = [(3, 0), (6, 1), (16, 2), (5, 3)] + transform = KeypointConverter(num_keypoints=5, mapping=mapping) + results = transform(self.data_info.copy()) + + # check shape + self.assertEqual(results['keypoints_3d'].shape[0], + self.data_info['keypoints_3d'].shape[0]) + self.assertEqual(results['keypoints_3d'].shape[1], 5) + self.assertEqual(results['keypoints_3d'].shape[2], 3) + self.assertEqual(results['keypoints_visible'].shape[0], + self.data_info['keypoints_visible'].shape[0]) + self.assertEqual(results['keypoints_visible'].shape[1], 5) + + # check value + for source_index, target_index in mapping: + self.assertTrue( + (results['keypoints_3d'][:, target_index] == + self.data_info['keypoints_3d'][:, source_index]).all()) + self.assertEqual(results['keypoints_visible'].ndim, 3) + self.assertEqual(results['keypoints_visible'].shape[2], 2) + self.assertTrue( + (results['keypoints_visible'][:, target_index, 0] == + self.data_info['keypoints_visible'][:, source_index]).all()) + def test_transform_sigmas(self): mapping = [(3, 0), (6, 1), (16, 2), (5, 3)]
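
For reference, the following is a minimal sketch (not part of the patch) showing how the updated `KeypointConverter` could be exercised on a sample that carries both 2D and 3D keypoints, mirroring the new test case above. It assumes an environment where MMPose with these changes is importable; the mapping indices and array shapes are illustrative only.

```python
import numpy as np

from mmpose.datasets.transforms import KeypointConverter

# One instance with 17 source keypoints; the mapping selects four of them as
# targets of a 5-keypoint skeleton (indices follow the test case above).
results = dict(
    keypoints=np.random.random((1, 17, 2)),
    keypoints_3d=np.random.random((1, 17, 3)),
    keypoints_visible=np.ones((1, 17)),
    target_idx=[-1],
)

transform = KeypointConverter(
    num_keypoints=5, mapping=[(3, 0), (6, 1), (16, 2), (5, 3)])
out = transform(results)

# 2D keypoints keep two channels, 3D keypoints keep all three, and the
# lifting target is selected via `target_idx`.
assert out['keypoints'].shape == (1, 5, 2)
assert out['keypoints_3d'].shape == (1, 5, 3)
assert out['lifting_target'].shape == (1, 5, 3)
assert out['keypoints_visible'].shape == (1, 5, 2)
```

The design change behind this: the transform now fills a single 3-channel buffer and slices `[..., :2]` for the 2D output, which is what removes the separate `keypoints_3d` branch of the earlier version.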