From f534c76b53b465acc39fb7a902a66c635f708392 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Aug 2023 15:06:39 +0800 Subject: [PATCH 01/21] --feat=add process script --- .../dataset_converters/ubody_smplx_to_coco.py | 419 ++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 tools/dataset_converters/ubody_smplx_to_coco.py diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py new file mode 100644 index 0000000000..8bb796377f --- /dev/null +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -0,0 +1,419 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import os +import os.path as osp +from functools import partial +from typing import Dict, List + +import mmengine +import numpy as np +import smplx +import torch +from pycocotools.coco import COCO + + +class SMPLX(object): + + def __init__(self, human_model_path): + self.human_model_path = human_model_path + self.layer_args = { + 'create_global_orient': False, + 'create_body_pose': False, + 'create_left_hand_pose': False, + 'create_right_hand_pose': False, + 'create_jaw_pose': False, + 'create_leye_pose': False, + 'create_reye_pose': False, + 'create_betas': False, + 'create_expression': False, + 'create_transl': False, + } + + self.neutral_model = smplx.create( + self.human_model_path, + 'smplx', + gender='NEUTRAL', + use_pca=False, + use_face_contour=True, + **self.layer_args) + + self.vertex_num = 10475 + self.face = self.neutral_model.faces + self.shape_param_dim = 10 + self.expr_code_dim = 10 + # 22 (body joints) + 30 (hand joints) + 1 (face jaw joint) + self.orig_joint_num = 53 + + # yapf: disable + self.orig_joints_name = ( + # 22 body joints + 'Pelvis', 'L_Hip', 'R_Hip', 'Spine_1', 'L_Knee', 'R_Knee', + 'Spine2', 'L_Ankle', 'R_Ankle', 'Spine_3', 'L_Foot', 'R_Foot', + 'Neck', 'L_Collar', 'R_Collar', 'Head', 'L_Shoulder', + 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', + # left hand joints + 'L_Index_1', 'L_Index_2', 'L_Index_3', 'L_Middle_1', 'L_Middle_2', + 'L_Middle_3', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Ring_1', + 'L_Ring_2', 'L_Ring_3', 'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3', + # right hand joints + 'R_Index_1', 'R_Index_2', 'R_Index_3', 'R_Middle_1', 'R_Middle_2', + 'R_Middle_3', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Ring_1', + 'R_Ring_2', 'R_Ring_3', 'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3', + # 1 face jaw joint + 'Jaw', + ) + self.orig_flip_pairs = ( + # body joints + (1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), + (20, 21), + # hand joints + (22, 37), (23, 38), (24, 39), (25, 40), (26, 41), (27, 42), + (28, 43), (29, 44), (30, 45), (31, 46), (32, 47), (33, 48), + (34, 49), (35, 50), (36, 51), + ) + # yapf: enable + self.orig_root_joint_idx = self.orig_joints_name.index('Pelvis') + self.orig_joint_part = { + 'body': + range( + self.orig_joints_name.index('Pelvis'), + self.orig_joints_name.index('R_Wrist') + 1), + 'lhand': + range( + self.orig_joints_name.index('L_Index_1'), + self.orig_joints_name.index('L_Thumb_3') + 1), + 'rhand': + range( + self.orig_joints_name.index('R_Index_1'), + self.orig_joints_name.index('R_Thumb_3') + 1), + 'face': + range( + self.orig_joints_name.index('Jaw'), + self.orig_joints_name.index('Jaw') + 1) + } + + # changed SMPLX joint set for the supervision + self.joint_num = ( + 137 # 25 (body joints) + 40 (hand joints) + 72 (face keypoints) + ) + # yapf: disable + self.joints_name = ( + # 25 body joints + 'Pelvis', 'L_Hip', 'R_Hip', 'L_Knee', 
'R_Knee', 'L_Ankle', + 'R_Ankle', 'Neck', 'L_Shoulder', 'R_Shoulder', 'L_Elbow', + 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Big_toe', 'L_Small_toe', + 'L_Heel', 'R_Big_toe', 'R_Small_toe', 'R_Heel', 'L_Ear', 'R_Ear', + 'L_Eye', 'R_Eye', 'Nose', + # left hand joints + 'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3', 'L_Thumb4', 'L_Index_1', + 'L_Index_2', 'L_Index_3', 'L_Index_4', 'L_Middle_1', 'L_Middle_2', + 'L_Middle_3', 'L_Middle_4', 'L_Ring_1', 'L_Ring_2', 'L_Ring_3', + 'L_Ring_4', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Pinky_4', + # right hand joints + 'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3', 'R_Thumb_4', 'R_Index_1', + 'R_Index_2', 'R_Index_3', 'R_Index_4', 'R_Middle_1', 'R_Middle_2', + 'R_Middle_3', 'R_Middle_4', 'R_Ring_1', 'R_Ring_2', 'R_Ring_3', + 'R_Ring_4', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Pinky_4', + # 72 face keypoints + *[ + f'Face_{i}' for i in range(1, 73) + ], + ) + + self.root_joint_idx = self.joints_name.index('Pelvis') + self.lwrist_idx = self.joints_name.index('L_Wrist') + self.rwrist_idx = self.joints_name.index('R_Wrist') + self.neck_idx = self.joints_name.index('Neck') + self.flip_pairs = ( + # body joints + (1, 2), (3, 4), (5, 6), (8, 9), (10, 11), (12, 13), (14, 17), + (15, 18), (16, 19), (20, 21), (22, 23), + # hand joints + (25, 45), (26, 46), (27, 47), (28, 48), (29, 49), (30, 50), + (31, 51), (32, 52), (33, 53), (34, 54), (35, 55), (36, 56), + (37, 57), (38, 58), (39, 59), (40, 60), (41, 61), (42, 62), + (43, 63), (44, 64), + # face eyebrow + (67, 68), (69, 78), (70, 77), (71, 76), (72, 75), (73, 74), + # face below nose + (83, 87), (84, 86), + # face eyes + (88, 97), (89, 96), (90, 95), (91, 94), (92, 99), (93, 98), + # face mouse + (100, 106), (101, 105), (102, 104), (107, 111), (108, 110), + # face lip + (112, 116), (113, 115), (117, 119), + # face contours + (120, 136), (121, 135), (122, 134), (123, 133), (124, 132), + (125, 131), (126, 130), (127, 129) + ) + self.joint_idx = ( + 0, 1, 2, 4, 5, 7, 8, 12, 16, 17, 18, 19, 20, 21, 60, 61, 62, 63, + 64, 65, 59, 58, 57, 56, 55, # body joints + 37, 38, 39, 66, 25, 26, 27, 67, 28, 29, 30, 68, 34, 35, 36, 69, 31, + 32, 33, 70, # left hand joints + 52, 53, 54, 71, 40, 41, 42, 72, 43, 44, 45, 73, 49, 50, 51, 74, 46, + 47, 48, 75, # right hand joints + 22, 15, # jaw, head + 57, 56, # eyeballs + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, # eyebrow + 86, 87, 88, 89, # nose + 90, 91, 92, 93, 94, # below nose + 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, # eyes + 107, # right mouth + 108, 109, 110, 111, 112, # upper mouth + 113, # left mouth + 114, 115, 116, 117, 118, # lower mouth + 119, # right lip + 120, 121, 122, # upper lip + 123, # left lip + 124, 125, 126, # lower lip + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, + 140, 141, 142, 143, # face contour + ) + # yapf: enable + + self.joint_part = { + 'body': + range( + self.joints_name.index('Pelvis'), + self.joints_name.index('Nose') + 1), + 'lhand': + range( + self.joints_name.index('L_Thumb_1'), + self.joints_name.index('L_Pinky_4') + 1), + 'rhand': + range( + self.joints_name.index('R_Thumb_1'), + self.joints_name.index('R_Pinky_4') + 1), + 'hand': + range( + self.joints_name.index('L_Thumb_1'), + self.joints_name.index('R_Pinky_4') + 1), + 'face': + range( + self.joints_name.index('Face_1'), + self.joints_name.index('Face_72') + 1) + } + + +def read_annotation_file(annotation_file: str) -> List[Dict]: + with open(annotation_file, 'r') as f: + annotations = json.load(f) + return annotations + + +def cam2pixel(cam_coord, f, c): + x = 
cam_coord[:, 0] / cam_coord[:, 2] * f[0] + c[0] + y = cam_coord[:, 1] / cam_coord[:, 2] * f[1] + c[1] + z = cam_coord[:, 2] + return np.stack((x, y, z), 1) + + +def process_scene_anno(scene: str, annotation_root: str, splits: np.array, + human_model: SMPLX): + annos = read_annotation_file( + osp.join(annotation_root, scene, 'smplx_annotation.json')) + keypoint_annos = COCO( + osp.join(annotation_root, scene, 'keypoint_annotation.json')) + + train_annos = [] + val_annos = [] + train_imgs = [] + val_imgs = [] + + progress_bar = mmengine.ProgressBar(len(keypoint_annos.anns.keys())) + for aid in keypoint_annos.anns.keys(): + ann = keypoint_annos.anns[aid] + img = keypoint_annos.loadImgs(ann['image_id'])[0] + if img['file_name'].startswith('/'): + file_name = img['file_name'][1:] + else: + file_name = img['file_name'] + + video_name = file_name.split('/')[-2] + if 'Trim' in video_name: + video_name = video_name.split('_Trim')[0] + + img_path = os.path.join( + annotation_root.replace('annotations', 'images'), scene, file_name) + if not os.path.exists(img_path): + progress_bar.update() + continue + + smplx_param = annos[str(aid)] + human_model_param = smplx_param['smplx_param'] + cam_param = smplx_param['cam_param'] + if 'lhand_valid' not in human_model_param: + human_model_param['lhand_valid'] = ann['lefthand_valid'] + human_model_param['rhand_valid'] = ann['righthand_valid'] + human_model_param['face_valid'] = ann['face_valid'] + + rotation_valid = np.ones((human_model.orig_joint_num), + dtype=np.float32) + coord_valid = np.ones((human_model.joint_num), dtype=np.float32) + + root_pose, body_pose, shape, trans = (human_model_param['root_pose'], + human_model_param['body_pose'], + human_model_param['shape'], + human_model_param['trans']) + + if 'lhand_pose' in human_model_param and human_model_param.get( + 'lhand_valid', False): + lhand_pose = human_model_param['lhand_pose'] + else: + lhand_pose = np.zeros( + (3 * len(human_model.orig_joint_part['lhand'])), + dtype=np.float32) + rotation_valid[human_model.orig_joint_part['lhand']] = 0 + coord_valid[human_model.orig_joint_part['lhand']] = 0 + + if 'rhand_pose' in human_model_param and human_model_param.get( + 'rhand_valid', False): + rhand_pose = human_model_param['rhand_pose'] + else: + rhand_pose = np.zeros( + (3 * len(human_model.orig_joint_part['rhand'])), + dtype=np.float32) + rotation_valid[human_model.orig_joint_part['rhand']] = 0 + coord_valid[human_model.orig_joint_part['rhand']] = 0 + + if 'jaw_pose' in human_model_param and \ + 'expr' in human_model_param and \ + human_model_param.get('face_valid', False): + jaw_pose = human_model_param['jaw_pose'] + expr = human_model_param['expr'] + else: + jaw_pose = np.zeros((3), dtype=np.float32) + expr = np.zeros((human_model.expr_code_dim), dtype=np.float32) + rotation_valid[human_model.orig_joint_part['face']] = 0 + coord_valid[human_model.orig_joint_part['face']] = 0 + + # init human model inputs + root_pose = torch.FloatTensor(root_pose).view(1, 3) + body_pose = torch.FloatTensor(body_pose).view(-1, 3) + lhand_pose = torch.FloatTensor(lhand_pose).view(-1, 3) + rhand_pose = torch.FloatTensor(rhand_pose).view(-1, 3) + jaw_pose = torch.FloatTensor(jaw_pose).view(-1, 3) + shape = torch.FloatTensor(shape).view(1, -1) + expr = torch.FloatTensor(expr).view(1, -1) + trans = torch.FloatTensor(trans).view(1, -1) + zero_pose = torch.zeros((1, 3), dtype=torch.float32) + with torch.no_grad(): + output = human_model.neutral_model( + betas=shape, + body_pose=body_pose.view(1, -1), + global_orient=root_pose, 
+ transl=trans, + left_hand_pose=lhand_pose.view(1, -1), + right_hand_pose=rhand_pose.view(1, -1), + jaw_pose=jaw_pose.view(1, -1), + leye_pose=zero_pose, + reye_pose=zero_pose, + expression=expr) + + joint_cam = output.joints[0].numpy()[human_model.joint_idx, :] + joint_img = cam2pixel(joint_cam, cam_param['focal'], + cam_param['princpt']) + + joint_cam = (joint_cam - joint_cam[human_model.root_joint_idx, None, :] + ) # root-relative + joint_cam[human_model.joint_part['lhand'], :] = ( + joint_cam[human_model.joint_part['lhand'], :] - + joint_cam[human_model.lwrist_idx, None, :] + ) # left hand root-relative + joint_cam[human_model.joint_part['rhand'], :] = ( + joint_cam[human_model.joint_part['rhand'], :] - + joint_cam[human_model.rwrist_idx, None, :] + ) # right hand root-relative + joint_cam[human_model.joint_part['face'], :] = ( + joint_cam[human_model.joint_part['face'], :] - + joint_cam[human_model.neck_idx, None, :]) # face root-relative + + body_3d_size = 2 + output_hm_shape = (16, 16, 12) + joint_img[human_model.joint_part['body'], + 2] = ((joint_cam[human_model.joint_part['body'], 2].copy() / + (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + joint_img[human_model.joint_part['lhand'], + 2] = ((joint_cam[human_model.joint_part['lhand'], 2].copy() / + (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + joint_img[human_model.joint_part['rhand'], + 2] = ((joint_cam[human_model.joint_part['rhand'], 2].copy() / + (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + joint_img[human_model.joint_part['face'], + 2] = ((joint_cam[human_model.joint_part['face'], 2].copy() / + (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + + keypoints_2d = joint_img[:, :2].copy() + ann_3d = { + **ann, + 'keypoints_3d': joint_cam.tolist(), + } + ann_3d['keypoints'] = keypoints_2d.tolist() + + img['file_name'] = os.path.join(scene, file_name) + if video_name in splits: + val_annos.append(ann_3d) + val_imgs.append(img) + else: + train_annos.append(ann_3d) + train_imgs.append(img) + progress_bar.update() + + categoreis = [{ + 'supercategory': 'person', + 'id': 1, + 'name': 'person', + 'keypoints': human_model.joints_name, + 'skeleton': human_model.flip_pairs + }] + train_data = { + 'images': train_imgs, + 'annotations': train_annos, + 'categories': categoreis + } + val_data = { + 'images': val_imgs, + 'annotations': val_annos, + 'categories': categoreis + } + + mmengine.dump( + train_data, + osp.join(annotation_root, scene, 'train_3dkeypoint_annotation.json')) + mmengine.dump( + val_data, + osp.join(annotation_root, scene, 'val_3dkeypoint_annotation.json')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data-root', type=str, default='data/UBody') + parser.add_argument('--human-model-path', type=str, default='data/SMPLX') + parser.add_argument( + '--nproc', default=8, type=int, help='number of process') + args = parser.parse_args() + + split_path = f'{args.data_root}/splits/intra_scene_test_list.npy' + annotation_path = f'{args.data_root}/annotations' + + folders = os.listdir(annotation_path) + human_model = SMPLX(args.human_model_path) + splits = np.load(split_path) + + if args.nproc > 1: + mmengine.track_parallel_progress( + partial( + process_scene_anno, + annotation_root=annotation_path, + splits=splits, + human_model=human_model), folders, args.nproc) + else: + mmengine.track_progress( + partial( + process_scene_anno, + annotation_root=annotation_path, + splits=splits, + human_model=human_model), folders) From 
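
For reference, the conversion script above combines a standard pinhole projection (`cam2pixel`) with a rescaling of root-relative depth into heatmap bins. Below is a minimal sketch of those two steps; the focal length, principal point and joint coordinates are made-up toy values, not taken from UBody.

```python
import numpy as np

# Toy camera intrinsics and one camera-space joint (metres); illustrative only.
focal = (1500.0, 1500.0)       # (fx, fy)
princpt = (960.0, 540.0)       # (cx, cy)
joint_cam = np.array([[0.2, -0.1, 3.0]])  # (x, y, z) in camera space

# Pinhole projection, as in cam2pixel(): u = x / z * fx + cx, v = y / z * fy + cy
u = joint_cam[:, 0] / joint_cam[:, 2] * focal[0] + princpt[0]
v = joint_cam[:, 1] / joint_cam[:, 2] * focal[1] + princpt[1]
print(u, v)  # [1060.] [490.]

# Depth rescaling used for the third coordinate of joint_img:
# a root-relative depth in roughly [-1, 1] m is mapped to [0, output_hm_shape[0]].
body_3d_size = 2
output_hm_shape = (16, 16, 12)
z_rel = 0.25  # root-relative depth in metres (toy value)
z_bin = (z_rel / (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]
print(z_bin)  # 10.0
```
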
8bb1b8f6b74805caa3465e8af39add258dbdd48d Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Aug 2023 15:42:41 +0800 Subject: [PATCH 02/21] --other=add dataset to registry --- configs/_base_/datasets/ubody3d.py | 12 + ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 168 ++++++++++ mmpose/datasets/datasets/body3d/__init__.py | 3 +- .../datasets/body3d/ubody3d_dataset.py | 302 ++++++++++++++++++ 4 files changed, 484 insertions(+), 1 deletion(-) create mode 100644 configs/_base_/datasets/ubody3d.py create mode 100644 configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py create mode 100644 mmpose/datasets/datasets/body3d/ubody3d_dataset.py diff --git a/configs/_base_/datasets/ubody3d.py b/configs/_base_/datasets/ubody3d.py new file mode 100644 index 0000000000..8a40e9ffcf --- /dev/null +++ b/configs/_base_/datasets/ubody3d.py @@ -0,0 +1,12 @@ +dataset_info = dict( + dataset_name='ubody3d', + paper_info=dict( + author='Jing Lin, Ailing Zeng, Haoqian Wang, Lei Zhang, Yu Li', + title='One-Stage 3D Whole-Body Mesh Recovery with Component Aware' + 'Transformer', + container='IEEE Computer Society Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2023', + homepage='https://github.com/IDEA-Research/OSX', + ), + keypoint_info={}) diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py new file mode 100644 index 0000000000..b3c1c2db80 --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py @@ -0,0 +1,168 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=200, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) + +# learning policy +param_scheduler = [ + dict(type='StepLR', step_size=100000, gamma=0.96, end=80, by_epoch=False) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1)) + +# codec settings +# 3D keypoint normalization parameters +# From file: '{data_root}/annotation_body3d/fps50/joint3d_rel_stats.pkl' +target_mean = [[-2.55652589e-04, -7.11960570e-03, -9.81433052e-04], + [-5.65463051e-03, 3.19636009e-01, 7.19329269e-02], + [-1.01705840e-02, 6.91147892e-01, 1.55352986e-01], + [2.55651315e-04, 7.11954606e-03, 9.81423866e-04], + [-5.09729780e-03, 3.27040413e-01, 7.22258095e-02], + [-9.99656606e-03, 7.08277383e-01, 1.58016408e-01], + [2.90583676e-03, -2.11363307e-01, -4.74210915e-02], + [5.67537804e-03, -4.35088906e-01, -9.76974016e-02], + [5.93884964e-03, -4.91891970e-01, -1.10666618e-01], + [7.37352083e-03, -5.83948619e-01, -1.31171400e-01], + [5.41920653e-03, -3.83931702e-01, -8.68145417e-02], + [2.95964662e-03, -1.87567488e-01, -4.34536934e-02], + [1.26585822e-03, -1.20170579e-01, -2.82526049e-02], + [4.67186639e-03, -3.83644089e-01, -8.55125784e-02], + [1.67648571e-03, -1.97007177e-01, -4.31368364e-02], + [8.70569015e-04, -1.68664569e-01, -3.73902498e-02]], +target_std = [[0.11072244, 0.02238818, 0.07246294], + [0.15856311, 0.18933832, 0.20880479], + [0.19179935, 0.24320062, 0.24756193], + [0.11072181, 0.02238805, 0.07246253], + [0.15880454, 0.19977188, 0.2147063], + [0.18001944, 0.25052739, 
0.24853247], + [0.05210694, 0.05211406, 0.06908241], + [0.09515367, 0.10133032, 0.12899733], + [0.11742458, 0.12648469, 0.16465091], + [0.12360297, 0.13085539, 0.16433336], + [0.14602232, 0.09707956, 0.13952731], + [0.24347532, 0.12982249, 0.20230181], + [0.2446877, 0.21501816, 0.23938235], + [0.13876084, 0.1008926, 0.1424411], + [0.23687529, 0.14491219, 0.20980829], + [0.24400695, 0.23975028, 0.25520584]] +# 2D keypoint normalization parameters +# From file: '{data_root}/annotation_body3d/fps50/joint2d_stats.pkl' +keypoints_mean = [[532.08351635, 419.74137558], [531.80953144, 418.2607141], + [530.68456967, 493.54259285], [529.36968722, 575.96448516], + [532.29767646, 421.28483336], [531.93946631, 494.72186795], + [529.71984447, 578.96110365], [532.93699382, 370.65225054], + [534.1101856, 317.90342311], [534.55416813, 304.24143901], + [534.86955004, 282.31030885], [534.11308566, 330.11296796], + [533.53637525, 376.2742511], [533.49380107, 391.72324565], + [533.52579142, 330.09494668], [532.50804964, 374.190479], + [532.72786934, 380.61615716]], +keypoints_std = [[107.73640054, 63.35908715], [119.00836213, 64.1215443], + [119.12412107, 50.53806215], [120.61688045, 56.38444891], + [101.95735275, 62.89636486], [106.24832897, 48.41178119], + [108.46734966, 54.58177071], [109.07369806, 68.70443672], + [111.20130351, 74.87287863], [111.63203838, 77.80542514], + [113.22330788, 79.90670556], [105.7145833, 73.27049436], + [107.05804267, 73.93175781], [107.97449418, 83.30391802], + [121.60675105, 74.25691526], [134.34378973, 77.48125087], + [131.79990652, 89.86721124]] +codec = dict( + type='ImagePoseLifting', + num_keypoints=17, + root_index=0, + remove_root=True, + target_mean=target_mean, + target_std=target_std, + keypoints_mean=keypoints_mean, + keypoints_std=keypoints_std) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(1, 1, 1), + dropout=0.5, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=16, + loss=dict(type='MSELoss'), + decoder=codec, + )) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root', 'target_root_index', 'target_mean', + 'target_std')) +] +val_pipeline = train_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + causal=True, + keypoint_2d_src='gt', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + causal=True, + keypoint_2d_src='gt', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/mmpose/datasets/datasets/body3d/__init__.py 
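
The long mean/std tables above are the Human3.6M statistics this baseline config starts from. Conceptually, the `ImagePoseLifting` codec uses them to standardise the 2D inputs and the root-relative 3D targets; the snippet below is only a rough NumPy sketch of that normalisation under those assumptions, not the actual codec implementation.

```python
import numpy as np

def normalize_sample(kpts_2d, kpts_3d, kpt_mean, kpt_std,
                     tgt_mean, tgt_std, root_index=0):
    """Rough sketch of the codec-style normalisation (illustrative only).

    kpts_2d: (K, 2) pixel coordinates; kpts_3d: (K, 3) camera-space joints.
    """
    # standardise the 2D input keypoints
    inputs = (kpts_2d - kpt_mean) / kpt_std
    # make the 3D target root-relative, drop the root joint, then standardise
    root = kpts_3d[root_index:root_index + 1]
    target = np.delete(kpts_3d - root, root_index, axis=0)
    target = (target - tgt_mean) / tgt_std
    return inputs, target
```
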
b/mmpose/datasets/datasets/body3d/__init__.py index d5afeca578..2b52caeadd 100644 --- a/mmpose/datasets/datasets/body3d/__init__.py +++ b/mmpose/datasets/datasets/body3d/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .h36m_dataset import Human36mDataset +from .ubody3d_dataset import UBody3dDataset -__all__ = ['Human36mDataset'] +__all__ = ['Human36mDataset', 'UBody3dDataset'] diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py new file mode 100644 index 0000000000..a7aca146ca --- /dev/null +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections import defaultdict +from typing import Callable, List, Optional, Sequence, Tuple, Union + +import numpy as np +from mmengine.fileio import exists, get_local_path +from mmengine.utils import is_abs + +from mmpose.datasets.datasets import BaseMocapDataset +from mmpose.registry import DATASETS + + +@DATASETS.register_module() +class UBody3dDataset(BaseMocapDataset): + """Ubody3d dataset for 3D human pose estimation. + + "One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer", + CVPR'2023. More details can be found in the `paper + `__ . + + Ubody3D keypoints:: + + 0-24: 25 body keypoints, + 25-64: 40 hand keypoints, + 65-136: 72 face keypoints, + + In total, we have 137 keypoints for wholebody 3D pose estimation. + + Args: + ann_file (str): Annotation file path. Default: ''. + seq_len (int): Number of frames in a sequence. Default: 1. + seq_step (int): The interval for extracting frames from the video. + Default: 1. + multiple_target (int): If larger than 0, merge every + ``multiple_target`` sequence together. Default: 0. + multiple_target_step (int): The interval for merging sequence. Only + valid when ``multiple_target`` is larger than 0. Default: 0. + pad_video_seq (bool): Whether to pad the video so that poses will be + predicted for every frame in the video. Default: ``False``. + causal (bool): If set to ``True``, the rightmost input frame will be + the target frame. Otherwise, the middle input frame will be the + target frame. Default: ``True``. + subset_frac (float): The fraction to reduce dataset size. If set to 1, + the dataset size is not reduced. Default: 1. + keypoint_2d_src (str): Specifies 2D keypoint information options, which + should be one of the following options: + + - ``'gt'``: load from the annotation file + - ``'detection'``: load from a detection + result file of 2D keypoint + - 'pipeline': the information will be generated by the pipeline + + Default: ``'gt'``. + keypoint_2d_det_file (str, optional): The 2D keypoint detection file. + If set, 2d keypoint loaded from this file will be used instead of + ground-truth keypoints. This setting is only when + ``keypoint_2d_src`` is ``'detection'``. Default: ``None``. + factor_file (str, optional): The projection factors' file. If set, + factor loaded from this file will be used instead of calculated + factors. Default: ``None``. + camera_param_file (str): Cameras' parameters file. Default: ``None``. + data_mode (str): Specifies the mode of data samples: ``'topdown'`` or + ``'bottomup'``. In ``'topdown'`` mode, each data sample contains + one instance; while in ``'bottomup'`` mode, each data sample + contains all instances in a image. Default: ``'topdown'`` + metainfo (dict, optional): Meta information for dataset, such as class + information. Default: ``None``. 
+ data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Default: ``None``. + data_prefix (dict, optional): Prefix for training data. + Default: ``dict(img='')``. + filter_cfg (dict, optional): Config for filter data. Default: `None`. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Default: ``None`` which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. + Default: ``True``. + pipeline (list, optional): Processing pipeline. Default: []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Default: ``False``. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=False``. Default: ``False``. + max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Default: 1000. + """ + + METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py') + SUPPORTED_keypoint_2d_src = {'gt', 'detection', 'pipeline'} + + def __init__(self, + ann_file: str = '', + seq_len: int = 1, + seq_step: int = 1, + multiple_target: int = 0, + multiple_target_step: int = 0, + pad_video_seq: bool = False, + causal: bool = True, + subset_frac: float = 1.0, + keypoint_2d_src: str = 'gt', + keypoint_2d_det_file: Optional[str] = None, + factor_file: Optional[str] = None, + camera_param_file: Optional[str] = None, + data_mode: str = 'topdown', + metainfo: Optional[dict] = None, + data_root: Optional[str] = None, + data_prefix: dict = dict(img=''), + filter_cfg: Optional[dict] = None, + indices: Optional[Union[int, Sequence[int]]] = None, + serialize_data: bool = True, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + lazy_init: bool = False, + max_refetch: int = 1000): + # check keypoint_2d_src + self.keypoint_2d_src = keypoint_2d_src + if self.keypoint_2d_src not in self.SUPPORTED_keypoint_2d_src: + raise ValueError( + f'Unsupported `keypoint_2d_src` "{self.keypoint_2d_src}". 
' + f'Supported options are {self.SUPPORTED_keypoint_2d_src}') + + if keypoint_2d_det_file: + if not is_abs(keypoint_2d_det_file): + self.keypoint_2d_det_file = osp.join(data_root, + keypoint_2d_det_file) + else: + self.keypoint_2d_det_file = keypoint_2d_det_file + + self.seq_step = seq_step + self.pad_video_seq = pad_video_seq + + if factor_file: + if not is_abs(factor_file): + factor_file = osp.join(data_root, factor_file) + assert exists(factor_file), (f'`factor_file`: {factor_file}' + 'does not exist.') + self.factor_file = factor_file + + if multiple_target > 0 and multiple_target_step == 0: + multiple_target_step = multiple_target + self.multiple_target_step = multiple_target_step + + super().__init__( + ann_file=ann_file, + seq_len=seq_len, + multiple_target=multiple_target, + causal=causal, + subset_frac=subset_frac, + camera_param_file=camera_param_file, + data_mode=data_mode, + metainfo=metainfo, + data_root=data_root, + data_prefix=data_prefix, + filter_cfg=filter_cfg, + indices=indices, + serialize_data=serialize_data, + pipeline=pipeline, + test_mode=test_mode, + lazy_init=lazy_init, + max_refetch=max_refetch) + + def get_sequence_indices(self) -> List[List[int]]: + """Split original videos into sequences and build frame indices. + + This method overrides the default one in the base class. + """ + imgnames = self.ann_data['imgname'] + video_frames = defaultdict(list) + for idx, imgname in enumerate(imgnames): + subj, action, camera = self._parse_h36m_imgname(imgname) + video_frames[(subj, action, camera)].append(idx) + + # build sample indices + sequence_indices = [] + _len = (self.seq_len - 1) * self.seq_step + 1 + _step = self.seq_step + + if self.multiple_target: + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + seqs_from_video = [ + _indices[i:(i + self.multiple_target):_step] + for i in range(0, n_frame, self.multiple_target_step) + ][:(n_frame + self.multiple_target_step - + self.multiple_target) // self.multiple_target_step] + sequence_indices.extend(seqs_from_video) + + else: + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + + if self.pad_video_seq: + # Pad the sequence so that every frame in the sequence will + # be predicted. 
+ if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max( + 0, frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sequence_indices.append([_indices[0]] * pad_left + + _indices[start:end:_step] + + [_indices[-1]] * pad_right) + else: + seqs_from_video = [ + _indices[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1) + ] + sequence_indices.extend(seqs_from_video) + + # reduce dataset size if needed + subset_size = int(len(sequence_indices) * self.subset_frac) + start = np.random.randint(0, len(sequence_indices) - subset_size + 1) + end = start + subset_size + + sequence_indices = sequence_indices[start:end] + + return sequence_indices + + def _load_annotations(self) -> Tuple[List[dict], List[dict]]: + instance_list, image_list = super()._load_annotations() + + h36m_data = self.ann_data + kpts_3d = h36m_data['S'] + + if self.keypoint_2d_src == 'detection': + assert exists(self.keypoint_2d_det_file), ( + f'`keypoint_2d_det_file`: `{self.keypoint_2d_det_file}`' + 'does not exist.') + kpts_2d = self._load_keypoint_2d_detection( + self.keypoint_2d_det_file) + assert kpts_2d.shape[0] == kpts_3d.shape[0], ( + f'Number of `kpts_2d` ({kpts_2d.shape[0]}) does not match ' + f'number of `kpts_3d` ({kpts_3d.shape[0]}).') + + assert kpts_2d.shape[2] == 3, ( + f'Expect `kpts_2d.shape[2]` == 3, but got ' + f'{kpts_2d.shape[2]}. Please check the format of ' + f'{self.keypoint_2d_det_file}') + + for idx, frame_ids in enumerate(self.sequence_indices): + kpt_2d = kpts_2d[frame_ids].astype(np.float32) + keypoints = kpt_2d[..., :2] + keypoints_visible = kpt_2d[..., 2] + instance_list[idx].update({ + 'keypoints': + keypoints, + 'keypoints_visible': + keypoints_visible + }) + if self.factor_file: + with get_local_path(self.factor_file) as local_path: + factors = np.load(local_path).astype(np.float32) + else: + factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) + assert factors.shape[0] == kpts_3d.shape[0], ( + f'Number of `factors` ({factors.shape[0]}) does not match ' + f'number of `kpts_3d` ({kpts_3d.shape[0]}).') + + for idx, frame_ids in enumerate(self.sequence_indices): + factor = factors[frame_ids].astype(np.float32) + instance_list[idx].update({'factor': factor}) + + return instance_list, image_list + + @staticmethod + def _parse_h36m_imgname(imgname) -> Tuple[str, str, str]: + """Parse imgname to get information of subject, action and camera. 
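
The padding branch above repeats the boundary frames so that a prediction window exists for every frame of a video. The following standalone sketch reproduces that index construction for the causal case with `seq_step=1` (a simplifying assumption) and shows its output on a toy 5-frame sequence.

```python
def pad_causal_indices(frame_ids, seq_len):
    """Sketch of the causal padding above with seq_step=1 (illustrative only)."""
    n = len(frame_ids)
    left = seq_len - 1
    seqs = []
    for i in range(n):
        pad = max(0, left - i)          # repeat the first frame when near the start
        start = max(0, i - left)
        seqs.append([frame_ids[0]] * pad + frame_ids[start:i + 1])
    return seqs

print(pad_causal_indices([10, 11, 12, 13, 14], seq_len=3))
# [[10, 10, 10], [10, 10, 11], [10, 11, 12], [11, 12, 13], [12, 13, 14]]
```
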
+ + A typical h36m image filename is like: + S1_Directions_1.54138969_000001.jpg + """ + subj, rest = osp.basename(imgname).split('_', 1) + action, rest = rest.split('.', 1) + camera, rest = rest.split('_', 1) + return subj, action, camera + + def get_camera_param(self, imgname) -> dict: + """Get camera parameters of a frame by its image name.""" + assert hasattr(self, 'camera_param') + subj, _, camera = self._parse_h36m_imgname(imgname) + return self.camera_param[(subj, camera)] + + def _load_keypoint_2d_detection(self, det_file): + """"Load 2D joint detection results from file.""" + with get_local_path(det_file) as local_path: + kpts_2d = np.load(local_path).astype(np.float32) + + return kpts_2d From 225fded8270f19b6cc25246d8e55cc2762e4cca5 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Aug 2023 16:22:40 +0800 Subject: [PATCH 03/21] --ohter=update config --- configs/wholebody_3d_keypoint/README.md | 0 ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 62 +++++++++++++------ 2 files changed, 43 insertions(+), 19 deletions(-) create mode 100644 configs/wholebody_3d_keypoint/README.md rename configs/{body_3d_keypoint/pose_lift => wholebody_3d_keypoint}/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py (83%) diff --git a/configs/wholebody_3d_keypoint/README.md b/configs/wholebody_3d_keypoint/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py similarity index 83% rename from configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py rename to configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py index b3c1c2db80..132038ba93 100644 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py +++ b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py @@ -112,8 +112,39 @@ )) # base dataset settings -dataset_type = 'Human36mDataset' -data_root = 'data/h36m/' +dataset_type = 'UBody3dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [] +val_datasets = [] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_3dkeypoint_annotation.json', + seq_len=1, + causal=True, + keypoint_2d_src='gt', + data_prefix=dict(img='images/'), + pipeline=[]) + val_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', + data_prefix=dict(img='images/'), + pipeline=[]) + train_datasets.append(train_dataset) + val_datasets.append(val_dataset) # pipelines train_pipeline = [ @@ -133,30 +164,23 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, - ann_file='annotation_body3d/fps50/h36m_train.npz', - seq_len=1, - causal=True, - keypoint_2d_src='gt', - data_root=data_root, - data_prefix=dict(img='images/'), + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), + datasets=train_datasets, pipeline=train_pipeline, + test_mode=False, )) val_dataloader = dict( batch_size=64, 
num_workers=2, persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, - ann_file='annotation_body3d/fps50/h36m_test.npz', - seq_len=1, - causal=True, - keypoint_2d_src='gt', - data_root=data_root, - data_prefix=dict(img='images/'), - pipeline=train_pipeline, + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), + datasets=val_datasets, + pipeline=val_pipeline, + test_mode=True, )) test_dataloader = val_dataloader From 828799da3ba3e26d6c24993b8c3bd0a1dec186e2 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Aug 2023 17:00:08 +0800 Subject: [PATCH 04/21] --other=add metainfo --- configs/_base_/datasets/ubody3d.py | 930 ++++++++++++++++++++++++++++- 1 file changed, 929 insertions(+), 1 deletion(-) diff --git a/configs/_base_/datasets/ubody3d.py b/configs/_base_/datasets/ubody3d.py index 8a40e9ffcf..a971ef4614 100644 --- a/configs/_base_/datasets/ubody3d.py +++ b/configs/_base_/datasets/ubody3d.py @@ -9,4 +9,932 @@ year='2023', homepage='https://github.com/IDEA-Research/OSX', ), - keypoint_info={}) + keypoint_info={ + 0: + dict(name='Pelvis', id=0, color=[0, 255, 0], type='', swap=''), + 1: + dict( + name='L_Hip', id=1, color=[0, 255, 0], type='lower', swap='R_Hip'), + 2: + dict( + name='R_Hip', id=2, color=[0, 255, 0], type='lower', swap='L_Hip'), + 3: + dict( + name='L_Knee', + id=3, + color=[0, 255, 0], + type='lower', + swap='R_Knee'), + 4: + dict( + name='R_Knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 5: + dict( + name='L_Ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='R_Ankle'), + 6: + dict( + name='R_Ankle', + id=6, + color=[0, 255, 0], + type='lower', + swap='L_Ankle'), + 7: + dict(name='Neck', id=7, color=[0, 255, 0], type='upper', swap=''), + 8: + dict( + name='L_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='R_Shoulder'), + 9: + dict( + name='R_Shoulder', + id=9, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 10: + dict( + name='L_Elbow', + id=10, + color=[0, 255, 0], + type='upper', + swap='R_Elbow'), + 11: + dict( + name='R_Elbow', + id=11, + color=[0, 255, 0], + type='upper', + swap='L_Elbow'), + 12: + dict( + name='L_Wrist', + id=12, + color=[0, 255, 0], + type='upper', + swap='R_Wrist'), + 13: + dict( + name='R_Wrist', + id=13, + color=[0, 255, 0], + type='upper', + swap='L_Wrist'), + 14: + dict( + name='L_Big_toe', + id=14, + color=[0, 255, 0], + type='lower', + swap='R_Big_toe'), + 15: + dict( + name='L_Small_toe', + id=15, + color=[0, 255, 0], + type='lower', + swap='R_Small_toe'), + 16: + dict( + name='L_Heel', + id=16, + color=[0, 255, 0], + type='lower', + swap='R_Heel'), + 17: + dict( + name='R_Big_toe', + id=17, + color=[0, 255, 0], + type='lower', + swap='L_Big_toe'), + 18: + dict( + name='R_Small_toe', + id=18, + color=[0, 255, 0], + type='lower', + swap='L_Small_toe'), + 19: + dict( + name='R_Heel', + id=19, + color=[0, 255, 0], + type='lower', + swap='L_Heel'), + 20: + dict( + name='L_Ear', id=20, color=[0, 255, 0], type='upper', + swap='R_Ear'), + 21: + dict( + name='R_Ear', id=21, color=[0, 255, 0], type='upper', + swap='L_Ear'), + 22: + dict(name='L_Eye', id=22, color=[0, 255, 0], type='', swap='R_Eye'), + 23: + dict(name='R_Eye', id=23, color=[0, 255, 0], type='', swap='L_Eye'), + 24: + dict(name='Nose', id=24, color=[0, 255, 0], type='upper', swap=''), + 25: + dict( + name='L_Thumb_1', + id=25, + 
color=[255, 128, 0], + type='', + swap='R_Thumb_1'), + 26: + dict( + name='L_Thumb_2', + id=26, + color=[255, 128, 0], + type='', + swap='R_Thumb_2'), + 27: + dict( + name='L_Thumb_3', + id=27, + color=[255, 128, 0], + type='', + swap='R_Thumb_3'), + 28: + dict( + name='L_Thumb4', + id=28, + color=[255, 128, 0], + type='', + swap='R_Thumb_4'), + 29: + dict( + name='L_Index_1', + id=29, + color=[255, 128, 0], + type='', + swap='R_Index_1'), + 30: + dict( + name='L_Index_2', + id=30, + color=[255, 128, 0], + type='', + swap='R_Index_2'), + 31: + dict( + name='L_Index_3', + id=31, + color=[255, 128, 0], + type='', + swap='R_Index_3'), + 32: + dict( + name='L_Index_4', + id=32, + color=[255, 128, 0], + type='', + swap='R_Index_4'), + 33: + dict( + name='L_Middle_1', + id=33, + color=[255, 128, 0], + type='', + swap='R_Middle_1'), + 34: + dict( + name='L_Middle_2', + id=34, + color=[255, 128, 0], + type='', + swap='R_Middle_2'), + 35: + dict( + name='L_Middle_3', + id=35, + color=[255, 128, 0], + type='', + swap='R_Middle_3'), + 36: + dict( + name='L_Middle_4', + id=36, + color=[255, 128, 0], + type='', + swap='R_Middle_4'), + 37: + dict( + name='L_Ring_1', + id=37, + color=[255, 128, 0], + type='', + swap='R_Ring_1'), + 38: + dict( + name='L_Ring_2', + id=38, + color=[255, 128, 0], + type='', + swap='R_Ring_2'), + 39: + dict( + name='L_Ring_3', + id=39, + color=[255, 128, 0], + type='', + swap='R_Ring_3'), + 40: + dict( + name='L_Ring_4', + id=40, + color=[255, 128, 0], + type='', + swap='R_Ring_4'), + 41: + dict( + name='L_Pinky_1', + id=41, + color=[255, 128, 0], + type='', + swap='R_Pinky_1'), + 42: + dict( + name='L_Pinky_2', + id=42, + color=[255, 128, 0], + type='', + swap='R_Pinky_2'), + 43: + dict( + name='L_Pinky_3', + id=43, + color=[255, 128, 0], + type='', + swap='R_Pinky_3'), + 44: + dict( + name='L_Pinky_4', + id=44, + color=[255, 128, 0], + type='', + swap='R_Pinky_4'), + 45: + dict( + name='R_Thumb_1', + id=45, + color=[255, 128, 0], + type='', + swap='L_Thumb_1'), + 46: + dict( + name='R_Thumb_2', + id=46, + color=[255, 128, 0], + type='', + swap='L_Thumb_2'), + 47: + dict( + name='R_Thumb_3', + id=47, + color=[255, 128, 0], + type='', + swap='L_Thumb_3'), + 48: + dict( + name='R_Thumb_4', + id=48, + color=[255, 128, 0], + type='', + swap='L_Thumb4'), + 49: + dict( + name='R_Index_1', + id=49, + color=[255, 128, 0], + type='', + swap='L_Index_1'), + 50: + dict( + name='R_Index_2', + id=50, + color=[255, 128, 0], + type='', + swap='L_Index_2'), + 51: + dict( + name='R_Index_3', + id=51, + color=[255, 128, 0], + type='', + swap='L_Index_3'), + 52: + dict( + name='R_Index_4', + id=52, + color=[255, 128, 0], + type='', + swap='L_Index_4'), + 53: + dict( + name='R_Middle_1', + id=53, + color=[255, 128, 0], + type='', + swap='L_Middle_1'), + 54: + dict( + name='R_Middle_2', + id=54, + color=[255, 128, 0], + type='', + swap='L_Middle_2'), + 55: + dict( + name='R_Middle_3', + id=55, + color=[255, 128, 0], + type='', + swap='L_Middle_3'), + 56: + dict( + name='R_Middle_4', + id=56, + color=[255, 128, 0], + type='', + swap='L_Middle_4'), + 57: + dict( + name='R_Ring_1', + id=57, + color=[255, 128, 0], + type='', + swap='L_Ring_1'), + 58: + dict( + name='R_Ring_2', + id=58, + color=[255, 128, 0], + type='', + swap='L_Ring_2'), + 59: + dict( + name='R_Ring_3', + id=59, + color=[255, 128, 0], + type='', + swap='L_Ring_3'), + 60: + dict( + name='R_Ring_4', + id=60, + color=[255, 128, 0], + type='', + swap='L_Ring_4'), + 61: + dict( + name='R_Pinky_1', + id=61, + color=[255, 128, 0], + 
type='', + swap='L_Pinky_1'), + 62: + dict( + name='R_Pinky_2', + id=62, + color=[255, 128, 0], + type='', + swap='L_Pinky_2'), + 63: + dict( + name='R_Pinky_3', + id=63, + color=[255, 128, 0], + type='', + swap='L_Pinky_3'), + 64: + dict( + name='R_Pinky_4', + id=64, + color=[255, 128, 0], + type='', + swap='L_Pinky_4'), + 65: + dict(name='Face_1', id=65, color=[255, 255, 255], type='', swap=''), + 66: + dict(name='Face_2', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='Face_3', + id=67, + color=[255, 255, 255], + type='', + swap='Face_4'), + 68: + dict( + name='Face_4', + id=68, + color=[255, 255, 255], + type='', + swap='Face_3'), + 69: + dict( + name='Face_5', + id=69, + color=[255, 255, 255], + type='', + swap='Face_14'), + 70: + dict( + name='Face_6', + id=70, + color=[255, 255, 255], + type='', + swap='Face_13'), + 71: + dict( + name='Face_7', + id=71, + color=[255, 255, 255], + type='', + swap='Face_12'), + 72: + dict( + name='Face_8', + id=72, + color=[255, 255, 255], + type='', + swap='Face_11'), + 73: + dict( + name='Face_9', + id=73, + color=[255, 255, 255], + type='', + swap='Face_10'), + 74: + dict( + name='Face_10', + id=74, + color=[255, 255, 255], + type='', + swap='Face_9'), + 75: + dict( + name='Face_11', + id=75, + color=[255, 255, 255], + type='', + swap='Face_8'), + 76: + dict( + name='Face_12', + id=76, + color=[255, 255, 255], + type='', + swap='Face_7'), + 77: + dict( + name='Face_13', + id=77, + color=[255, 255, 255], + type='', + swap='Face_6'), + 78: + dict( + name='Face_14', + id=78, + color=[255, 255, 255], + type='', + swap='Face_5'), + 79: + dict(name='Face_15', id=79, color=[255, 255, 255], type='', swap=''), + 80: + dict(name='Face_16', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict(name='Face_17', id=81, color=[255, 255, 255], type='', swap=''), + 82: + dict(name='Face_18', id=82, color=[255, 255, 255], type='', swap=''), + 83: + dict( + name='Face_19', + id=83, + color=[255, 255, 255], + type='', + swap='Face_23'), + 84: + dict( + name='Face_20', + id=84, + color=[255, 255, 255], + type='', + swap='Face_22'), + 85: + dict(name='Face_21', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='Face_22', + id=86, + color=[255, 255, 255], + type='', + swap='Face_20'), + 87: + dict( + name='Face_23', + id=87, + color=[255, 255, 255], + type='', + swap='Face_19'), + 88: + dict( + name='Face_24', + id=88, + color=[255, 255, 255], + type='', + swap='Face_33'), + 89: + dict( + name='Face_25', + id=89, + color=[255, 255, 255], + type='', + swap='Face_32'), + 90: + dict( + name='Face_26', + id=90, + color=[255, 255, 255], + type='', + swap='Face_31'), + 91: + dict( + name='Face_27', + id=91, + color=[255, 255, 255], + type='', + swap='Face_30'), + 92: + dict( + name='Face_28', + id=92, + color=[255, 255, 255], + type='', + swap='Face_35'), + 93: + dict( + name='Face_29', + id=93, + color=[255, 255, 255], + type='', + swap='Face_34'), + 94: + dict( + name='Face_30', + id=94, + color=[255, 255, 255], + type='', + swap='Face_27'), + 95: + dict( + name='Face_31', + id=95, + color=[255, 255, 255], + type='', + swap='Face_26'), + 96: + dict( + name='Face_32', + id=96, + color=[255, 255, 255], + type='', + swap='Face_25'), + 97: + dict( + name='Face_33', + id=97, + color=[255, 255, 255], + type='', + swap='Face_24'), + 98: + dict( + name='Face_34', + id=98, + color=[255, 255, 255], + type='', + swap='Face_29'), + 99: + dict( + name='Face_35', + id=99, + color=[255, 255, 255], + type='', + swap='Face_28'), + 100: + 
dict( + name='Face_36', + id=100, + color=[255, 255, 255], + type='', + swap='Face_42'), + 101: + dict( + name='Face_37', + id=101, + color=[255, 255, 255], + type='', + swap='Face_41'), + 102: + dict( + name='Face_38', + id=102, + color=[255, 255, 255], + type='', + swap='Face_40'), + 103: + dict(name='Face_39', id=103, color=[255, 255, 255], type='', swap=''), + 104: + dict( + name='Face_40', + id=104, + color=[255, 255, 255], + type='', + swap='Face_38'), + 105: + dict( + name='Face_41', + id=105, + color=[255, 255, 255], + type='', + swap='Face_37'), + 106: + dict( + name='Face_42', + id=106, + color=[255, 255, 255], + type='', + swap='Face_36'), + 107: + dict( + name='Face_43', + id=107, + color=[255, 255, 255], + type='', + swap='Face_47'), + 108: + dict( + name='Face_44', + id=108, + color=[255, 255, 255], + type='', + swap='Face_46'), + 109: + dict(name='Face_45', id=109, color=[255, 255, 255], type='', swap=''), + 110: + dict( + name='Face_46', + id=110, + color=[255, 255, 255], + type='', + swap='Face_44'), + 111: + dict( + name='Face_47', + id=111, + color=[255, 255, 255], + type='', + swap='Face_43'), + 112: + dict( + name='Face_48', + id=112, + color=[255, 255, 255], + type='', + swap='Face_52'), + 113: + dict( + name='Face_49', + id=113, + color=[255, 255, 255], + type='', + swap='Face_51'), + 114: + dict(name='Face_50', id=114, color=[255, 255, 255], type='', swap=''), + 115: + dict( + name='Face_51', + id=115, + color=[255, 255, 255], + type='', + swap='Face_49'), + 116: + dict( + name='Face_52', + id=116, + color=[255, 255, 255], + type='', + swap='Face_48'), + 117: + dict( + name='Face_53', + id=117, + color=[255, 255, 255], + type='', + swap='Face_55'), + 118: + dict(name='Face_54', id=118, color=[255, 255, 255], type='', swap=''), + 119: + dict( + name='Face_55', + id=119, + color=[255, 255, 255], + type='', + swap='Face_53'), + 120: + dict( + name='Face_56', + id=120, + color=[255, 255, 255], + type='', + swap='Face_72'), + 121: + dict( + name='Face_57', + id=121, + color=[255, 255, 255], + type='', + swap='Face_71'), + 122: + dict( + name='Face_58', + id=122, + color=[255, 255, 255], + type='', + swap='Face_70'), + 123: + dict( + name='Face_59', + id=123, + color=[255, 255, 255], + type='', + swap='Face_69'), + 124: + dict( + name='Face_60', + id=124, + color=[255, 255, 255], + type='', + swap='Face_68'), + 125: + dict( + name='Face_61', + id=125, + color=[255, 255, 255], + type='', + swap='Face_67'), + 126: + dict( + name='Face_62', + id=126, + color=[255, 255, 255], + type='', + swap='Face_66'), + 127: + dict( + name='Face_63', + id=127, + color=[255, 255, 255], + type='', + swap='Face_65'), + 128: + dict(name='Face_64', id=128, color=[255, 255, 255], type='', swap=''), + 129: + dict( + name='Face_65', + id=129, + color=[255, 255, 255], + type='', + swap='Face_63'), + 130: + dict( + name='Face_66', + id=130, + color=[255, 255, 255], + type='', + swap='Face_62'), + 131: + dict( + name='Face_67', + id=131, + color=[255, 255, 255], + type='', + swap='Face_61'), + 132: + dict( + name='Face_68', + id=132, + color=[255, 255, 255], + type='', + swap='Face_60'), + 133: + dict( + name='Face_69', + id=133, + color=[255, 255, 255], + type='', + swap='Face_59'), + 134: + dict( + name='Face_70', + id=134, + color=[255, 255, 255], + type='', + swap='Face_58'), + 135: + dict( + name='Face_71', + id=135, + color=[255, 255, 255], + type='', + swap='Face_57'), + 136: + dict( + name='Face_72', + id=136, + color=[255, 255, 255], + type='', + swap='Face_56'), + }, + skeleton_info={ + 0: 
dict(link=('L_Hip', 'R_Hip'), id=0, color=[0, 255, 0]), + 1: dict(link=('L_Knee', 'R_Knee'), id=1, color=[0, 255, 0]), + 2: dict(link=('L_Ankle', 'R_Ankle'), id=2, color=[0, 255, 0]), + 3: dict(link=('L_Shoulder', 'R_Shoulder'), id=3, color=[0, 255, 0]), + 4: dict(link=('L_Elbow', 'R_Elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('L_Wrist', 'R_Wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('L_Big_toe', 'R_Big_toe'), id=6, color=[0, 255, 0]), + 7: dict(link=('L_Small_toe', 'R_Small_toe'), id=7, color=[0, 255, 0]), + 8: dict(link=('L_Heel', 'R_Heel'), id=8, color=[0, 255, 0]), + 9: dict(link=('L_Ear', 'R_Ear'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_Eye', 'R_Eye'), id=10, color=[0, 255, 0]), + 11: dict(link=('L_Thumb_1', 'R_Thumb_1'), id=11, color=[255, 128, 0]), + 12: dict(link=('L_Thumb_2', 'R_Thumb_2'), id=12, color=[255, 128, 0]), + 13: dict(link=('L_Thumb_3', 'R_Thumb_3'), id=13, color=[255, 128, 0]), + 14: dict(link=('L_Thumb4', 'R_Thumb_4'), id=14, color=[255, 128, 0]), + 15: dict(link=('L_Index_1', 'R_Index_1'), id=15, color=[255, 128, 0]), + 16: dict(link=('L_Index_2', 'R_Index_2'), id=16, color=[255, 128, 0]), + 17: dict(link=('L_Index_3', 'R_Index_3'), id=17, color=[255, 128, 0]), + 18: dict(link=('L_Index_4', 'R_Index_4'), id=18, color=[255, 128, 0]), + 19: + dict(link=('L_Middle_1', 'R_Middle_1'), id=19, color=[255, 128, 0]), + 20: + dict(link=('L_Middle_2', 'R_Middle_2'), id=20, color=[255, 128, 0]), + 21: + dict(link=('L_Middle_3', 'R_Middle_3'), id=21, color=[255, 128, 0]), + 22: + dict(link=('L_Middle_4', 'R_Middle_4'), id=22, color=[255, 128, 0]), + 23: dict(link=('L_Ring_1', 'R_Ring_1'), id=23, color=[255, 128, 0]), + 24: dict(link=('L_Ring_2', 'R_Ring_2'), id=24, color=[255, 128, 0]), + 25: dict(link=('L_Ring_3', 'R_Ring_3'), id=25, color=[255, 128, 0]), + 26: dict(link=('L_Ring_4', 'R_Ring_4'), id=26, color=[255, 128, 0]), + 27: dict(link=('L_Pinky_1', 'R_Pinky_1'), id=27, color=[255, 128, 0]), + 28: dict(link=('L_Pinky_2', 'R_Pinky_2'), id=28, color=[255, 128, 0]), + 29: dict(link=('L_Pinky_3', 'R_Pinky_3'), id=29, color=[255, 128, 0]), + 30: dict(link=('L_Pinky_4', 'R_Pinky_4'), id=30, color=[255, 128, 0]), + 31: dict(link=('Face_3', 'Face_4'), id=31, color=[255, 255, 255]), + 32: dict(link=('Face_5', 'Face_14'), id=32, color=[255, 255, 255]), + 33: dict(link=('Face_6', 'Face_13'), id=33, color=[255, 255, 255]), + 34: dict(link=('Face_7', 'Face_12'), id=34, color=[255, 255, 255]), + 35: dict(link=('Face_8', 'Face_11'), id=35, color=[255, 255, 255]), + 36: dict(link=('Face_9', 'Face_10'), id=36, color=[255, 255, 255]), + 37: dict(link=('Face_19', 'Face_23'), id=37, color=[255, 255, 255]), + 38: dict(link=('Face_20', 'Face_22'), id=38, color=[255, 255, 255]), + 39: dict(link=('Face_24', 'Face_33'), id=39, color=[255, 255, 255]), + 40: dict(link=('Face_25', 'Face_32'), id=40, color=[255, 255, 255]), + 41: dict(link=('Face_26', 'Face_31'), id=41, color=[255, 255, 255]), + 42: dict(link=('Face_27', 'Face_30'), id=42, color=[255, 255, 255]), + 43: dict(link=('Face_28', 'Face_35'), id=43, color=[255, 255, 255]), + 44: dict(link=('Face_29', 'Face_34'), id=44, color=[255, 255, 255]), + 45: dict(link=('Face_36', 'Face_42'), id=45, color=[255, 255, 255]), + 46: dict(link=('Face_37', 'Face_41'), id=46, color=[255, 255, 255]), + 47: dict(link=('Face_38', 'Face_40'), id=47, color=[255, 255, 255]), + 48: dict(link=('Face_43', 'Face_47'), id=48, color=[255, 255, 255]), + 49: dict(link=('Face_44', 'Face_46'), id=49, color=[255, 255, 255]), + 50: dict(link=('Face_48', 
'Face_52'), id=50, color=[255, 255, 255]), + 51: dict(link=('Face_49', 'Face_51'), id=51, color=[255, 255, 255]), + 52: dict(link=('Face_53', 'Face_55'), id=52, color=[255, 255, 255]), + 53: dict(link=('Face_56', 'Face_72'), id=53, color=[255, 255, 255]), + 54: dict(link=('Face_57', 'Face_71'), id=54, color=[255, 255, 255]), + 55: dict(link=('Face_58', 'Face_70'), id=55, color=[255, 255, 255]), + 56: dict(link=('Face_59', 'Face_69'), id=56, color=[255, 255, 255]), + 57: dict(link=('Face_60', 'Face_68'), id=57, color=[255, 255, 255]), + 58: dict(link=('Face_61', 'Face_67'), id=58, color=[255, 255, 255]), + 59: dict(link=('Face_62', 'Face_66'), id=59, color=[255, 255, 255]), + 60: dict(link=('Face_63', 'Face_65'), id=60, color=[255, 255, 255]), + }) From 4dbdbe17581bc750ea57140ccd964cf32e313e50 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 25 Aug 2023 16:13:20 +0800 Subject: [PATCH 05/21] --other=update dataset --- .../datasets/body3d/ubody3d_dataset.py | 334 ++++++------------ .../dataset_converters/ubody_smplx_to_coco.py | 53 +-- 2 files changed, 133 insertions(+), 254 deletions(-) diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index a7aca146ca..dca81ffc26 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -1,18 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp -from collections import defaultdict -from typing import Callable, List, Optional, Sequence, Tuple, Union +from typing import Optional import numpy as np from mmengine.fileio import exists, get_local_path -from mmengine.utils import is_abs +from xtcocotools.coco import COCO -from mmpose.datasets.datasets import BaseMocapDataset +from mmpose.datasets.datasets import BaseCocoStyleDataset from mmpose.registry import DATASETS @DATASETS.register_module() -class UBody3dDataset(BaseMocapDataset): +class UBody3dDataset(BaseCocoStyleDataset): """Ubody3d dataset for 3D human pose estimation. "One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer", @@ -29,37 +28,11 @@ class UBody3dDataset(BaseMocapDataset): Args: ann_file (str): Annotation file path. Default: ''. - seq_len (int): Number of frames in a sequence. Default: 1. - seq_step (int): The interval for extracting frames from the video. - Default: 1. - multiple_target (int): If larger than 0, merge every - ``multiple_target`` sequence together. Default: 0. - multiple_target_step (int): The interval for merging sequence. Only - valid when ``multiple_target`` is larger than 0. Default: 0. - pad_video_seq (bool): Whether to pad the video so that poses will be - predicted for every frame in the video. Default: ``False``. - causal (bool): If set to ``True``, the rightmost input frame will be - the target frame. Otherwise, the middle input frame will be the - target frame. Default: ``True``. - subset_frac (float): The fraction to reduce dataset size. If set to 1, - the dataset size is not reduced. Default: 1. - keypoint_2d_src (str): Specifies 2D keypoint information options, which - should be one of the following options: - - - ``'gt'``: load from the annotation file - - ``'detection'``: load from a detection - result file of 2D keypoint - - 'pipeline': the information will be generated by the pipeline - - Default: ``'gt'``. - keypoint_2d_det_file (str, optional): The 2D keypoint detection file. - If set, 2d keypoint loaded from this file will be used instead of - ground-truth keypoints. 
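
As a sanity check on the metainfo defined above, the symmetric `swap` entries can be turned into flip indices with a small helper. `build_flip_indices` is a hypothetical illustration, not part of the MMPose API; `keypoint_info` is the dict from `configs/_base_/datasets/ubody3d.py`.

```python
def build_flip_indices(keypoint_info):
    """Map each keypoint id to the id of its horizontal-flip counterpart.

    Keypoints with an empty ``swap`` field map to themselves. Illustrative
    helper only, not the MMPose parsing code.
    """
    name_to_id = {info['name']: kid for kid, info in keypoint_info.items()}
    flip_indices = []
    for kid in sorted(keypoint_info):
        swap_name = keypoint_info[kid]['swap']
        flip_indices.append(name_to_id[swap_name] if swap_name else kid)
    return flip_indices

# e.g. build_flip_indices(keypoint_info)[1] == 2   (L_Hip <-> R_Hip)
```
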
This setting is only when - ``keypoint_2d_src`` is ``'detection'``. Default: ``None``. - factor_file (str, optional): The projection factors' file. If set, - factor loaded from this file will be used instead of calculated - factors. Default: ``None``. - camera_param_file (str): Cameras' parameters file. Default: ``None``. + bbox_file (str, optional): Detection result file path. If + ``bbox_file`` is set, detected bboxes loaded from this file will + be used instead of ground-truth bboxes. This setting is only for + evaluation, i.e., ignored when ``test_mode`` is ``False``. + Default: ``None``. data_mode (str): Specifies the mode of data samples: ``'topdown'`` or ``'bottomup'``. In ``'topdown'`` mode, each data sample contains one instance; while in ``'bottomup'`` mode, each data sample @@ -89,214 +62,117 @@ class UBody3dDataset(BaseMocapDataset): max_refetch (int, optional): If ``Basedataset.prepare_data`` get a None img. The maximum extra number of cycles to get a valid image. Default: 1000. + sample_interval (int, optional): The sample interval of the dataset. + Default: 1. """ METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py') - SUPPORTED_keypoint_2d_src = {'gt', 'detection', 'pipeline'} - - def __init__(self, - ann_file: str = '', - seq_len: int = 1, - seq_step: int = 1, - multiple_target: int = 0, - multiple_target_step: int = 0, - pad_video_seq: bool = False, - causal: bool = True, - subset_frac: float = 1.0, - keypoint_2d_src: str = 'gt', - keypoint_2d_det_file: Optional[str] = None, - factor_file: Optional[str] = None, - camera_param_file: Optional[str] = None, - data_mode: str = 'topdown', - metainfo: Optional[dict] = None, - data_root: Optional[str] = None, - data_prefix: dict = dict(img=''), - filter_cfg: Optional[dict] = None, - indices: Optional[Union[int, Sequence[int]]] = None, - serialize_data: bool = True, - pipeline: List[Union[dict, Callable]] = [], - test_mode: bool = False, - lazy_init: bool = False, - max_refetch: int = 1000): - # check keypoint_2d_src - self.keypoint_2d_src = keypoint_2d_src - if self.keypoint_2d_src not in self.SUPPORTED_keypoint_2d_src: - raise ValueError( - f'Unsupported `keypoint_2d_src` "{self.keypoint_2d_src}". ' - f'Supported options are {self.SUPPORTED_keypoint_2d_src}') - - if keypoint_2d_det_file: - if not is_abs(keypoint_2d_det_file): - self.keypoint_2d_det_file = osp.join(data_root, - keypoint_2d_det_file) - else: - self.keypoint_2d_det_file = keypoint_2d_det_file - self.seq_step = seq_step - self.pad_video_seq = pad_video_seq + def _load_annotations(self): + """Load data from annotations in COCO format.""" + + assert exists(self.ann_file), ( + f'Annotation file `{self.ann_file}`does not exist') + + with get_local_path(self.ann_file) as local_path: + self.coco = COCO(local_path) + # set the metainfo about categories, which is a list of dict + # and each dict contains the 'id', 'name', etc. 
about this category + self._metainfo['CLASSES'] = self.coco.loadCats(self.coco.getCatIds()) + + instance_list = [] + image_list = [] + + for img_id in self.coco.getImgIds(): + if img_id % self.sample_interval != 0: + continue + img = self.coco_loadImgs(img_id)[0] + img.update({ + 'img_id': + img_id, + 'img_path': + osp.join(self.data_prefix['img'], img['file_name']), + }) + image_list.append(img) + + ann_ids = self.coco.getAnnIds(imgIds=img_id) + for ann in self.coco.loadAnns(ann_ids): + if instance_info := self.parse_data_info( + dict(raw_ann_info=ann, raw_img_info=img)): + instance_list.append(instance_info) - if factor_file: - if not is_abs(factor_file): - factor_file = osp.join(data_root, factor_file) - assert exists(factor_file), (f'`factor_file`: {factor_file}' - 'does not exist.') - self.factor_file = factor_file + return instance_list, image_list - if multiple_target > 0 and multiple_target_step == 0: - multiple_target_step = multiple_target - self.multiple_target_step = multiple_target_step + def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: + """Parse raw COCO annotation of an instance. - super().__init__( - ann_file=ann_file, - seq_len=seq_len, - multiple_target=multiple_target, - causal=causal, - subset_frac=subset_frac, - camera_param_file=camera_param_file, - data_mode=data_mode, - metainfo=metainfo, - data_root=data_root, - data_prefix=data_prefix, - filter_cfg=filter_cfg, - indices=indices, - serialize_data=serialize_data, - pipeline=pipeline, - test_mode=test_mode, - lazy_init=lazy_init, - max_refetch=max_refetch) + Args: + raw_data_info (dict): Raw data information loaded from + ``ann_file``. It should have following contents: - def get_sequence_indices(self) -> List[List[int]]: - """Split original videos into sequences and build frame indices. + - ``'raw_ann_info'``: Raw annotation of an instance + - ``'raw_img_info'``: Raw information of the image that + contains the instance - This method overrides the default one in the base class. + Returns: + dict | None: Parsed instance annotation """ - imgnames = self.ann_data['imgname'] - video_frames = defaultdict(list) - for idx, imgname in enumerate(imgnames): - subj, action, camera = self._parse_h36m_imgname(imgname) - video_frames[(subj, action, camera)].append(idx) - - # build sample indices - sequence_indices = [] - _len = (self.seq_len - 1) * self.seq_step + 1 - _step = self.seq_step - - if self.multiple_target: - for _, _indices in sorted(video_frames.items()): - n_frame = len(_indices) - seqs_from_video = [ - _indices[i:(i + self.multiple_target):_step] - for i in range(0, n_frame, self.multiple_target_step) - ][:(n_frame + self.multiple_target_step - - self.multiple_target) // self.multiple_target_step] - sequence_indices.extend(seqs_from_video) - - else: - for _, _indices in sorted(video_frames.items()): - n_frame = len(_indices) - if self.pad_video_seq: - # Pad the sequence so that every frame in the sequence will - # be predicted. 
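
A minimal standalone sketch (not part of the patch) of the ``pad_video_seq`` branch below: with a non-causal window, ``seq_len=3`` and ``seq_step=1``, every frame becomes the centre of a padded sequence, with the first/last frame repeated at the borders.

def padded_windows(indices, seq_len=3, step=1, causal=False):
    # mirrors the padding logic of get_sequence_indices()
    n = len(indices)
    left = seq_len - 1 if causal else (seq_len - 1) // 2
    right = 0 if causal else left
    windows = []
    for i in range(n):
        pad_l = max(0, left - i // step)
        pad_r = max(0, right - (n - 1 - i) // step)
        start = max(i % step, i - left * step)
        end = min(n - (n - 1 - i) % step, i + right * step + 1)
        windows.append([indices[0]] * pad_l
                       + indices[start:end:step]
                       + [indices[-1]] * pad_r)
    return windows

# padded_windows([0, 1, 2, 3, 4])
# -> [[0, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 4]]
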
- if self.causal: - frames_left = self.seq_len - 1 - frames_right = 0 - else: - frames_left = (self.seq_len - 1) // 2 - frames_right = frames_left - for i in range(n_frame): - pad_left = max(0, frames_left - i // _step) - pad_right = max( - 0, frames_right - (n_frame - 1 - i) // _step) - start = max(i % _step, i - frames_left * _step) - end = min(n_frame - (n_frame - 1 - i) % _step, - i + frames_right * _step + 1) - sequence_indices.append([_indices[0]] * pad_left + - _indices[start:end:_step] + - [_indices[-1]] * pad_right) - else: - seqs_from_video = [ - _indices[i:(i + _len):_step] - for i in range(0, n_frame - _len + 1) - ] - sequence_indices.extend(seqs_from_video) + ann = raw_data_info['raw_ann_info'] + if 'bbox' not in ann or 'keypoints3d' not in ann: + return None - # reduce dataset size if needed - subset_size = int(len(sequence_indices) * self.subset_frac) - start = np.random.randint(0, len(sequence_indices) - subset_size + 1) - end = start + subset_size + img = raw_data_info['raw_img_info'] + img_w, img_h = img['width'], img['height'] - sequence_indices = sequence_indices[start:end] + # get bbox in shape [1, 4], formatted as xywh + x, y, w, h = ann['bbox'] + x1 = np.clip(x, 0, img_w - 1) + y1 = np.clip(y, 0, img_h - 1) + x2 = np.clip(x + w, 0, img_w - 1) + y2 = np.clip(y + h, 0, img_h - 1) - return sequence_indices + bbox = np.array([x1, y1, x2, y2], dtype=np.float32).reshape(1, 4) - def _load_annotations(self) -> Tuple[List[dict], List[dict]]: - instance_list, image_list = super()._load_annotations() + # keypoints in shape [1, K, 2] and keypoints_visible in [1, K] + _keypoints = np.array( + ann['keypoints'], dtype=np.float32).reshape(1, -1, 3) + keypoints = _keypoints[..., :2] + keypoints_visible = np.minimum(1, _keypoints[..., 2]) - h36m_data = self.ann_data - kpts_3d = h36m_data['S'] + _keypoints_3d = np.array( + ann['keypoints3d'], dtype=np.float32).reshape(1, -1, 4) + keypoints_3d = _keypoints_3d[..., :3] + keypoints_3d_visible = keypoints_visible - if self.keypoint_2d_src == 'detection': - assert exists(self.keypoint_2d_det_file), ( - f'`keypoint_2d_det_file`: `{self.keypoint_2d_det_file}`' - 'does not exist.') - kpts_2d = self._load_keypoint_2d_detection( - self.keypoint_2d_det_file) - assert kpts_2d.shape[0] == kpts_3d.shape[0], ( - f'Number of `kpts_2d` ({kpts_2d.shape[0]}) does not match ' - f'number of `kpts_3d` ({kpts_3d.shape[0]}).') - - assert kpts_2d.shape[2] == 3, ( - f'Expect `kpts_2d.shape[2]` == 3, but got ' - f'{kpts_2d.shape[2]}. 
Please check the format of ' - f'{self.keypoint_2d_det_file}') - - for idx, frame_ids in enumerate(self.sequence_indices): - kpt_2d = kpts_2d[frame_ids].astype(np.float32) - keypoints = kpt_2d[..., :2] - keypoints_visible = kpt_2d[..., 2] - instance_list[idx].update({ - 'keypoints': - keypoints, - 'keypoints_visible': - keypoints_visible - }) - if self.factor_file: - with get_local_path(self.factor_file) as local_path: - factors = np.load(local_path).astype(np.float32) + if 'num_keypoints' in ann: + num_keypoints = ann['num_keypoints'] else: - factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) - assert factors.shape[0] == kpts_3d.shape[0], ( - f'Number of `factors` ({factors.shape[0]}) does not match ' - f'number of `kpts_3d` ({kpts_3d.shape[0]}).') - - for idx, frame_ids in enumerate(self.sequence_indices): - factor = factors[frame_ids].astype(np.float32) - instance_list[idx].update({'factor': factor}) - - return instance_list, image_list - - @staticmethod - def _parse_h36m_imgname(imgname) -> Tuple[str, str, str]: - """Parse imgname to get information of subject, action and camera. - - A typical h36m image filename is like: - S1_Directions_1.54138969_000001.jpg - """ - subj, rest = osp.basename(imgname).split('_', 1) - action, rest = rest.split('.', 1) - camera, rest = rest.split('_', 1) - return subj, action, camera - - def get_camera_param(self, imgname) -> dict: - """Get camera parameters of a frame by its image name.""" - assert hasattr(self, 'camera_param') - subj, _, camera = self._parse_h36m_imgname(imgname) - return self.camera_param[(subj, camera)] - - def _load_keypoint_2d_detection(self, det_file): - """"Load 2D joint detection results from file.""" - with get_local_path(det_file) as local_path: - kpts_2d = np.load(local_path).astype(np.float32) - - return kpts_2d + num_keypoints = np.count_nonzero(keypoints.max(axis=2)) + + scale = ann.get('scale', 0.0) + center = ann.get('center', np.array([0.0, 0.0])) + + instance_info = { + 'num_keypoints': num_keypoints, + 'keypoints': keypoints, + 'keypoints_visible': keypoints_visible, + 'keypoints_3d': keypoints_3d, + 'keypoints_3d_visible': keypoints_3d_visible, + 'bbox': bbox, + 'bbox_score': np.ones(1, dtype=np.float32), + 'scale': scale, + 'center': center, + 'id': ann['id'], + 'category_id': 1, + 'iscrowd': ann.get('iscrowd', 0), + 'segmentation': ann.get('segmentation', None), + 'img_path': img['img_path'], + 'img_id': ann['image_id'], + 'lifting_target': keypoints_3d[[-1]], + 'lifting_target_visible': keypoints_3d_visible[[-1]], + 'target_img_path': img['img_path'], + } + if 'crowdIndex' in img: + instance_info['crowd_index'] = img['crowdIndex'] + return instance_info diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index 8bb796377f..ac8b55683f 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -37,6 +37,8 @@ def __init__(self, human_model_path): use_pca=False, use_face_contour=True, **self.layer_args) + if torch.cuda.is_available(): + self.neutral_model = self.neutral_model.to('cuda:0') self.vertex_num = 10475 self.face = self.neutral_model.faces @@ -254,10 +256,10 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, dtype=np.float32) coord_valid = np.ones((human_model.joint_num), dtype=np.float32) - root_pose, body_pose, shape, trans = (human_model_param['root_pose'], - human_model_param['body_pose'], - human_model_param['shape'], - human_model_param['trans']) + 
root_pose = human_model_param['root_pose'] + body_pose = human_model_param['body_pose'] + shape = human_model_param['shape'] + trans = human_model_param['trans'] if 'lhand_pose' in human_model_param and human_model_param.get( 'lhand_valid', False): @@ -291,15 +293,18 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, coord_valid[human_model.orig_joint_part['face']] = 0 # init human model inputs - root_pose = torch.FloatTensor(root_pose).view(1, 3) - body_pose = torch.FloatTensor(body_pose).view(-1, 3) - lhand_pose = torch.FloatTensor(lhand_pose).view(-1, 3) - rhand_pose = torch.FloatTensor(rhand_pose).view(-1, 3) - jaw_pose = torch.FloatTensor(jaw_pose).view(-1, 3) - shape = torch.FloatTensor(shape).view(1, -1) - expr = torch.FloatTensor(expr).view(1, -1) - trans = torch.FloatTensor(trans).view(1, -1) - zero_pose = torch.zeros((1, 3), dtype=torch.float32) + device = torch.device( + 'cuda') if torch.cuda.is_available() else torch.device('cpu') + root_pose = torch.FloatTensor(root_pose, device=device).view(1, 3) + body_pose = torch.FloatTensor(body_pose, device=device).view(-1, 3) + lhand_pose = torch.FloatTensor(lhand_pose, device=device).view(-1, 3) + rhand_pose = torch.FloatTensor(rhand_pose, device=device).view(-1, 3) + jaw_pose = torch.FloatTensor(jaw_pose, device=device).view(-1, 3) + shape = torch.FloatTensor(shape, device=device).view(1, -1) + expr = torch.FloatTensor(expr, device=device).view(1, -1) + trans = torch.FloatTensor(trans, device=device).view(1, -1) + zero_pose = torch.zeros((1, 3), dtype=torch.float32, device=device) + with torch.no_grad(): output = human_model.neutral_model( betas=shape, @@ -346,23 +351,21 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, 2] = ((joint_cam[human_model.joint_part['face'], 2].copy() / (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) - keypoints_2d = joint_img[:, :2].copy() - ann_3d = { - **ann, - 'keypoints_3d': joint_cam.tolist(), - } - ann_3d['keypoints'] = keypoints_2d.tolist() - + keypoints_2d = np.concatenate([joint_img[:, :2].copy(), coord_valid], + axis=1) + keypoints_3d = np.concatenate([joint_img, coord_valid], axis=1) + ann['keypoints'] = keypoints_2d.tolist() + ann['keypoints_3d'] = keypoints_3d.tolist() img['file_name'] = os.path.join(scene, file_name) if video_name in splits: - val_annos.append(ann_3d) + val_annos.append(ann) val_imgs.append(img) else: - train_annos.append(ann_3d) + train_annos.append(ann) train_imgs.append(img) progress_bar.update() - categoreis = [{ + categories = [{ 'supercategory': 'person', 'id': 1, 'name': 'person', @@ -372,12 +375,12 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, train_data = { 'images': train_imgs, 'annotations': train_annos, - 'categories': categoreis + 'categories': categories } val_data = { 'images': val_imgs, 'annotations': val_annos, - 'categories': categoreis + 'categories': categories } mmengine.dump( From c04c835119f28d4947fc90607fe26fd3f54a90d2 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 25 Aug 2023 16:21:53 +0800 Subject: [PATCH 06/21] --other=update config --- ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 72 ++----------------- 1 file changed, 5 insertions(+), 67 deletions(-) diff --git a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py index 132038ba93..0b418c6607 100644 --- a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py 
+++ b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py @@ -1,4 +1,4 @@ -_base_ = ['../../../_base_/default_runtime.py'] +_base_ = ['../../_base_/default_runtime.py'] vis_backends = [ dict(type='LocalVisBackend'), @@ -28,76 +28,15 @@ max_keep_ckpts=1)) # codec settings -# 3D keypoint normalization parameters -# From file: '{data_root}/annotation_body3d/fps50/joint3d_rel_stats.pkl' -target_mean = [[-2.55652589e-04, -7.11960570e-03, -9.81433052e-04], - [-5.65463051e-03, 3.19636009e-01, 7.19329269e-02], - [-1.01705840e-02, 6.91147892e-01, 1.55352986e-01], - [2.55651315e-04, 7.11954606e-03, 9.81423866e-04], - [-5.09729780e-03, 3.27040413e-01, 7.22258095e-02], - [-9.99656606e-03, 7.08277383e-01, 1.58016408e-01], - [2.90583676e-03, -2.11363307e-01, -4.74210915e-02], - [5.67537804e-03, -4.35088906e-01, -9.76974016e-02], - [5.93884964e-03, -4.91891970e-01, -1.10666618e-01], - [7.37352083e-03, -5.83948619e-01, -1.31171400e-01], - [5.41920653e-03, -3.83931702e-01, -8.68145417e-02], - [2.95964662e-03, -1.87567488e-01, -4.34536934e-02], - [1.26585822e-03, -1.20170579e-01, -2.82526049e-02], - [4.67186639e-03, -3.83644089e-01, -8.55125784e-02], - [1.67648571e-03, -1.97007177e-01, -4.31368364e-02], - [8.70569015e-04, -1.68664569e-01, -3.73902498e-02]], -target_std = [[0.11072244, 0.02238818, 0.07246294], - [0.15856311, 0.18933832, 0.20880479], - [0.19179935, 0.24320062, 0.24756193], - [0.11072181, 0.02238805, 0.07246253], - [0.15880454, 0.19977188, 0.2147063], - [0.18001944, 0.25052739, 0.24853247], - [0.05210694, 0.05211406, 0.06908241], - [0.09515367, 0.10133032, 0.12899733], - [0.11742458, 0.12648469, 0.16465091], - [0.12360297, 0.13085539, 0.16433336], - [0.14602232, 0.09707956, 0.13952731], - [0.24347532, 0.12982249, 0.20230181], - [0.2446877, 0.21501816, 0.23938235], - [0.13876084, 0.1008926, 0.1424411], - [0.23687529, 0.14491219, 0.20980829], - [0.24400695, 0.23975028, 0.25520584]] -# 2D keypoint normalization parameters -# From file: '{data_root}/annotation_body3d/fps50/joint2d_stats.pkl' -keypoints_mean = [[532.08351635, 419.74137558], [531.80953144, 418.2607141], - [530.68456967, 493.54259285], [529.36968722, 575.96448516], - [532.29767646, 421.28483336], [531.93946631, 494.72186795], - [529.71984447, 578.96110365], [532.93699382, 370.65225054], - [534.1101856, 317.90342311], [534.55416813, 304.24143901], - [534.86955004, 282.31030885], [534.11308566, 330.11296796], - [533.53637525, 376.2742511], [533.49380107, 391.72324565], - [533.52579142, 330.09494668], [532.50804964, 374.190479], - [532.72786934, 380.61615716]], -keypoints_std = [[107.73640054, 63.35908715], [119.00836213, 64.1215443], - [119.12412107, 50.53806215], [120.61688045, 56.38444891], - [101.95735275, 62.89636486], [106.24832897, 48.41178119], - [108.46734966, 54.58177071], [109.07369806, 68.70443672], - [111.20130351, 74.87287863], [111.63203838, 77.80542514], - [113.22330788, 79.90670556], [105.7145833, 73.27049436], - [107.05804267, 73.93175781], [107.97449418, 83.30391802], - [121.60675105, 74.25691526], [134.34378973, 77.48125087], - [131.79990652, 89.86721124]] codec = dict( - type='ImagePoseLifting', - num_keypoints=17, - root_index=0, - remove_root=True, - target_mean=target_mean, - target_std=target_std, - keypoints_mean=keypoints_mean, - keypoints_std=keypoints_std) + type='ImagePoseLifting', num_keypoints=137, root_index=0, remove_root=True) # model settings model = dict( type='PoseLifter', backbone=dict( type='TCN', - in_channels=2 * 17, + in_channels=2 * 137, stem_channels=1024, 
num_blocks=2, kernel_sizes=(1, 1, 1), @@ -106,7 +45,7 @@ head=dict( type='TemporalRegressionHead', in_channels=1024, - num_joints=16, + num_joints=137, loss=dict(type='MSELoss'), decoder=codec, )) @@ -152,8 +91,7 @@ dict( type='PackPoseInputs', meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', - 'target_root', 'target_root_index', 'target_mean', - 'target_std')) + 'target_root', 'target_root_index')) ] val_pipeline = train_pipeline From 5796f9c323e932028b390616bec75cac21b442cb Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 25 Aug 2023 16:38:57 +0800 Subject: [PATCH 07/21] --fix=fix script --- .../dataset_converters/ubody_smplx_to_coco.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index ac8b55683f..2598b9a469 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -214,11 +214,12 @@ def cam2pixel(cam_coord, f, c): def process_scene_anno(scene: str, annotation_root: str, splits: np.array, - human_model: SMPLX): + human_model_path: str): annos = read_annotation_file( osp.join(annotation_root, scene, 'smplx_annotation.json')) keypoint_annos = COCO( osp.join(annotation_root, scene, 'keypoint_annotation.json')) + human_model = SMPLX(human_model_path) train_annos = [] val_annos = [] @@ -294,15 +295,15 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, # init human model inputs device = torch.device( - 'cuda') if torch.cuda.is_available() else torch.device('cpu') - root_pose = torch.FloatTensor(root_pose, device=device).view(1, 3) - body_pose = torch.FloatTensor(body_pose, device=device).view(-1, 3) - lhand_pose = torch.FloatTensor(lhand_pose, device=device).view(-1, 3) - rhand_pose = torch.FloatTensor(rhand_pose, device=device).view(-1, 3) - jaw_pose = torch.FloatTensor(jaw_pose, device=device).view(-1, 3) - shape = torch.FloatTensor(shape, device=device).view(1, -1) - expr = torch.FloatTensor(expr, device=device).view(1, -1) - trans = torch.FloatTensor(trans, device=device).view(1, -1) + 'cuda:0') if torch.cuda.is_available() else torch.device('cpu') + root_pose = torch.FloatTensor(root_pose).to(device).view(1, 3) + body_pose = torch.FloatTensor(body_pose).to(device).view(-1, 3) + lhand_pose = torch.FloatTensor(lhand_pose).to(device).view(-1, 3) + rhand_pose = torch.FloatTensor(rhand_pose).to(device).view(-1, 3) + jaw_pose = torch.FloatTensor(jaw_pose).to(device).view(-1, 3) + shape = torch.FloatTensor(shape).to(device).view(1, -1) + expr = torch.FloatTensor(expr).to(device).view(1, -1) + trans = torch.FloatTensor(trans).to(device).view(1, -1) zero_pose = torch.zeros((1, 3), dtype=torch.float32, device=device) with torch.no_grad(): @@ -318,7 +319,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, reye_pose=zero_pose, expression=expr) - joint_cam = output.joints[0].numpy()[human_model.joint_idx, :] + joint_cam = output.joints[0].cpu().numpy()[human_model.joint_idx, :] joint_img = cam2pixel(joint_cam, cam_param['focal'], cam_param['princpt']) @@ -351,6 +352,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, 2] = ((joint_cam[human_model.joint_part['face'], 2].copy() / (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) + coord_valid = coord_valid.reshape((-1, 1)) keypoints_2d = np.concatenate([joint_img[:, :2].copy(), coord_valid], axis=1) keypoints_3d = np.concatenate([joint_img, 
coord_valid], axis=1) @@ -403,7 +405,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, annotation_path = f'{args.data_root}/annotations' folders = os.listdir(annotation_path) - human_model = SMPLX(args.human_model_path) + human_model_path = args.human_model_path splits = np.load(split_path) if args.nproc > 1: @@ -412,11 +414,11 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, process_scene_anno, annotation_root=annotation_path, splits=splits, - human_model=human_model), folders, args.nproc) + human_model_path=human_model_path), folders, args.nproc) else: mmengine.track_progress( partial( process_scene_anno, annotation_root=annotation_path, splits=splits, - human_model=human_model), folders) + human_model_path=human_model_path), folders) From 273ef8446079bca9659c5a56d0c0996d788ccde0 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 25 Aug 2023 17:09:34 +0800 Subject: [PATCH 08/21] --other=fix --- tools/dataset_converters/ubody_smplx_to_coco.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index 2598b9a469..99bbfa3fda 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -244,6 +244,9 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, if not os.path.exists(img_path): progress_bar.update() continue + if aid not in annos: + progress_bar.update() + continue smplx_param = annos[str(aid)] human_model_param = smplx_param['smplx_param'] From d78287c934e66bcbe19046cec3221052bc973d40 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 29 Aug 2023 15:22:56 +0800 Subject: [PATCH 09/21] --feat=add simple mpjpe metric --- configs/_base_/datasets/ubody3d.py | 4 +- ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 6 +- .../datasets/body3d/ubody3d_dataset.py | 6 +- mmpose/evaluation/metrics/__init__.py | 3 +- .../metrics/simple_keypoint_3d_metrics.py | 119 ++++++++++++++++++ 5 files changed, 130 insertions(+), 8 deletions(-) create mode 100644 mmpose/evaluation/metrics/simple_keypoint_3d_metrics.py diff --git a/configs/_base_/datasets/ubody3d.py b/configs/_base_/datasets/ubody3d.py index a971ef4614..e2dfe0c570 100644 --- a/configs/_base_/datasets/ubody3d.py +++ b/configs/_base_/datasets/ubody3d.py @@ -937,4 +937,6 @@ 58: dict(link=('Face_61', 'Face_67'), id=58, color=[255, 255, 255]), 59: dict(link=('Face_62', 'Face_66'), id=59, color=[255, 255, 255]), 60: dict(link=('Face_63', 'Face_65'), id=60, color=[255, 255, 255]), - }) + }, + joint_weights=[1.] 
* 137, + sigmas=[]) diff --git a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py index 0b418c6607..751905b79e 100644 --- a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py +++ b/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py @@ -45,7 +45,7 @@ head=dict( type='TemporalRegressionHead', in_channels=1024, - num_joints=137, + num_joints=136, loss=dict(type='MSELoss'), decoder=codec, )) @@ -124,7 +124,7 @@ # evaluators val_evaluator = [ - dict(type='MPJPE', mode='mpjpe'), - dict(type='MPJPE', mode='p-mpjpe') + dict(type='SimpleMPJPE', mode='mpjpe'), + dict(type='SimpleMPJPE', mode='p-mpjpe') ] test_evaluator = val_evaluator diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index dca81ffc26..468553abf5 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -86,7 +86,7 @@ def _load_annotations(self): for img_id in self.coco.getImgIds(): if img_id % self.sample_interval != 0: continue - img = self.coco_loadImgs(img_id)[0] + img = self.coco.loadImgs(img_id)[0] img.update({ 'img_id': img_id, @@ -119,7 +119,7 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: """ ann = raw_data_info['raw_ann_info'] - if 'bbox' not in ann or 'keypoints3d' not in ann: + if 'bbox' not in ann or 'keypoints_3d' not in ann: return None img = raw_data_info['raw_img_info'] @@ -141,7 +141,7 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: keypoints_visible = np.minimum(1, _keypoints[..., 2]) _keypoints_3d = np.array( - ann['keypoints3d'], dtype=np.float32).reshape(1, -1, 4) + ann['keypoints_3d'], dtype=np.float32).reshape(1, -1, 4) keypoints_3d = _keypoints_3d[..., :3] keypoints_3d_visible = keypoints_visible diff --git a/mmpose/evaluation/metrics/__init__.py b/mmpose/evaluation/metrics/__init__.py index ac7e21b5cc..2c2843cebe 100644 --- a/mmpose/evaluation/metrics/__init__.py +++ b/mmpose/evaluation/metrics/__init__.py @@ -6,9 +6,10 @@ from .keypoint_3d_metrics import MPJPE from .keypoint_partition_metric import KeypointPartitionMetric from .posetrack18_metric import PoseTrack18Metric +from .simple_keypoint_3d_metrics import SimpleMPJPE __all__ = [ 'CocoMetric', 'PCKAccuracy', 'MpiiPCKAccuracy', 'JhmdbPCKAccuracy', 'AUC', 'EPE', 'NME', 'PoseTrack18Metric', 'CocoWholeBodyMetric', - 'KeypointPartitionMetric', 'MPJPE' + 'KeypointPartitionMetric', 'MPJPE', 'SimpleMPJPE' ] diff --git a/mmpose/evaluation/metrics/simple_keypoint_3d_metrics.py b/mmpose/evaluation/metrics/simple_keypoint_3d_metrics.py new file mode 100644 index 0000000000..dc0065d5b9 --- /dev/null +++ b/mmpose/evaluation/metrics/simple_keypoint_3d_metrics.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger + +from mmpose.registry import METRICS +from ..functional import keypoint_mpjpe + + +@METRICS.register_module() +class SimpleMPJPE(BaseMetric): + """MPJPE evaluation metric. + + Calculate the mean per-joint position error (MPJPE) of keypoints. + + Note: + - length of dataset: N + - num_keypoints: K + - number of keypoint dimensions: D (typically D = 2) + + Args: + mode (str): Method to align the prediction with the + ground truth. 
Supported options are: + + - ``'mpjpe'``: no alignment will be applied + - ``'p-mpjpe'``: align in the least-square sense in scale + - ``'n-mpjpe'``: align in the least-square sense in + scale, rotation, and translation. + + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be ``'cpu'`` or + ``'gpu'``. Default: ``'cpu'``. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, ``self.default_prefix`` + will be used instead. Default: ``None``. + skip_list (list, optional): The list of subject and action combinations + to be skipped. Default: []. + """ + + ALIGNMENT = {'mpjpe': 'none', 'p-mpjpe': 'procrustes', 'n-mpjpe': 'scale'} + + def __init__(self, + mode: str = 'mpjpe', + collect_device: str = 'cpu', + prefix: Optional[str] = None, + skip_list: List[str] = []) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + allowed_modes = self.ALIGNMENT.keys() + if mode not in allowed_modes: + raise KeyError("`mode` should be 'mpjpe', 'p-mpjpe', or " + f"'n-mpjpe', but got '{mode}'.") + + self.mode = mode + self.skip_list = skip_list + + def process(self, data_batch: Sequence[dict], + data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (Sequence[dict]): A batch of data + from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from + the model. + """ + for data_sample in data_samples: + # predicted keypoints coordinates, [T, K, D] + pred_coords = data_sample['pred_instances']['keypoints'] + if pred_coords.ndim == 4: + pred_coords = np.squeeze(pred_coords, axis=0) + # ground truth data_info + gt = data_sample['gt_instances'] + # ground truth keypoints coordinates, [T, K, D] + gt_coords = gt['lifting_target'] + # ground truth keypoints_visible, [T, K, 1] + mask = gt['lifting_target_visible'].astype(bool).reshape( + gt_coords.shape[0], -1) + + result = { + 'pred_coords': pred_coords, + 'gt_coords': gt_coords, + 'mask': mask, + } + + self.results.append(result) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are the corresponding results. 
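
For reference, a rough standalone equivalent of the unaligned ``'mpjpe'`` mode (the ``'p-mpjpe'`` and ``'n-mpjpe'`` modes additionally align the prediction to the ground truth before measuring); this is an illustrative sketch, not the ``keypoint_mpjpe`` implementation itself.

import numpy as np

def naive_mpjpe(pred, gt, mask):
    # pred, gt: [N, K, 3] keypoint coordinates; mask: [N, K] boolean visibility
    # mean Euclidean distance over the visible joints only
    return np.linalg.norm(pred - gt, axis=-1)[mask].mean()
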
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + # pred_coords: [N, K, D] + pred_coords = np.concatenate( + [result['pred_coords'] for result in results]) + # gt_coords: [N, K, D] + gt_coords = np.concatenate([result['gt_coords'] for result in results]) + # mask: [N, K] + mask = np.concatenate([result['mask'] for result in results]) + + error_name = self.mode.upper() + + logger.info(f'Evaluating {self.mode.upper()}...') + return { + error_name: + keypoint_mpjpe(pred_coords, gt_coords, mask, + self.ALIGNMENT[self.mode]) + } From 9f9b9937a4439717168e9b83f3dd5e6bff8611d1 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 5 Sep 2023 14:58:01 +0800 Subject: [PATCH 10/21] --other=convert to coco and h36m --- .../dataset_converters/ubody_smplx_to_coco.py | 66 +++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index 99bbfa3fda..a3af141c4d 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -355,12 +355,70 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, 2] = ((joint_cam[human_model.joint_part['face'], 2].copy() / (body_3d_size / 2) + 1) / 2.0 * output_hm_shape[0]) - coord_valid = coord_valid.reshape((-1, 1)) - keypoints_2d = np.concatenate([joint_img[:, :2].copy(), coord_valid], - axis=1) - keypoints_3d = np.concatenate([joint_img, coord_valid], axis=1) + keypoints_2d = joint_img[:, :2].copy() + keypoints_3d = joint_img.copy() + keypoints_valid = coord_valid.reshape((-1, 1)) + + # map to COCO keypoint order + coco_keypoint_names = [ + 'Nose', 'L_Eye', 'R_Eye', 'L_Ear', 'R_Ear', 'L_Shoulder', + 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Hip', + 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle', 'R_Ankle' + ] + coco_keypoint_idx = [ + human_model.joints_name.index(name) for name in coco_keypoint_names + ] + coco_3d_keypoints = keypoints_3d[coco_keypoint_idx, :] + coco_2d_keypoints = keypoints_2d[coco_keypoint_idx, :] + coco_keypoint_valid = keypoints_valid[coco_keypoint_idx, :] + + # map to H36M keypoint order + h36m_keypoint_names = [ + 'Pelvis', 'R_Hip', 'R_Knee', 'R_Ankle', 'L_Hip', 'L_Knee', + 'L_Ankle', 'Neck', 'L_Shoulder', 'L_Elbow', 'L_Wrist', + 'R_Shoulder', 'R_Elbow', 'R_Wrist' + ] + # interplate 'spine' and 'neck_base' + pelvis_id = human_model.joints_name.index('Pelvis') + neck_id = human_model.joints_name.index('Neck') + nose_id = human_model.joints_name.index('Nose') + spine_3d = (keypoints_3d[pelvis_id, :] + keypoints_3d[neck_id, :]) / 2 + neck_base_3d = (keypoints_3d[neck_id, :] + + keypoints_3d[nose_id, :]) / 2 + spine_valid = keypoints_valid[pelvis_id] * keypoints_valid[neck_id] + neck_base_valid = keypoints_valid[neck_id] * keypoints_valid[nose_id] + + h36m_keypoint_idx = [ + human_model.joints_name.index(name) for name in h36m_keypoint_names + ] + h36m_3d_keypoints = keypoints_3d[h36m_keypoint_idx, :] + h36m_2d_keypoints = keypoints_2d[h36m_keypoint_idx, :] + h36m_3d_keypoints = np.concatenate([ + h36m_3d_keypoints[:7, :], spine_3d, h36m_3d_keypoints[7, :], + neck_base_3d, h36m_3d_keypoints[8:] + ], + axis=0) + h36m_2d_keypoints = np.concatenate([ + h36m_2d_keypoints[:7, :], spine_3d, h36m_2d_keypoints[7, :], + neck_base_3d, h36m_3d_keypoints[8:] + ], + axis=0) + h36m_keypoint_valid = keypoints_valid[h36m_keypoint_idx] + h36m_keypoint_valid = np.concatenate([ + h36m_keypoint_valid[:7], spine_valid, h36m_keypoint_valid[7], + 
neck_base_valid, h36m_keypoint_valid[8:] + ], + axis=0) + ann['keypoints'] = keypoints_2d.tolist() ann['keypoints_3d'] = keypoints_3d.tolist() + ann['keypoints_valid'] = keypoints_valid.tolist() + ann['coco_keypoints'] = coco_2d_keypoints.tolist() + ann['coco_keypoints_3d'] = coco_3d_keypoints.tolist() + ann['coco_keypoints_valid'] = coco_keypoint_valid.tolist() + ann['h36m_keypoints'] = h36m_2d_keypoints.tolist() + ann['h36m_keypoints_3d'] = h36m_3d_keypoints.tolist() + ann['h36m_keypoints_valid'] = h36m_keypoint_valid.tolist() img['file_name'] = os.path.join(scene, file_name) if video_name in splits: val_annos.append(ann) From 7f9a414aac79f2b62429bdc917f8dd18721c649d Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 11:29:00 +0800 Subject: [PATCH 11/21] --other=refactor dataset --- ...ose-lift_motionbert-ft_8xb32-120e_ubody.py | 192 ++++++++++++++++++ ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 0 configs/wholebody_3d_keypoint/README.md | 0 .../datasets/base/base_mocap_dataset.py | 16 +- .../datasets/body3d/ubody3d_dataset.py | 190 ++++++++++++++--- .../dataset_converters/ubody_smplx_to_coco.py | 59 +----- 6 files changed, 368 insertions(+), 89 deletions(-) create mode 100644 configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py rename configs/{wholebody_3d_keypoint => body_3d_keypoint/pose_lift}/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py (100%) delete mode 100644 configs/wholebody_3d_keypoint/README.md diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py new file mode 100644 index 0000000000..900922b87c --- /dev/null +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py @@ -0,0 +1,192 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=120, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=120, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/' + 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'), +) + +# base dataset settings +dataset_type = 'UBody3dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +# mapping +ubody_h36m = [ + (0, 0), + (2, 1), + (4, 2), + (6, 3), + (1, 4), + (3, 5), + 
(5, 6), + ((0, 7), 7), + (7, 8), + ((7, 24), 9), + (24, 10), + (8, 11), + (10, 12), + (12, 13), + (9, 14), + (11, 15), + (13, 16), +] + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [] +val_datasets = [] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file=f'annotations/{scene}/train_3dkeypoint_annotation.json', + seq_len=1, + multiple_target=120, + multiple_target_step=60, + data_prefix=dict(img='images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m) + ], + ) + if scene in ['Speech', 'Movie']: + continue + val_dataset = dict( + type=dataset_type, + ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', + seq_len=1, + seq_step=1, + multiple_target=243, + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m) + ], + test_mode=True, + ) + train_datasets.append(train_dataset) + val_datasets.append(val_dataset) + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + datasets=train_datasets, + metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), + pipeline=train_pipeline, + test_mode=False)) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), + datasets=val_datasets, + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='SimpleMPJPE', mode='mpjpe'), + dict(type='SimpleMPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py similarity index 100% rename from configs/wholebody_3d_keypoint/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py rename to configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py diff --git a/configs/wholebody_3d_keypoint/README.md b/configs/wholebody_3d_keypoint/README.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mmpose/datasets/datasets/base/base_mocap_dataset.py b/mmpose/datasets/datasets/base/base_mocap_dataset.py index 290edafed0..b06d934ac5 100644 --- a/mmpose/datasets/datasets/base/base_mocap_dataset.py +++ 
b/mmpose/datasets/datasets/base/base_mocap_dataset.py @@ -96,8 +96,7 @@ def __init__(self, assert exists(_ann_file), ( f'Annotation file `{_ann_file}` does not exist.') - with get_local_path(_ann_file) as local_path: - self.ann_data = np.load(local_path) + self._load_ann_file(_ann_file) self.camera_param_file = camera_param_file if self.camera_param_file: @@ -137,6 +136,19 @@ def __init__(self, lazy_init=lazy_init, max_refetch=max_refetch) + def _load_ann_file(self, ann_file: str) -> dict: + """Load annotation file to get image information. + + Args: + ann_file (str): Annotation file path. + + Returns: + dict: Annotation information. + """ + + with get_local_path(ann_file) as local_path: + self.ann_data = np.load(local_path) + @classmethod def _load_metainfo(cls, metainfo: dict = None) -> dict: """Collect meta information from the dictionary of meta. diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index 468553abf5..236bc380ad 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -1,17 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp -from typing import Optional +from collections import defaultdict +from typing import List, Optional, Tuple import numpy as np -from mmengine.fileio import exists, get_local_path +from mmengine.fileio import get_local_path from xtcocotools.coco import COCO -from mmpose.datasets.datasets import BaseCocoStyleDataset +from mmpose.datasets.datasets import BaseMocapDataset from mmpose.registry import DATASETS @DATASETS.register_module() -class UBody3dDataset(BaseCocoStyleDataset): +class UBody3dDataset(BaseMocapDataset): """Ubody3d dataset for 3D human pose estimation. "One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer", @@ -28,11 +29,15 @@ class UBody3dDataset(BaseCocoStyleDataset): Args: ann_file (str): Annotation file path. Default: ''. - bbox_file (str, optional): Detection result file path. If - ``bbox_file`` is set, detected bboxes loaded from this file will - be used instead of ground-truth bboxes. This setting is only for - evaluation, i.e., ignored when ``test_mode`` is ``False``. - Default: ``None``. + seq_len (int): Number of frames in a sequence. Default: 1. + multiple_target (int): If larger than 0, merge every + ``multiple_target`` sequence together. Default: 0. + causal (bool): If set to ``True``, the rightmost input frame will be + the target frame. Otherwise, the middle input frame will be the + target frame. Default: ``True``. + subset_frac (float): The fraction to reduce dataset size. If set to 1, + the dataset size is not reduced. Default: 1. + camera_param_file (str): Cameras' parameters file. Default: ``None``. data_mode (str): Specifies the mode of data samples: ``'topdown'`` or ``'bottomup'``. In ``'topdown'`` mode, each data sample contains one instance; while in ``'bottomup'`` mode, each data sample @@ -62,31 +67,162 @@ class UBody3dDataset(BaseCocoStyleDataset): max_refetch (int, optional): If ``Basedataset.prepare_data`` get a None img. The maximum extra number of cycles to get a valid image. Default: 1000. - sample_interval (int, optional): The sample interval of the dataset. - Default: 1. 
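
A hedged usage sketch of the refactored dataset, mirroring the MotionBERT config added in this series; the scene name and annotation path are illustrative, not guaranteed to exist.

from mmpose.datasets.datasets.body3d.ubody3d_dataset import UBody3dDataset

# illustrative paths; multiple_target_step defaults to multiple_target when 0
dataset = UBody3dDataset(
    data_root='data/UBody/',
    ann_file='annotations/Magic_show/train_3dkeypoint_annotation.json',
    data_prefix=dict(img='images/'),
    seq_len=1,
    multiple_target=120,
    multiple_target_step=60,
    pipeline=[])
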
""" + def __init__(self, + multiple_target: int = 0, + multiple_target_step: int = 0, + seq_step: int = 1, + pad_video_seq: bool = False, + **kwargs): + self.seq_step = seq_step + self.pad_video_seq = pad_video_seq + + if multiple_target > 0 and multiple_target_step == 0: + multiple_target_step = multiple_target + self.multiple_target_step = multiple_target_step + + super().__init__(multiple_target=multiple_target, **kwargs) + METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py') - def _load_annotations(self): - """Load data from annotations in COCO format.""" + def _load_ann_file(self, ann_file: str) -> dict: + """Load annotation file.""" + with get_local_path(ann_file) as local_path: + self.ann_data = COCO(local_path) + + def get_sequence_indices(self) -> List[List[int]]: + video_frames = defaultdict(list) + img_ids = self.ann_data.getImgIds() + for img_id in img_ids: + img_info = self.ann_data.loadImgs(img_id)[0] + subj, _, _ = self._parse_image_name(img_info['file_name']) + video_frames[subj].append(img_id) + + sequence_indices = [] + _len = (self.seq_len - 1) * self.seq_step + 1 + _step = self.seq_step - assert exists(self.ann_file), ( - f'Annotation file `{self.ann_file}`does not exist') + if self.multiple_target: + for _, _img_ids in sorted(video_frames.items()): + n_frame = len(_img_ids) + _ann_ids = self.ann_data.getAnnIds(imgIds=_img_ids) + seqs_from_video = [ + _ann_ids[i:(i + self.multiple_target):_step] + for i in range(0, n_frame, self.multiple_target_step) + ][:(n_frame + self.multiple_target_step - + self.multiple_target) // self.multiple_target_step] + sequence_indices.extend(seqs_from_video) + else: + for _, _img_ids in sorted(video_frames.items()): + n_frame = len(_img_ids) + _ann_ids = self.ann_data.getAnnIds(imgIds=_img_ids) + if self.pad_video_seq: + # Pad the sequence so that every frame in the sequence will + # be predicted. + if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max( + 0, frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sequence_indices.append([_ann_ids[0]] * pad_left + + _ann_ids[start:end:_step] + + [_ann_ids[-1]] * pad_right) + else: + seqs_from_video = [ + _ann_ids[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1, _step) + ] + sequence_indices.extend(seqs_from_video) + + # reduce dataset size if needed + subset_size = int(len(sequence_indices) * self.subset_frac) + start = np.random.randint(0, len(sequence_indices) - subset_size + 1) + end = start + subset_size + + sequence_indices = sequence_indices[start:end] + + return sequence_indices - with get_local_path(self.ann_file) as local_path: - self.coco = COCO(local_path) - # set the metainfo about categories, which is a list of dict - # and each dict contains the 'id', 'name', etc. about this category - self._metainfo['CLASSES'] = self.coco.loadCats(self.coco.getCatIds()) + def _parse_image_name(self, image_path: str) -> Tuple[str, int]: + """Parse image name to get video name and frame index. + + Args: + image_name (str): Image name. + + Returns: + tuple[str, int]: Video name and frame index. 
+ """ + trim, file_name = image_path.split('/')[2:] + frame_id, suffix = file_name.split('.') + return trim, frame_id, suffix + + def _load_annotations(self): + """Load data from annotations in COCO format.""" + num_keypoints = self.metainfo['num_keypoints'] + self._metainfo['CLASSES'] = self.ann_data.loadCats( + self.ann_data.getCatIds()) instance_list = [] image_list = [] - for img_id in self.coco.getImgIds(): - if img_id % self.sample_interval != 0: - continue - img = self.coco.loadImgs(img_id)[0] + for i, _ann_ids in enumerate(self.sequence_indices): + expected_num_frames = self.seq_len + if self.multiple_target: + expected_num_frames = self.multiple_target + + assert len(_ann_ids) == (expected_num_frames), ( + f'Expected `frame_ids` == {expected_num_frames}, but ' + f'got {len(_ann_ids)} ') + + anns = self.ann_data.loadAnns(_ann_ids) + imgs = self.ann_data.loadImgs([ann['image_id'] for ann in anns]) + + _kpts = np.array([ann['keypoints'] for ann in anns], + dtype=np.float32) + _kpts_3d = np.array([ann['keypoints_3d'] for ann in anns], + dtype=np.float32) + _keypoints_visible = np.array( + [ann['keypoints_valid'] for ann in anns], dtype=np.float32) + _camera_params = np.array([ann['camera_param'] for ann in anns]) + _scales = np.zeros(len(imgs), dtype=np.float32) + _centers = np.zeros((len(imgs), 2), dtype=np.float32) + _img_paths = np.array([img['file_name'] for img in imgs]) + + target_idx = [-1] if self.causal else [int(self.seq_len // 2)] + if self.multiple_target: + target_idx = list(range(self.multiple_target)) + + instance_info = { + 'num_keypoints': num_keypoints, + 'keypoints': _kpts, + 'keypoints_3d': _kpts_3d, + 'keypoints_visible': _keypoints_visible, + 'scale': _scales, + 'center': _centers, + 'id': i, + 'category_id': 1, + 'iscrowd': 0, + 'img_paths': list(_img_paths), + 'img_ids': [img['id'] for img in imgs], + 'lifting_target': _kpts_3d[target_idx], + 'lifting_target_visible': _keypoints_visible[target_idx], + 'target_img_paths': _img_paths[target_idx], + 'camera_param': _camera_params, + } + + instance_list.append(instance_info) + + for img_id in self.ann_data.getImgIds(): + img = self.ann_data.loadImgs(img_id)[0] img.update({ 'img_id': img_id, @@ -95,12 +231,6 @@ def _load_annotations(self): }) image_list.append(img) - ann_ids = self.coco.getAnnIds(imgIds=img_id) - for ann in self.coco.loadAnns(ann_ids): - if instance_info := self.parse_data_info( - dict(raw_ann_info=ann, raw_img_info=img)): - instance_list.append(instance_info) - return instance_list, image_list def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index a3af141c4d..5b96ee3983 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -359,66 +359,10 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, keypoints_3d = joint_img.copy() keypoints_valid = coord_valid.reshape((-1, 1)) - # map to COCO keypoint order - coco_keypoint_names = [ - 'Nose', 'L_Eye', 'R_Eye', 'L_Ear', 'R_Ear', 'L_Shoulder', - 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Hip', - 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle', 'R_Ankle' - ] - coco_keypoint_idx = [ - human_model.joints_name.index(name) for name in coco_keypoint_names - ] - coco_3d_keypoints = keypoints_3d[coco_keypoint_idx, :] - coco_2d_keypoints = keypoints_2d[coco_keypoint_idx, :] - coco_keypoint_valid = keypoints_valid[coco_keypoint_idx, :] - - # map 
to H36M keypoint order - h36m_keypoint_names = [ - 'Pelvis', 'R_Hip', 'R_Knee', 'R_Ankle', 'L_Hip', 'L_Knee', - 'L_Ankle', 'Neck', 'L_Shoulder', 'L_Elbow', 'L_Wrist', - 'R_Shoulder', 'R_Elbow', 'R_Wrist' - ] - # interplate 'spine' and 'neck_base' - pelvis_id = human_model.joints_name.index('Pelvis') - neck_id = human_model.joints_name.index('Neck') - nose_id = human_model.joints_name.index('Nose') - spine_3d = (keypoints_3d[pelvis_id, :] + keypoints_3d[neck_id, :]) / 2 - neck_base_3d = (keypoints_3d[neck_id, :] + - keypoints_3d[nose_id, :]) / 2 - spine_valid = keypoints_valid[pelvis_id] * keypoints_valid[neck_id] - neck_base_valid = keypoints_valid[neck_id] * keypoints_valid[nose_id] - - h36m_keypoint_idx = [ - human_model.joints_name.index(name) for name in h36m_keypoint_names - ] - h36m_3d_keypoints = keypoints_3d[h36m_keypoint_idx, :] - h36m_2d_keypoints = keypoints_2d[h36m_keypoint_idx, :] - h36m_3d_keypoints = np.concatenate([ - h36m_3d_keypoints[:7, :], spine_3d, h36m_3d_keypoints[7, :], - neck_base_3d, h36m_3d_keypoints[8:] - ], - axis=0) - h36m_2d_keypoints = np.concatenate([ - h36m_2d_keypoints[:7, :], spine_3d, h36m_2d_keypoints[7, :], - neck_base_3d, h36m_3d_keypoints[8:] - ], - axis=0) - h36m_keypoint_valid = keypoints_valid[h36m_keypoint_idx] - h36m_keypoint_valid = np.concatenate([ - h36m_keypoint_valid[:7], spine_valid, h36m_keypoint_valid[7], - neck_base_valid, h36m_keypoint_valid[8:] - ], - axis=0) - ann['keypoints'] = keypoints_2d.tolist() ann['keypoints_3d'] = keypoints_3d.tolist() ann['keypoints_valid'] = keypoints_valid.tolist() - ann['coco_keypoints'] = coco_2d_keypoints.tolist() - ann['coco_keypoints_3d'] = coco_3d_keypoints.tolist() - ann['coco_keypoints_valid'] = coco_keypoint_valid.tolist() - ann['h36m_keypoints'] = h36m_2d_keypoints.tolist() - ann['h36m_keypoints_3d'] = h36m_3d_keypoints.tolist() - ann['h36m_keypoints_valid'] = h36m_keypoint_valid.tolist() + ann['camera_param'] = cam_param img['file_name'] = os.path.join(scene, file_name) if video_name in splits: val_annos.append(ann) @@ -466,6 +410,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, annotation_path = f'{args.data_root}/annotations' folders = os.listdir(annotation_path) + folders = [f for f in folders if osp.isdir(osp.join(annotation_path, f))] human_model_path = args.human_model_path splits = np.load(split_path) From fcca1545c92cf845072b1e1fe5494641425159eb Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 11:30:51 +0800 Subject: [PATCH 12/21] minor change --- tools/dataset_converters/ubody_smplx_to_coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/dataset_converters/ubody_smplx_to_coco.py b/tools/dataset_converters/ubody_smplx_to_coco.py index 5b96ee3983..16f827fce1 100644 --- a/tools/dataset_converters/ubody_smplx_to_coco.py +++ b/tools/dataset_converters/ubody_smplx_to_coco.py @@ -244,7 +244,7 @@ def process_scene_anno(scene: str, annotation_root: str, splits: np.array, if not os.path.exists(img_path): progress_bar.update() continue - if aid not in annos: + if str(aid) not in annos: progress_bar.update() continue From 1edad2c4b36d2e247357a6ff5e5ac93c25397d62 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 15:39:03 +0800 Subject: [PATCH 13/21] --other=fix transform --- ...ose-lift_motionbert-ft_8xb32-120e_ubody.py | 20 ++- .../datasets/body3d/ubody3d_dataset.py | 134 +++++------------- mmpose/datasets/transforms/converting.py | 26 ++++ 3 files changed, 71 insertions(+), 109 deletions(-) diff --git 
a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py index 900922b87c..fbf27004ba 100644 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py @@ -91,9 +91,11 @@ ] scenes = [ - 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', - 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', - 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' + 'Magic_show', + 'Entertainment', + # 'ConductMusic', 'Online_class', 'TalkShow', + # 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + # 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' ] train_datasets = [] @@ -108,10 +110,7 @@ multiple_target=120, multiple_target_step=60, data_prefix=dict(img='images/'), - pipeline=[ - dict( - type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m) - ], + pipeline=[], ) if scene in ['Speech', 'Movie']: continue @@ -123,10 +122,7 @@ multiple_target=243, data_root=data_root, data_prefix=dict(img='images/'), - pipeline=[ - dict( - type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m) - ], + pipeline=[], test_mode=True, ) train_datasets.append(train_dataset) @@ -134,6 +130,7 @@ # pipelines train_pipeline = [ + dict(type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m), dict(type='GenerateTarget', encoder=train_codec), dict( type='RandomFlipAroundRoot', @@ -146,6 +143,7 @@ 'factor', 'camera_param')) ] val_pipeline = [ + dict(type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m), dict(type='GenerateTarget', encoder=val_codec), dict( type='PackPoseInputs', diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index 236bc380ad..903f639c6e 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp from collections import defaultdict -from typing import List, Optional, Tuple +from typing import List, Tuple import numpy as np from mmengine.fileio import get_local_path @@ -161,7 +161,7 @@ def _parse_image_name(self, image_path: str) -> Tuple[str, int]: Returns: tuple[str, int]: Video name and frame index. 
""" - trim, file_name = image_path.split('/')[2:] + trim, file_name = image_path.split('/')[-2:] frame_id, suffix = file_name.split('.') return trim, frame_id, suffix @@ -184,39 +184,51 @@ def _load_annotations(self): f'got {len(_ann_ids)} ') anns = self.ann_data.loadAnns(_ann_ids) - imgs = self.ann_data.loadImgs([ann['image_id'] for ann in anns]) - - _kpts = np.array([ann['keypoints'] for ann in anns], - dtype=np.float32) - _kpts_3d = np.array([ann['keypoints_3d'] for ann in anns], - dtype=np.float32) - _keypoints_visible = np.array( - [ann['keypoints_valid'] for ann in anns], dtype=np.float32) - _camera_params = np.array([ann['camera_param'] for ann in anns]) - _scales = np.zeros(len(imgs), dtype=np.float32) - _centers = np.zeros((len(imgs), 2), dtype=np.float32) - _img_paths = np.array([img['file_name'] for img in imgs]) + img_ids = [] + kpts = np.zeros((len(anns), num_keypoints, 2), dtype=np.float32) + kpts_3d = np.zeros((len(anns), num_keypoints, 3), dtype=np.float32) + keypoints_visible = np.zeros((len(anns), num_keypoints, 1), + dtype=np.float32) + for j, ann in enumerate(anns): + img_ids.append(ann['image_id']) + kpts[j] = np.array(ann['keypoints'], dtype=np.float32) + kpts_3d[j] = np.array(ann['keypoints_3d'], dtype=np.float32) + keypoints_visible[j] = np.array( + ann['keypoints_valid'], dtype=np.float32) + imgs = self.ann_data.loadImgs(img_ids) + keypoints_visible = keypoints_visible.squeeze(-1) + + scales = np.zeros(len(imgs), dtype=np.float32) + centers = np.zeros((len(imgs), 2), dtype=np.float32) + img_paths = np.array([img['file_name'] for img in imgs]) + factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) target_idx = [-1] if self.causal else [int(self.seq_len // 2)] if self.multiple_target: target_idx = list(range(self.multiple_target)) + cam_param = anns[-1]['camera_param'] + if 'w' not in cam_param or 'h' not in cam_param: + cam_param['w'] = 1000 + cam_param['h'] = 1000 + instance_info = { 'num_keypoints': num_keypoints, - 'keypoints': _kpts, - 'keypoints_3d': _kpts_3d, - 'keypoints_visible': _keypoints_visible, - 'scale': _scales, - 'center': _centers, + 'keypoints': kpts, + 'keypoints_3d': kpts_3d, + 'keypoints_visible': keypoints_visible, + 'scale': scales, + 'center': centers, 'id': i, 'category_id': 1, 'iscrowd': 0, - 'img_paths': list(_img_paths), + 'img_paths': list(img_paths), 'img_ids': [img['id'] for img in imgs], - 'lifting_target': _kpts_3d[target_idx], - 'lifting_target_visible': _keypoints_visible[target_idx], - 'target_img_paths': _img_paths[target_idx], - 'camera_param': _camera_params, + 'lifting_target': kpts_3d[target_idx], + 'lifting_target_visible': keypoints_visible[target_idx], + 'target_img_paths': img_paths[target_idx], + 'camera_param': cam_param, + 'factor': factors } instance_list.append(instance_info) @@ -232,77 +244,3 @@ def _load_annotations(self): image_list.append(img) return instance_list, image_list - - def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: - """Parse raw COCO annotation of an instance. - - Args: - raw_data_info (dict): Raw data information loaded from - ``ann_file``. 
It should have following contents: - - - ``'raw_ann_info'``: Raw annotation of an instance - - ``'raw_img_info'``: Raw information of the image that - contains the instance - - Returns: - dict | None: Parsed instance annotation - """ - - ann = raw_data_info['raw_ann_info'] - if 'bbox' not in ann or 'keypoints_3d' not in ann: - return None - - img = raw_data_info['raw_img_info'] - img_w, img_h = img['width'], img['height'] - - # get bbox in shape [1, 4], formatted as xywh - x, y, w, h = ann['bbox'] - x1 = np.clip(x, 0, img_w - 1) - y1 = np.clip(y, 0, img_h - 1) - x2 = np.clip(x + w, 0, img_w - 1) - y2 = np.clip(y + h, 0, img_h - 1) - - bbox = np.array([x1, y1, x2, y2], dtype=np.float32).reshape(1, 4) - - # keypoints in shape [1, K, 2] and keypoints_visible in [1, K] - _keypoints = np.array( - ann['keypoints'], dtype=np.float32).reshape(1, -1, 3) - keypoints = _keypoints[..., :2] - keypoints_visible = np.minimum(1, _keypoints[..., 2]) - - _keypoints_3d = np.array( - ann['keypoints_3d'], dtype=np.float32).reshape(1, -1, 4) - keypoints_3d = _keypoints_3d[..., :3] - keypoints_3d_visible = keypoints_visible - - if 'num_keypoints' in ann: - num_keypoints = ann['num_keypoints'] - else: - num_keypoints = np.count_nonzero(keypoints.max(axis=2)) - - scale = ann.get('scale', 0.0) - center = ann.get('center', np.array([0.0, 0.0])) - - instance_info = { - 'num_keypoints': num_keypoints, - 'keypoints': keypoints, - 'keypoints_visible': keypoints_visible, - 'keypoints_3d': keypoints_3d, - 'keypoints_3d_visible': keypoints_3d_visible, - 'bbox': bbox, - 'bbox_score': np.ones(1, dtype=np.float32), - 'scale': scale, - 'center': center, - 'id': ann['id'], - 'category_id': 1, - 'iscrowd': ann.get('iscrowd', 0), - 'segmentation': ann.get('segmentation', None), - 'img_path': img['img_path'], - 'img_id': ann['image_id'], - 'lifting_target': keypoints_3d[[-1]], - 'lifting_target_visible': keypoints_3d_visible[[-1]], - 'target_img_path': img['img_path'], - } - if 'crowdIndex' in img: - instance_info['crowd_index'] = img['crowdIndex'] - return instance_info diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index c8204ac7ef..3d201b87e2 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -93,6 +93,9 @@ def transform(self, results: dict) -> dict: # Initialize output arrays keypoints = np.zeros((num_instances, self.num_keypoints, 2)) keypoints_visible = np.zeros((num_instances, self.num_keypoints)) + keypoints_3d = None + if 'keypoints_3d' in results: + keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3)) # Create a mask to weight visibility loss keypoints_visible_weights = keypoints_visible.copy() @@ -108,6 +111,11 @@ def transform(self, results: dict) -> dict: 'keypoints_visible'][:, self.source_index] * \ results['keypoints_visible'][:, self.source_index2] + if 'keypoints_3d' in results: + keypoints_3d[:, self.target_index] = 0.5 * ( + results['keypoints_3d'][:, self.source_index] + + results['keypoints_3d'][:, self.source_index2]) + # Otherwise just copy from the source index else: keypoints[:, @@ -115,11 +123,29 @@ def transform(self, results: dict) -> dict: source_index] keypoints_visible[:, self.target_index] = results[ 'keypoints_visible'][:, self.source_index] + if 'keypoints_3d' in results: + keypoints_3d[:, self.target_index] = results[ + 'keypoints_3d'][:, self.source_index] # Update the results dict results['keypoints'] = keypoints results['keypoints_visible'] = np.stack( [keypoints_visible, 
keypoints_visible_weights], axis=2) + if 'keypoints_3d' in results: + results['keypoints_3d'] = keypoints_3d + + # Updatae flip pairs + if 'flip_indices' in results: + flip_indices = [] + for i in range(len(self.target_index)): + x1, x2 = self.source_index[i], self.source_index2[i] + if x1 == x2: + flip_id = results['flip_indices'][x1] + flip_id = flip_id if flip_id < self.num_keypoints else i + flip_indices.append(flip_id) + else: + flip_indices.append(i) + results['flip_indices'] = flip_indices return results def transform_sigmas(self, sigmas: Union[List, np.ndarray]): From eba2fa79aef42b9c2b7be050e1b67347fe68c327 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 16:48:24 +0800 Subject: [PATCH 14/21] --fix=fix converting --- .../pose-lift_motionbert-ft_8xb32-120e_ubody.py | 8 +++----- mmpose/datasets/datasets/body3d/ubody3d_dataset.py | 3 ++- mmpose/datasets/transforms/converting.py | 12 +++++++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py index fbf27004ba..af5c9b85ab 100644 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py @@ -91,11 +91,9 @@ ] scenes = [ - 'Magic_show', - 'Entertainment', - # 'ConductMusic', 'Online_class', 'TalkShow', - # 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', - # 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' ] train_datasets = [] diff --git a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py index 903f639c6e..85b8d893e7 100644 --- a/mmpose/datasets/datasets/body3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/body3d/ubody3d_dataset.py @@ -228,7 +228,8 @@ def _load_annotations(self): 'lifting_target_visible': keypoints_visible[target_idx], 'target_img_paths': img_paths[target_idx], 'camera_param': cam_param, - 'factor': factors + 'factor': factors, + 'target_idx': target_idx, } instance_list.append(instance_info) diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index 3d201b87e2..457234c4ee 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -91,11 +91,14 @@ def transform(self, results: dict) -> dict: num_instances = results['keypoints'].shape[0] # Initialize output arrays - keypoints = np.zeros((num_instances, self.num_keypoints, 2)) - keypoints_visible = np.zeros((num_instances, self.num_keypoints)) + keypoints = np.zeros((num_instances, self.num_keypoints, 2), + dtype=np.float32) + keypoints_visible = np.zeros((num_instances, self.num_keypoints), + dtype=np.float32) keypoints_3d = None if 'keypoints_3d' in results: - keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3)) + keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3), + dtype=np.float32) # Create a mask to weight visibility loss keypoints_visible_weights = keypoints_visible.copy() @@ -133,6 +136,9 @@ def transform(self, results: dict) -> dict: [keypoints_visible, keypoints_visible_weights], axis=2) if 'keypoints_3d' in results: results['keypoints_3d'] = keypoints_3d + 
results['lifting_target'] = keypoints_3d[results['target_idx']] + results['lifting_target_visible'] = keypoints_visible[ + results['target_idx']] # Updatae flip pairs if 'flip_indices' in results: From 79108a607c3dd765023634cbd1bfbfaf33012d70 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 16:54:52 +0800 Subject: [PATCH 15/21] minor change --- .../pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py index af5c9b85ab..16546693c9 100644 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py +++ b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py @@ -117,7 +117,7 @@ ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', seq_len=1, seq_step=1, - multiple_target=243, + multiple_target=120, data_root=data_root, data_prefix=dict(img='images/'), pipeline=[], From 3121d862b5c3009179e98bc16a074f2db1f210bd Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 19:47:27 +0800 Subject: [PATCH 16/21] --other=rm simplebaseline config --- ...-lift_simplebaseline3d_8xb64-200e_ubody.py | 130 ------------------ 1 file changed, 130 deletions(-) delete mode 100644 configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py deleted file mode 100644 index 751905b79e..0000000000 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_simplebaseline3d_8xb64-200e_ubody.py +++ /dev/null @@ -1,130 +0,0 @@ -_base_ = ['../../_base_/default_runtime.py'] - -vis_backends = [ - dict(type='LocalVisBackend'), -] -visualizer = dict( - type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -# runtime -train_cfg = dict(max_epochs=200, val_interval=10) - -# optimizer -optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) - -# learning policy -param_scheduler = [ - dict(type='StepLR', step_size=100000, gamma=0.96, end=80, by_epoch=False) -] - -auto_scale_lr = dict(base_batch_size=512) - -# hooks -default_hooks = dict( - checkpoint=dict( - type='CheckpointHook', - save_best='MPJPE', - rule='less', - max_keep_ckpts=1)) - -# codec settings -codec = dict( - type='ImagePoseLifting', num_keypoints=137, root_index=0, remove_root=True) - -# model settings -model = dict( - type='PoseLifter', - backbone=dict( - type='TCN', - in_channels=2 * 137, - stem_channels=1024, - num_blocks=2, - kernel_sizes=(1, 1, 1), - dropout=0.5, - ), - head=dict( - type='TemporalRegressionHead', - in_channels=1024, - num_joints=136, - loss=dict(type='MSELoss'), - decoder=codec, - )) - -# base dataset settings -dataset_type = 'UBody3dDataset' -data_mode = 'topdown' -data_root = 'data/UBody/' - -scenes = [ - 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', - 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', - 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' -] - -train_datasets = [] -val_datasets = [] - -for scene in scenes: - train_dataset = dict( - type=dataset_type, - data_root=data_root, - data_mode=data_mode, - ann_file=f'annotations/{scene}/train_3dkeypoint_annotation.json', - seq_len=1, - causal=True, - keypoint_2d_src='gt', 
- data_prefix=dict(img='images/'), - pipeline=[]) - val_dataset = dict( - type=dataset_type, - data_root=data_root, - data_mode=data_mode, - ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', - data_prefix=dict(img='images/'), - pipeline=[]) - train_datasets.append(train_dataset) - val_datasets.append(val_dataset) - -# pipelines -train_pipeline = [ - dict(type='GenerateTarget', encoder=codec), - dict( - type='PackPoseInputs', - meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', - 'target_root', 'target_root_index')) -] -val_pipeline = train_pipeline - -# data loaders -train_dataloader = dict( - batch_size=64, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), - datasets=train_datasets, - pipeline=train_pipeline, - test_mode=False, - )) -val_dataloader = dict( - batch_size=64, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), - datasets=val_datasets, - pipeline=val_pipeline, - test_mode=True, - )) -test_dataloader = val_dataloader - -# evaluators -val_evaluator = [ - dict(type='SimpleMPJPE', mode='mpjpe'), - dict(type='SimpleMPJPE', mode='p-mpjpe') -] -test_evaluator = val_evaluator From 84ae4cd34ffe6e05e2559619da5dcf2a0151d8b8 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 18 Sep 2023 20:23:41 +0800 Subject: [PATCH 17/21] --test=add ut --- tests/data/ubody3d/ubody3d_train.json | 1 + .../test_body_datasets/test_ubody_dataset.py | 77 +++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/data/ubody3d/ubody3d_train.json create mode 100644 tests/test_datasets/test_datasets/test_body_datasets/test_ubody_dataset.py diff --git a/tests/data/ubody3d/ubody3d_train.json b/tests/data/ubody3d/ubody3d_train.json new file mode 100644 index 0000000000..55a4ac5226 --- /dev/null +++ b/tests/data/ubody3d/ubody3d_train.json @@ -0,0 +1 @@ +{"images": [{"id": 15, "height": 720, "width": 1280, "file_name": "Magic_show/Magic_show_S1_Trim1/Magic_show_S1_Trim1/000016.png"}], "annotations": [{"id": 0, "image_id": 15, "bbox": [74.55498504638672, 8.571063995361328, 1062.4967727661133, 701.8491630554199], "segmentation": [[]], "area": 0, "iscrowd": 0, "category_id": 1, "score": 1, "person_id": 0, "hand_box1": [336.4236145019531, 321.40362548828125, 473.6637268066406, 452.62567138671875], "hand_box2": [699.218994140625, 50.335018157958984, 533.58251953125, 621.6577186584473], "keypoints": [[585.656005859375, 1398.5216064453125], [699.9061889648438, 1586.966064453125], [450.14288330078125, 1596.144775390625], [878.3228149414062, 2171.27783203125], [252.16543579101562, 2132.398681640625], [793.895263671875, 2988.90771484375], [232.56475830078125, 2939.503173828125], [588.2872314453125, 570.474365234375], [862.1456298828125, 514.33837890625], [373.89849853515625, 519.60888671875], [1073.739990234375, 765.0070190429688], [89.8785400390625, 775.919921875], [1000.2418212890625, 635.8955688476562], [189.44015502929688, 567.993408203125], [891.81298828125, 2948.2041015625], [1013.4824829101562, 3015.250732421875], [819.24658203125, 3122.821533203125], [172.14041137695312, 2868.272705078125], [31.46063232421875, 2937.01025390625], [244.37692260742188, 3111.135009765625], [760.2764282226562, 235.35623168945312], [469.04644775390625, 237.359130859375], [672.689453125, 
216.68638610839844], [536.8645629882812, 215.08010864257812], [594.4747924804688, 302.86590576171875], [937.543212890625, 563.2012939453125], [877.2040405273438, 564.7064819335938], [826.8228759765625, 548.8115234375], [768.3922729492188, 532.2924194335938], [945.0330810546875, 433.25579833984375], [887.2977905273438, 411.39129638671875], [854.9716796875, 409.1885986328125], [812.5216064453125, 409.8503112792969], [993.1986083984375, 415.13519287109375], [983.431640625, 352.09503173828125], [976.8125610351562, 306.58990478515625], [967.6991577148438, 251.8966064453125], [1042.6788330078125, 439.2115783691406], [1061.695068359375, 382.62310791015625], [1078.3428955078125, 336.8554382324219], [1089.8707275390625, 288.113037109375], [1077.3145751953125, 467.8497009277344], [1113.5694580078125, 449.51904296875], [1147.91796875, 434.2681884765625], [1184.372314453125, 406.7205505371094], [262.0787048339844, 512.4108276367188], [314.8291320800781, 495.84429931640625], [355.2375183105469, 463.73870849609375], [400.5841064453125, 429.6348876953125], [290.11627197265625, 385.6371765136719], [334.016357421875, 356.7796325683594], [352.326904296875, 347.6751403808594], [379.92449951171875, 336.6559143066406], [248.99337768554688, 355.2509460449219], [270.441162109375, 294.56085205078125], [283.58990478515625, 247.07943725585938], [298.6072692871094, 191.95077514648438], [194.588623046875, 364.1822509765625], [197.89288330078125, 304.9277038574219], [198.94699096679688, 255.0223846435547], [207.83172607421875, 206.8009490966797], [152.69793701171875, 380.91925048828125], [126.07894897460938, 349.861083984375], [99.02603149414062, 320.67138671875], [75.35498046875, 280.7127380371094], [605.5189819335938, 258.36474609375], [636.6569213867188, 261.03448486328125], [672.689453125, 216.68638610839844], [536.8645629882812, 215.08010864257812], [480.609130859375, 193.2221221923828], [498.7352294921875, 169.0961151123047], [527.0252075195312, 168.48736572265625], [556.564453125, 174.32501220703125], [582.2213134765625, 183.7449188232422], [619.771728515625, 185.09783935546875], [646.1015625, 177.27572631835938], [678.3016357421875, 172.73214721679688], [709.5665283203125, 174.52818298339844], [730.6221313476562, 199.52928161621094], [600.2632446289062, 215.79234313964844], [598.0828247070312, 240.45635986328125], [596.2218627929688, 264.4862976074219], [594.4674072265625, 287.62481689453125], [572.7188110351562, 305.8975830078125], [583.9725341796875, 311.3199157714844], [596.401123046875, 315.5985107421875], [609.6165771484375, 311.5094909667969], [622.2186279296875, 306.6711120605469], [512.6423950195312, 211.75982666015625], [528.5633544921875, 204.07089233398438], [548.4610595703125, 205.9830780029297], [565.9568481445312, 217.66900634765625], [548.8089599609375, 222.94613647460938], [530.2134399414062, 222.75762939453125], [639.6070556640625, 219.82444763183594], [655.8860473632812, 209.6044158935547], [676.3201904296875, 208.3985595703125], [694.9487915039062, 217.1615753173828], [674.3418579101562, 226.85595703125], [655.4156494140625, 225.6745147705078], [551.7490234375, 353.2354736328125], [564.1500244140625, 346.4883728027344], [583.2034912109375, 344.99609375], [595.4065551757812, 347.21868896484375], [607.8397216796875, 345.721435546875], [629.6182250976562, 348.2886047363281], [648.6402587890625, 353.0809631347656], [634.0433349609375, 361.12738037109375], [612.543212890625, 365.1044921875], [598.9017333984375, 366.5699768066406], [585.4385375976562, 366.0231018066406], [566.12353515625, 
362.2437744140625], [553.4495239257812, 352.7164001464844], [583.9151000976562, 355.8670654296875], [596.3876342773438, 356.340576171875], [608.99560546875, 356.22100830078125], [648.081787109375, 352.85076904296875], [612.7412719726562, 351.5333251953125], [598.9871215820312, 351.8242492675781], [585.3312377929688, 352.4969482421875], [464.1539001464844, 202.29954528808594], [465.8164978027344, 244.8143768310547], [469.96026611328125, 282.73333740234375], [474.998779296875, 318.5062255859375], [485.900390625, 354.82257080078125], [503.9440002441406, 389.1557922363281], [533.9607543945312, 420.1808776855469], [569.1990356445312, 439.69488525390625], [604.7715454101562, 445.1242370605469], [641.609130859375, 438.5807189941406], [677.1731567382812, 419.1774597167969], [709.558349609375, 390.3476867675781], [728.9358520507812, 358.6229553222656], [743.6824951171875, 323.7010192871094], [752.355224609375, 286.009033203125], [756.031494140625, 248.0742645263672], [756.6275634765625, 206.8378448486328]], "foot_kpts": [1166.72314453125, 38.096336364746094, 0, 1002.4937744140625, 109.48077392578125, 0, 1049.140869140625, 663.1453857421875, 0, 317.3815002441406, 32.0361328125, 0, 402.523681640625, 303.2774963378906, 0, 177.21731567382812, 665.190673828125, 0], "face_kpts": [482.1813659667969, 206.51531982421875, 0, 474.4501037597656, 248.23251342773438, 1, 482.5657043457031, 282.5651550292969, 1, 490.3671569824219, 326.8166198730469, 1, 498.9546813964844, 355.2204895019531, 1, 519.25634765625, 390.5085754394531, 1, 543.9222412109375, 417.4048156738281, 1, 574.4150390625, 437.6228332519531, 1, 614.6944580078125, 442.5209045410156, 1, 648.99267578125, 436.2539978027344, 1, 682.6341552734375, 416.4512023925781, 1, 702.5023193359375, 392.0824279785156, 1, 725.9093017578125, 358.3260803222656, 1, 739.4346923828125, 328.9374084472656, 1, 746.7598876953125, 285.0207824707031, 1, 748.8603515625, 251.59585571289062, 1, 755.915771484375, 212.4534149169922, 0, 496.4743957519531, 188.47494506835938, 1, 514.8231201171875, 177.99856567382812, 1, 535.214111328125, 176.0469970703125, 1, 556.4619140625, 177.9375, 1, 576.8843994140625, 183.35317993164062, 1, 631.4595947265625, 183.65673828125, 1, 652.4815673828125, 180.27340698242188, 1, 676.221923828125, 180.07711791992188, 1, 698.4794921875, 184.41073608398438, 1, 718.5443115234375, 196.21084594726562, 1, 604.396484375, 218.71194458007812, 1, 602.6702880859375, 245.68115234375, 1, 600.9422607421875, 271.4402770996094, 1, 599.4947509765625, 297.5359802246094, 1, 571.33203125, 313.3100891113281, 1, 586.1724853515625, 317.1542663574219, 1, 601.4893798828125, 320.0868835449219, 1, 617.738525390625, 316.9916687011719, 1, 632.822509765625, 313.9440002441406, 1, 524.906005859375, 216.0177001953125, 1, 542.880859375, 206.15841674804688, 1, 563.9365234375, 208.03213500976562, 1, 578.5321044921875, 222.44454956054688, 1, 559.7491455078125, 226.11843872070312, 1, 541.22607421875, 225.11203002929688, 1, 636.491943359375, 223.62353515625, 1, 652.7271728515625, 210.68789672851562, 1, 674.761474609375, 209.86370849609375, 1, 692.972900390625, 221.53323364257812, 1, 674.9864501953125, 228.75543212890625, 1, 656.0750732421875, 229.04306030273438, 1, 560.0743408203125, 351.4398498535156, 1, 577.081787109375, 347.0306091308594, 1, 594.04638671875, 345.2702941894531, 1, 604.1793212890625, 346.1555480957031, 1, 614.151611328125, 344.8525695800781, 1, 634.447509765625, 345.7118225097656, 1, 656.1597900390625, 347.9260559082031, 1, 640.6773681640625, 358.7562561035156, 1, 
624.00732421875, 366.7438049316406, 1, 605.445556640625, 369.8896789550781, 1, 588.646484375, 367.5843811035156, 1, 573.5023193359375, 360.9281921386719, 1, 565.385498046875, 352.2278137207031, 1, 585.1085205078125, 353.1212463378906, 1, 604.616943359375, 355.0426330566406, 1, 626.8272705078125, 351.8833312988281, 1, 650.2919921875, 349.2644958496094, 1, 627.5924072265625, 353.0104675292969, 1, 604.7803955078125, 355.8074645996094, 1, 584.6986083984375, 354.2829284667969, 1], "lefthand_kpts": [942.7679443359375, 607.469482421875, 1, 888.291259765625, 539.277587890625, 1, 832.873291015625, 483.5708923339844, 1, 787.126953125, 436.6972351074219, 1, 710.735107421875, 413.7229309082031, 1, 888.9903564453125, 319.5710754394531, 1, 868.0140380859375, 280.7148742675781, 1, 830.3096923828125, 266.0387268066406, 1, 778.9337158203125, 271.2351379394531, 1, 962.7294921875, 272.7072448730469, 1, 955.781005859375, 187.65567016601562, 1, 953.9222412109375, 103.62838745117188, 1, 959.151611328125, 29.267608642578125, 1, 1047.009033203125, 294.3193664550781, 1, 1056.5989990234375, 215.84146118164062, 1, 1066.36865234375, 147.68014526367188, 1, 1081.0699462890625, 65.11972045898438, 1, 1107.0172119140625, 358.7002258300781, 1, 1159.4434814453125, 319.2156677246094, 1, 1206.9718017578125, 272.8797912597656, 1, 1261.1082763671875, 224.43637084960938, 1], "righthand_kpts": [233.142822265625, 582.3209228515625, 1, 300.6414794921875, 508.47479248046875, 1, 362.43896484375, 455.85186767578125, 1, 377.3603515625, 404.19744873046875, 1, 446.76416015625, 377.29241943359375, 1, 342.8802490234375, 310.6497802734375, 1, 368.6904296875, 284.673095703125, 1, 381.802734375, 251.73486328125, 1, 421.5467529296875, 225.363525390625, 1, 283.64288330078125, 254.122802734375, 1, 304.9996337890625, 170.8004150390625, 1, 320.6651611328125, 98.6851806640625, 1, 335.6553955078125, 28.2318115234375, 1, 199.05755615234375, 256.80859375, 1, 206.0360107421875, 177.01025390625, 1, 215.68804931640625, 106.7457275390625, 1, 224.53521728515625, 32.276611328125, 1, 128.827392578125, 294.99359130859375, 1, 99.0606689453125, 239.12982177734375, 1, 65.53125, 189.2431640625, 1, 37.63360595703125, 116.657958984375, 1], "center": [605.8033447265625, 359.4956359863281], "scale": [6.6406049728393555, 8.854140281677246], "keypoints_score": [0.9791078567504883, 0.9932481050491333, 1.0011144876480103, 0.973096489906311, 0.972457766532898, 0.866172194480896, 0.8760361671447754, 0.3526427149772644, 0.3903506398200989, 0.921836793422699, 0.9433825016021729, 0.20496317744255066, 0.2460474669933319, 0.20729553699493408, 0.17142903804779053, 0.18208564817905426, 0.22269707918167114], "face_kpts_score": [0.3680439293384552, 0.5355573892593384, 0.6418813467025757, 0.6644495725631714, 0.7590401768684387, 0.5538617372512817, 0.5907169580459595, 0.5878690481185913, 0.6348617076873779, 0.7361799478530884, 0.6556291580200195, 0.618322491645813, 0.6537319421768188, 0.5892513394355774, 0.7059171199798584, 0.645734429359436, 0.4574907422065735, 0.9639992713928223, 0.9263820648193359, 0.8876979351043701, 0.9284569621086121, 0.9739065170288086, 0.9502178430557251, 0.9174821376800537, 0.918608546257019, 0.9061530232429504, 0.862210750579834, 0.9776759147644043, 0.973875105381012, 0.974762499332428, 0.9565852880477905, 0.9716235399246216, 1.0059518814086914, 0.946382999420166, 0.9594531059265137, 0.9658107757568359, 1.0158061981201172, 0.9708306789398193, 0.9969902634620667, 0.9845597743988037, 0.9349627494812012, 0.9380444288253784, 0.9717998504638672, 
0.9871775507926941, 0.9774664640426636, 0.9537898898124695, 0.9465979933738708, 0.9661000967025757, 0.9713011980056763, 0.9717509746551514, 0.956028938293457, 1.000832438468933, 0.9808722734451294, 0.9960898160934448, 0.9364079236984253, 1.0011546611785889, 0.9167187213897705, 0.9541155099868774, 0.9244742393493652, 0.988551139831543, 0.9954862594604492, 0.9832127094268799, 0.978826642036438, 0.9751479625701904, 0.956895112991333, 0.9974040985107422, 0.9864891767501831, 0.9898920655250549], "foot_kpts_score": [0.24755269289016724, 0.1599443256855011, 0.25949808955192566, 0.2688680589199066, 0.14811083674430847, 0.23364056646823883], "lefthand_kpts_score": [0.603957986831665, 0.46176729202270506, 0.5001004695892334, 0.6286116600036621, 0.7983541250228882, 0.7467568874359131, 0.7094749569892883, 0.7889106035232544, 0.8908322811126709, 0.8638974189758301, 1.0441084861755372, 0.9282500505447387, 0.9102095127105713, 0.7738837957382202, 0.94963458776474, 0.8981462478637695, 0.9926700949668884, 0.7828058958053589, 0.9498528003692627, 0.9387582302093506, 0.8471795082092285], "righthand_kpts_score": [0.6722876787185669, 0.60037282705307, 0.5398626983165741, 0.7077780723571777, 0.7050052642822265, 0.6411999225616455, 0.725990629196167, 0.758279001712799, 0.8829087972640991, 0.889958119392395, 0.9569337129592895, 0.9145335912704468, 0.9213766813278198, 0.8925279140472412, 0.9955486416816711, 1.0033048152923585, 1.0014301896095277, 0.9033888339996338, 0.9002806305885315, 0.8902452945709228, 0.888652241230011], "face_box": [445.3220458984375, 145.05938720703125, 348.63178710937495, 332.0302734375], "face_valid": true, "leftfoot_valid": false, "rightfoot_valid": false, "lefthand_valid": true, "righthand_valid": true, "lefthand_box": [699.218994140625, 50.335018157958984, 533.58251953125, 621.6577186584473], "righthand_box": [81.47227172851564, -7.12115478515625, 398.4362548828125, 664.060546875], "lefthand_update": true, "righthand_update": true, "lefthand_kpts_vitposehand": [942.7679443359375, 607.469482421875, 1, 888.291259765625, 539.277587890625, 1, 832.873291015625, 483.5708923339844, 1, 787.126953125, 436.6972351074219, 1, 710.735107421875, 413.7229309082031, 1, 888.9903564453125, 319.5710754394531, 1, 868.0140380859375, 280.7148742675781, 1, 830.3096923828125, 266.0387268066406, 1, 778.9337158203125, 271.2351379394531, 1, 962.7294921875, 272.7072448730469, 1, 955.781005859375, 187.65567016601562, 1, 953.9222412109375, 103.62838745117188, 1, 959.151611328125, 29.267608642578125, 1, 1047.009033203125, 294.3193664550781, 1, 1056.5989990234375, 215.84146118164062, 1, 1066.36865234375, 147.68014526367188, 1, 1081.0699462890625, 65.11972045898438, 1, 1107.0172119140625, 358.7002258300781, 1, 1159.4434814453125, 319.2156677246094, 1, 1206.9718017578125, 272.8797912597656, 1, 1261.1082763671875, 224.43637084960938, 1], "righthand_kpts_vitposehand": [233.142822265625, 582.3209228515625, 1, 300.6414794921875, 508.47479248046875, 1, 362.43896484375, 455.85186767578125, 1, 377.3603515625, 404.19744873046875, 1, 446.76416015625, 377.29241943359375, 1, 342.8802490234375, 310.6497802734375, 1, 368.6904296875, 284.673095703125, 1, 381.802734375, 251.73486328125, 1, 421.5467529296875, 225.363525390625, 1, 283.64288330078125, 254.122802734375, 1, 304.9996337890625, 170.8004150390625, 1, 320.6651611328125, 98.6851806640625, 1, 335.6553955078125, 28.2318115234375, 1, 199.05755615234375, 256.80859375, 1, 206.0360107421875, 177.01025390625, 1, 215.68804931640625, 106.7457275390625, 1, 224.53521728515625, 
32.276611328125, 1, 128.827392578125, 294.99359130859375, 1, 99.0606689453125, 239.12982177734375, 1, 65.53125, 189.2431640625, 1, 37.63360595703125, 116.657958984375, 1], "num_keypoints": 9, "full_body": false, "valid_label": 2, "keypoints_3d": [[585.656005859375, 1398.5216064453125, 8.0], [699.9061889648438, 1586.966064453125, 7.7132415771484375], [450.14288330078125, 1596.144775390625, 7.6570892333984375], [878.3228149414062, 2171.27783203125, 5.664215087890625], [252.16543579101562, 2132.398681640625, 5.6501007080078125], [793.895263671875, 2988.90771484375, 4.6084747314453125], [232.56475830078125, 2939.503173828125, 4.28839111328125], [588.2872314453125, 570.474365234375, 9.544265747070312], [862.1456298828125, 514.33837890625, 8.8726806640625], [373.89849853515625, 519.60888671875, 9.171127319335938], [1073.739990234375, 765.0070190429688, 7.1384735107421875], [89.8785400390625, 775.919921875, 7.5379791259765625], [1000.2418212890625, 635.8955688476562, 5.19927978515625], [189.44015502929688, 567.993408203125, 5.757049560546875], [891.81298828125, 2948.2041015625, 3.0384368896484375], [1013.4824829101562, 3015.250732421875, 3.43035888671875], [819.24658203125, 3122.821533203125, 4.943603515625], [172.14041137695312, 2868.272705078125, 2.809112548828125], [31.46063232421875, 2937.01025390625, 3.1867828369140625], [244.37692260742188, 3111.135009765625, 4.5428619384765625], [760.2764282226562, 235.35623168945312, 9.170547485351562], [469.04644775390625, 237.359130859375, 9.270904541015625], [672.689453125, 216.68638610839844, 8.436477661132812], [536.8645629882812, 215.08010864257812, 8.477508544921875], [594.4747924804688, 302.86590576171875, 8.231826782226562], [937.543212890625, 563.2012939453125, 7.81884765625], [877.2040405273438, 564.7064819335938, 7.746490478515625], [826.8228759765625, 548.8115234375, 7.6898651123046875], [768.3922729492188, 532.2924194335938, 7.540069580078125], [945.0330810546875, 433.25579833984375, 7.78143310546875], [887.2977905273438, 411.39129638671875, 7.68023681640625], [854.9716796875, 409.1885986328125, 7.548248291015625], [812.5216064453125, 409.8503112792969, 7.41748046875], [993.1986083984375, 415.13519287109375, 7.762298583984375], [983.431640625, 352.09503173828125, 7.7212677001953125], [976.8125610351562, 306.58990478515625, 7.644317626953125], [967.6991577148438, 251.8966064453125, 7.58074951171875], [1042.6788330078125, 439.2115783691406, 7.7346954345703125], [1061.695068359375, 382.62310791015625, 7.7144622802734375], [1078.3428955078125, 336.8554382324219, 7.6671142578125], [1089.8707275390625, 288.113037109375, 7.64324951171875], [1077.3145751953125, 467.8497009277344, 7.6988525390625], [1113.5694580078125, 449.51904296875, 7.6714019775390625], [1147.91796875, 434.2681884765625, 7.6133880615234375], [1184.372314453125, 406.7205505371094, 7.566802978515625], [262.0787048339844, 512.4108276367188, 7.7939453125], [314.8291320800781, 495.84429931640625, 7.6787109375], [355.2375183105469, 463.73870849609375, 7.6097564697265625], [400.5841064453125, 429.6348876953125, 7.4446563720703125], [290.11627197265625, 385.6371765136719, 7.82208251953125], [334.016357421875, 356.7796325683594, 7.663116455078125], [352.326904296875, 347.6751403808594, 7.499725341796875], [379.92449951171875, 336.6559143066406, 7.330535888671875], [248.99337768554688, 355.2509460449219, 7.84161376953125], [270.441162109375, 294.56085205078125, 7.848602294921875], [283.58990478515625, 247.07943725585938, 7.8173370361328125], [298.6072692871094, 191.95077514648438, 
7.8151092529296875], [194.588623046875, 364.1822509765625, 7.8341217041015625], [197.89288330078125, 304.9277038574219, 7.8556976318359375], [198.94699096679688, 255.0223846435547, 7.8529815673828125], [207.83172607421875, 206.8009490966797, 7.8715667724609375], [152.69793701171875, 380.91925048828125, 7.8072052001953125], [126.07894897460938, 349.861083984375, 7.8142547607421875], [99.02603149414062, 320.67138671875, 7.79296875], [75.35498046875, 280.7127380371094, 7.79833984375], [605.5189819335938, 258.36474609375, 7.6539459228515625], [636.6569213867188, 261.03448486328125, 7.6003265380859375], [672.689453125, 216.68638610839844, 6.8922119140625], [536.8645629882812, 215.08010864257812, 6.9332427978515625], [480.609130859375, 193.2221221923828, 7.156890869140625], [498.7352294921875, 169.0961151123047, 7.0008087158203125], [527.0252075195312, 168.48736572265625, 6.879364013671875], [556.564453125, 174.32501220703125, 6.8116912841796875], [582.2213134765625, 183.7449188232422, 6.796417236328125], [619.771728515625, 185.09783935546875, 6.7884368896484375], [646.1015625, 177.27572631835938, 6.788299560546875], [678.3016357421875, 172.73214721679688, 6.8334197998046875], [709.5665283203125, 174.52818298339844, 6.94036865234375], [730.6221313476562, 199.52928161621094, 7.08001708984375], [600.2632446289062, 215.79234313964844, 6.797698974609375], [598.0828247070312, 240.45635986328125, 6.753753662109375], [596.2218627929688, 264.4862976074219, 6.70782470703125], [594.4674072265625, 287.62481689453125, 6.66571044921875], [572.7188110351562, 305.8975830078125, 6.8535308837890625], [583.9725341796875, 311.3199157714844, 6.8229217529296875], [596.401123046875, 315.5985107421875, 6.804962158203125], [609.6165771484375, 311.5094909667969, 6.8159027099609375], [622.2186279296875, 306.6711120605469, 6.8405303955078125], [512.6423950195312, 211.75982666015625, 7.02471923828125], [528.5633544921875, 204.07089233398438, 6.9400634765625], [548.4610595703125, 205.9830780029297, 6.92816162109375], [565.9568481445312, 217.66900634765625, 6.9529266357421875], [548.8089599609375, 222.94613647460938, 6.9491424560546875], [530.2134399414062, 222.75762939453125, 6.9624176025390625], [639.6070556640625, 219.82444763183594, 6.930755615234375], [655.8860473632812, 209.6044158935547, 6.8970184326171875], [676.3201904296875, 208.3985595703125, 6.8957061767578125], [694.9487915039062, 217.1615753173828, 6.9696502685546875], [674.3418579101562, 226.85595703125, 6.9189300537109375], [655.4156494140625, 225.6745147705078, 6.91705322265625], [551.7490234375, 353.2354736328125, 6.971923828125], [564.1500244140625, 346.4883728027344, 6.88177490234375], [583.2034912109375, 344.99609375, 6.8333587646484375], [595.4065551757812, 347.21868896484375, 6.8253173828125], [607.8397216796875, 345.721435546875, 6.82666015625], [629.6182250976562, 348.2886047363281, 6.8668060302734375], [648.6402587890625, 353.0809631347656, 6.940582275390625], [634.0433349609375, 361.12738037109375, 6.8939056396484375], [612.543212890625, 365.1044921875, 6.8557891845703125], [598.9017333984375, 366.5699768066406, 6.8533477783203125], [585.4385375976562, 366.0231018066406, 6.8624725341796875], [566.12353515625, 362.2437744140625, 6.9132232666015625], [553.4495239257812, 352.7164001464844, 6.97503662109375], [583.9151000976562, 355.8670654296875, 6.8811187744140625], [596.3876342773438, 356.340576171875, 6.8712615966796875], [608.99560546875, 356.22100830078125, 6.8746795654296875], [648.081787109375, 352.85076904296875, 6.94110107421875], 
[612.7412719726562, 351.5333251953125, 6.865570068359375], [598.9871215820312, 351.8242492675781, 6.8616485595703125], [585.3312377929688, 352.4969482421875, 6.87408447265625], [464.1539001464844, 202.29954528808594, 7.4058380126953125], [465.8164978027344, 244.8143768310547, 7.313018798828125], [469.96026611328125, 282.73333740234375, 7.331451416015625], [474.998779296875, 318.5062255859375, 7.377685546875], [485.900390625, 354.82257080078125, 7.34814453125], [503.9440002441406, 389.1557922363281, 7.29644775390625], [533.9607543945312, 420.1808776855469, 7.2111968994140625], [569.1990356445312, 439.69488525390625, 7.0761260986328125], [604.7715454101562, 445.1242370605469, 7.0256805419921875], [641.609130859375, 438.5807189941406, 7.05670166015625], [677.1731567382812, 419.1774597167969, 7.1628265380859375], [709.558349609375, 390.3476867675781, 7.262908935546875], [728.9358520507812, 358.6229553222656, 7.3195648193359375], [743.6824951171875, 323.7010192871094, 7.3823699951171875], [752.355224609375, 286.009033203125, 7.3757171630859375], [756.031494140625, 248.0742645263672, 7.3575439453125], [756.6275634765625, 206.8378448486328, 7.39019775390625]], "keypoints_valid": [[1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0]], "camera_param": {"focal": [34553.93155415853, 34553.93075942993], "princpt": [605.3033752441406, 358.99560546875]}}], "categories": [{"supercategory": "person", "id": 1, "name": "person"}]} \ No newline at end of file diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_ubody_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_ubody_dataset.py new file mode 100644 index 0000000000..12f780e1a0 --- /dev/null +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_ubody_dataset.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import numpy as np + +from mmpose.datasets.datasets.body3d import UBody3dDataset + + +class TestUBody3dDataset(TestCase): + + def build_ubody3d_dataset(self, **kwargs): + + cfg = dict( + ann_file='ubody3d_train.json', + data_mode='topdown', + data_root='tests/data/ubody3d', + pipeline=[], + test_mode=False) + + cfg.update(kwargs) + return UBody3dDataset(**cfg) + + def check_data_info_keys(self, data_info: dict): + expected_keys = dict( + img_paths=list, + keypoints=np.ndarray, + keypoints_3d=np.ndarray, + scale=np.ndarray, + center=np.ndarray, + id=int) + + for key, type_ in expected_keys.items(): + self.assertIn(key, data_info) + self.assertIsInstance(data_info[key], type_, key) + + def test_metainfo(self): + dataset = self.build_ubody3d_dataset() + # test dataset_name + self.assertEqual(dataset.metainfo['dataset_name'], 'ubody3d') + + # test number of keypoints + num_keypoints = 137 + self.assertEqual(dataset.metainfo['num_keypoints'], num_keypoints) + self.assertEqual( + len(dataset.metainfo['keypoint_colors']), num_keypoints) + self.assertEqual( + len(dataset.metainfo['dataset_keypoint_weights']), num_keypoints) + + # test some extra metainfo + self.assertEqual( + len(dataset.metainfo['skeleton_links']), + len(dataset.metainfo['skeleton_link_colors'])) + + def test_topdown(self): + # test topdown training + dataset = self.build_ubody3d_dataset(data_mode='topdown') + dataset.full_init() + self.assertEqual(len(dataset), 1) + self.check_data_info_keys(dataset[0]) + + # test topdown testing + dataset = self.build_ubody3d_dataset( + data_mode='topdown', test_mode=True) + dataset.full_init() + self.assertEqual(len(dataset), 1) + self.check_data_info_keys(dataset[0]) + + # test topdown training with sequence config + dataset = self.build_ubody3d_dataset( + data_mode='topdown', + seq_len=1, + seq_step=1, + causal=False, + pad_video_seq=True) + dataset.full_init() + self.assertEqual(len(dataset), 1) + self.check_data_info_keys(dataset[0]) From cf43e2fb1722627aedcd28b3bf756f2181b87d6b Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 19 Sep 2023 11:19:34 +0800 Subject: [PATCH 18/21] fix ut --- ...ose-lift_motionbert-ft_8xb32-120e_ubody.py | 188 ------------------ mmpose/datasets/transforms/converting.py | 29 ++- 2 files changed, 12 insertions(+), 205 deletions(-) delete mode 100644 configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py diff --git a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py b/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py deleted file mode 100644 index 16546693c9..0000000000 --- a/configs/body_3d_keypoint/pose_lift/ubody/pose-lift_motionbert-ft_8xb32-120e_ubody.py +++ /dev/null @@ -1,188 +0,0 @@ -_base_ = ['../../../_base_/default_runtime.py'] - -vis_backends = [ - dict(type='LocalVisBackend'), -] -visualizer = dict( - type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -# runtime -train_cfg = dict(max_epochs=120, val_interval=10) - -# optimizer -optim_wrapper = dict( - optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) - -# learning policy -param_scheduler = [ - dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True) -] - -auto_scale_lr = dict(base_batch_size=512) - -# hooks -default_hooks = dict( - checkpoint=dict( - type='CheckpointHook', - save_best='MPJPE', - rule='less', - max_keep_ckpts=1), - logger=dict(type='LoggerHook', interval=20), -) - -# codec settings -train_codec = dict( - 
type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') -val_codec = dict( - type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) - -# model settings -model = dict( - type='PoseLifter', - backbone=dict( - type='DSTFormer', - in_channels=3, - feat_size=512, - depth=5, - num_heads=8, - mlp_ratio=2, - seq_len=120, - att_fuse=True, - ), - head=dict( - type='MotionRegressionHead', - in_channels=512, - out_channels=3, - embedding_size=512, - loss=dict(type='MPJPEVelocityJointLoss'), - decoder=val_codec, - ), - test_cfg=dict(flip_test=True), - init_cfg=dict( - type='Pretrained', - checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/' - 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'), -) - -# base dataset settings -dataset_type = 'UBody3dDataset' -data_mode = 'topdown' -data_root = 'data/UBody/' - -# mapping -ubody_h36m = [ - (0, 0), - (2, 1), - (4, 2), - (6, 3), - (1, 4), - (3, 5), - (5, 6), - ((0, 7), 7), - (7, 8), - ((7, 24), 9), - (24, 10), - (8, 11), - (10, 12), - (12, 13), - (9, 14), - (11, 15), - (13, 16), -] - -scenes = [ - 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', - 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', - 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' -] - -train_datasets = [] -val_datasets = [] - -for scene in scenes: - train_dataset = dict( - type=dataset_type, - data_root=data_root, - ann_file=f'annotations/{scene}/train_3dkeypoint_annotation.json', - seq_len=1, - multiple_target=120, - multiple_target_step=60, - data_prefix=dict(img='images/'), - pipeline=[], - ) - if scene in ['Speech', 'Movie']: - continue - val_dataset = dict( - type=dataset_type, - ann_file=f'annotations/{scene}/val_3dkeypoint_annotation.json', - seq_len=1, - seq_step=1, - multiple_target=120, - data_root=data_root, - data_prefix=dict(img='images/'), - pipeline=[], - test_mode=True, - ) - train_datasets.append(train_dataset) - val_datasets.append(val_dataset) - -# pipelines -train_pipeline = [ - dict(type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m), - dict(type='GenerateTarget', encoder=train_codec), - dict( - type='RandomFlipAroundRoot', - keypoints_flip_cfg=dict(center_mode='static', center_x=0.), - target_flip_cfg=dict(center_mode='static', center_x=0.), - flip_label=True), - dict( - type='PackPoseInputs', - meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', - 'factor', 'camera_param')) -] -val_pipeline = [ - dict(type='KeypointConverter', num_keypoints=17, mapping=ubody_h36m), - dict(type='GenerateTarget', encoder=val_codec), - dict( - type='PackPoseInputs', - meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', - 'factor', 'camera_param')) -] - -# data loaders -train_dataloader = dict( - batch_size=32, - prefetch_factor=4, - pin_memory=True, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='CombinedDataset', - datasets=train_datasets, - metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), - pipeline=train_pipeline, - test_mode=False)) - -val_dataloader = dict( - batch_size=32, - prefetch_factor=4, - pin_memory=True, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), - dataset=dict( - type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/ubody3d.py'), - datasets=val_datasets, - pipeline=val_pipeline, - test_mode=True, - )) -test_dataloader = val_dataloader - -# 
evaluators -val_evaluator = [ - dict(type='SimpleMPJPE', mode='mpjpe'), - dict(type='SimpleMPJPE', mode='p-mpjpe') -] -test_evaluator = val_evaluator diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index 457234c4ee..d47829d06e 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -91,14 +91,13 @@ def transform(self, results: dict) -> dict: num_instances = results['keypoints'].shape[0] # Initialize output arrays - keypoints = np.zeros((num_instances, self.num_keypoints, 2), - dtype=np.float32) - keypoints_visible = np.zeros((num_instances, self.num_keypoints), - dtype=np.float32) - keypoints_3d = None + keypoints = np.zeros((num_instances, self.num_keypoints, 2)) + keypoints_visible = np.zeros((num_instances, self.num_keypoints)) + if 'keypoints_3d' in results: keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3), dtype=np.float32) + flip_indices = results.get('flip_indices', None) # Create a mask to weight visibility loss keypoints_visible_weights = keypoints_visible.copy() @@ -119,6 +118,13 @@ def transform(self, results: dict) -> dict: results['keypoints_3d'][:, self.source_index] + results['keypoints_3d'][:, self.source_index2]) + # Flip keypoints if flip_indices provided + if flip_indices is not None: + for i, (x1, x2) in enumerate( + zip(self.source_index, self.source_index2)): + id = flip_indices[x1] if x1 == x2 else i + flip_indices[i] = id if id < self.num_keypoints else i + flip_indices = flip_indices[:len(self.source_index)] # Otherwise just copy from the source index else: keypoints[:, @@ -139,19 +145,8 @@ def transform(self, results: dict) -> dict: results['lifting_target'] = keypoints_3d[results['target_idx']] results['lifting_target_visible'] = keypoints_visible[ results['target_idx']] + results['flip_indices'] = flip_indices - # Updatae flip pairs - if 'flip_indices' in results: - flip_indices = [] - for i in range(len(self.target_index)): - x1, x2 = self.source_index[i], self.source_index2[i] - if x1 == x2: - flip_id = results['flip_indices'][x1] - flip_id = flip_id if flip_id < self.num_keypoints else i - flip_indices.append(flip_id) - else: - flip_indices.append(i) - results['flip_indices'] = flip_indices return results def transform_sigmas(self, sigmas: Union[List, np.ndarray]): From 748432d047b65cb7c9bffd4828e0a8e56feb0dc9 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 19 Sep 2023 15:02:53 +0800 Subject: [PATCH 19/21] add preparation doc --- configs/_base_/datasets/ubody3d.py | 150 +++++++++++++----------- docs/en/dataset_zoo/3d_body_keypoint.md | 98 ++++++++++++++++ 2 files changed, 181 insertions(+), 67 deletions(-) diff --git a/configs/_base_/datasets/ubody3d.py b/configs/_base_/datasets/ubody3d.py index e2dfe0c570..9242559ea1 100644 --- a/configs/_base_/datasets/ubody3d.py +++ b/configs/_base_/datasets/ubody3d.py @@ -169,7 +169,7 @@ swap='R_Thumb_3'), 28: dict( - name='L_Thumb4', + name='L_Thumb_4', id=28, color=[255, 128, 0], type='', @@ -313,7 +313,7 @@ id=48, color=[255, 128, 0], type='', - swap='L_Thumb4'), + swap='L_Thumb_4'), 49: dict( name='R_Index_1', @@ -872,71 +872,87 @@ swap='Face_56'), }, skeleton_info={ - 0: dict(link=('L_Hip', 'R_Hip'), id=0, color=[0, 255, 0]), - 1: dict(link=('L_Knee', 'R_Knee'), id=1, color=[0, 255, 0]), - 2: dict(link=('L_Ankle', 'R_Ankle'), id=2, color=[0, 255, 0]), - 3: dict(link=('L_Shoulder', 'R_Shoulder'), id=3, color=[0, 255, 0]), - 4: dict(link=('L_Elbow', 'R_Elbow'), id=4, color=[0, 255, 0]), - 5: 
dict(link=('L_Wrist', 'R_Wrist'), id=5, color=[0, 255, 0]), - 6: dict(link=('L_Big_toe', 'R_Big_toe'), id=6, color=[0, 255, 0]), - 7: dict(link=('L_Small_toe', 'R_Small_toe'), id=7, color=[0, 255, 0]), - 8: dict(link=('L_Heel', 'R_Heel'), id=8, color=[0, 255, 0]), - 9: dict(link=('L_Ear', 'R_Ear'), id=9, color=[0, 255, 0]), - 10: dict(link=('L_Eye', 'R_Eye'), id=10, color=[0, 255, 0]), - 11: dict(link=('L_Thumb_1', 'R_Thumb_1'), id=11, color=[255, 128, 0]), - 12: dict(link=('L_Thumb_2', 'R_Thumb_2'), id=12, color=[255, 128, 0]), - 13: dict(link=('L_Thumb_3', 'R_Thumb_3'), id=13, color=[255, 128, 0]), - 14: dict(link=('L_Thumb4', 'R_Thumb_4'), id=14, color=[255, 128, 0]), - 15: dict(link=('L_Index_1', 'R_Index_1'), id=15, color=[255, 128, 0]), - 16: dict(link=('L_Index_2', 'R_Index_2'), id=16, color=[255, 128, 0]), - 17: dict(link=('L_Index_3', 'R_Index_3'), id=17, color=[255, 128, 0]), - 18: dict(link=('L_Index_4', 'R_Index_4'), id=18, color=[255, 128, 0]), - 19: - dict(link=('L_Middle_1', 'R_Middle_1'), id=19, color=[255, 128, 0]), - 20: - dict(link=('L_Middle_2', 'R_Middle_2'), id=20, color=[255, 128, 0]), - 21: - dict(link=('L_Middle_3', 'R_Middle_3'), id=21, color=[255, 128, 0]), - 22: - dict(link=('L_Middle_4', 'R_Middle_4'), id=22, color=[255, 128, 0]), - 23: dict(link=('L_Ring_1', 'R_Ring_1'), id=23, color=[255, 128, 0]), - 24: dict(link=('L_Ring_2', 'R_Ring_2'), id=24, color=[255, 128, 0]), - 25: dict(link=('L_Ring_3', 'R_Ring_3'), id=25, color=[255, 128, 0]), - 26: dict(link=('L_Ring_4', 'R_Ring_4'), id=26, color=[255, 128, 0]), - 27: dict(link=('L_Pinky_1', 'R_Pinky_1'), id=27, color=[255, 128, 0]), - 28: dict(link=('L_Pinky_2', 'R_Pinky_2'), id=28, color=[255, 128, 0]), - 29: dict(link=('L_Pinky_3', 'R_Pinky_3'), id=29, color=[255, 128, 0]), - 30: dict(link=('L_Pinky_4', 'R_Pinky_4'), id=30, color=[255, 128, 0]), - 31: dict(link=('Face_3', 'Face_4'), id=31, color=[255, 255, 255]), - 32: dict(link=('Face_5', 'Face_14'), id=32, color=[255, 255, 255]), - 33: dict(link=('Face_6', 'Face_13'), id=33, color=[255, 255, 255]), - 34: dict(link=('Face_7', 'Face_12'), id=34, color=[255, 255, 255]), - 35: dict(link=('Face_8', 'Face_11'), id=35, color=[255, 255, 255]), - 36: dict(link=('Face_9', 'Face_10'), id=36, color=[255, 255, 255]), - 37: dict(link=('Face_19', 'Face_23'), id=37, color=[255, 255, 255]), - 38: dict(link=('Face_20', 'Face_22'), id=38, color=[255, 255, 255]), - 39: dict(link=('Face_24', 'Face_33'), id=39, color=[255, 255, 255]), - 40: dict(link=('Face_25', 'Face_32'), id=40, color=[255, 255, 255]), - 41: dict(link=('Face_26', 'Face_31'), id=41, color=[255, 255, 255]), - 42: dict(link=('Face_27', 'Face_30'), id=42, color=[255, 255, 255]), - 43: dict(link=('Face_28', 'Face_35'), id=43, color=[255, 255, 255]), - 44: dict(link=('Face_29', 'Face_34'), id=44, color=[255, 255, 255]), - 45: dict(link=('Face_36', 'Face_42'), id=45, color=[255, 255, 255]), - 46: dict(link=('Face_37', 'Face_41'), id=46, color=[255, 255, 255]), - 47: dict(link=('Face_38', 'Face_40'), id=47, color=[255, 255, 255]), - 48: dict(link=('Face_43', 'Face_47'), id=48, color=[255, 255, 255]), - 49: dict(link=('Face_44', 'Face_46'), id=49, color=[255, 255, 255]), - 50: dict(link=('Face_48', 'Face_52'), id=50, color=[255, 255, 255]), - 51: dict(link=('Face_49', 'Face_51'), id=51, color=[255, 255, 255]), - 52: dict(link=('Face_53', 'Face_55'), id=52, color=[255, 255, 255]), - 53: dict(link=('Face_56', 'Face_72'), id=53, color=[255, 255, 255]), - 54: dict(link=('Face_57', 'Face_71'), id=54, color=[255, 255, 255]), - 
55: dict(link=('Face_58', 'Face_70'), id=55, color=[255, 255, 255]), - 56: dict(link=('Face_59', 'Face_69'), id=56, color=[255, 255, 255]), - 57: dict(link=('Face_60', 'Face_68'), id=57, color=[255, 255, 255]), - 58: dict(link=('Face_61', 'Face_67'), id=58, color=[255, 255, 255]), - 59: dict(link=('Face_62', 'Face_66'), id=59, color=[255, 255, 255]), - 60: dict(link=('Face_63', 'Face_65'), id=60, color=[255, 255, 255]), + 0: dict(link=('L_Ankle', 'L_Knee'), id=0, color=[0, 255, 0]), + 1: dict(link=('L_Knee', 'L_Hip'), id=1, color=[0, 255, 0]), + 2: dict(link=('R_Ankle', 'R_Knee'), id=2, color=[0, 255, 0]), + 3: dict(link=('R_Knee', 'R_Hip'), id=3, color=[0, 255, 0]), + 4: dict(link=('L_Hip', 'R_Hip'), id=4, color=[0, 255, 0]), + 5: dict(link=('L_Shoulder', 'L_Hip'), id=5, color=[0, 255, 0]), + 6: dict(link=('R_Shoulder', 'R_Hip'), id=6, color=[0, 255, 0]), + 7: dict(link=('L_Shoulder', 'R_Shoulder'), id=7, color=[0, 255, 0]), + 8: dict(link=('L_Shoulder', 'L_Elbow'), id=8, color=[0, 255, 0]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_Elbow', 'L_Wrist'), id=10, color=[0, 255, 0]), + 11: dict(link=('R_Elbow', 'R_Wrist'), id=11, color=[255, 128, 0]), + 12: dict(link=('L_Eye', 'R_Eye'), id=12, color=[255, 128, 0]), + 13: dict(link=('Nose', 'L_Eye'), id=13, color=[255, 128, 0]), + 14: dict(link=('Nose', 'R_Eye'), id=14, color=[255, 128, 0]), + 15: dict(link=('L_Eye', 'L_Ear'), id=15, color=[255, 128, 0]), + 16: dict(link=('R_Eye', 'R_Ear'), id=16, color=[255, 128, 0]), + 17: dict(link=('L_Ear', 'L_Shoulder'), id=17, color=[255, 128, 0]), + 18: dict(link=('R_Ear', 'R_Shoulder'), id=18, color=[255, 128, 0]), + 19: dict(link=('L_Ankle', 'L_Big_toe'), id=19, color=[255, 128, 0]), + 20: dict(link=('L_Ankle', 'L_Small_toe'), id=20, color=[255, 128, 0]), + 21: dict(link=('L_Ankle', 'L_Heel'), id=21, color=[255, 128, 0]), + 22: dict(link=('R_Ankle', 'R_Big_toe'), id=22, color=[255, 128, 0]), + 23: dict(link=('R_Ankle', 'R_Small_toe'), id=23, color=[255, 128, 0]), + 24: dict(link=('R_Ankle', 'R_Heel'), id=24, color=[255, 128, 0]), + 25: dict(link=('L_Wrist', 'L_Thumb_1'), id=25, color=[255, 128, 0]), + 26: dict(link=('L_Thumb_1', 'L_Thumb_2'), id=26, color=[255, 128, 0]), + 27: dict(link=('L_Thumb_2', 'L_Thumb_3'), id=27, color=[255, 128, 0]), + 28: dict(link=('L_Thumb_3', 'L_Thumb_4'), id=28, color=[255, 128, 0]), + 29: dict(link=('L_Wrist', 'L_Index_1'), id=29, color=[255, 128, 0]), + 30: dict(link=('L_Index_1', 'L_Index_2'), id=30, color=[255, 128, 0]), + 31: + dict(link=('L_Index_2', 'L_Index_3'), id=31, color=[255, 255, 255]), + 32: + dict(link=('L_Index_3', 'L_Index_4'), id=32, color=[255, 255, 255]), + 33: dict(link=('L_Wrist', 'L_Middle_1'), id=33, color=[255, 255, 255]), + 34: + dict(link=('L_Middle_1', 'L_Middle_2'), id=34, color=[255, 255, 255]), + 35: + dict(link=('L_Middle_2', 'L_Middle_3'), id=35, color=[255, 255, 255]), + 36: + dict(link=('L_Middle_3', 'L_Middle_4'), id=36, color=[255, 255, 255]), + 37: dict(link=('L_Wrist', 'L_Ring_1'), id=37, color=[255, 255, 255]), + 38: dict(link=('L_Ring_1', 'L_Ring_2'), id=38, color=[255, 255, 255]), + 39: dict(link=('L_Ring_2', 'L_Ring_3'), id=39, color=[255, 255, 255]), + 40: dict(link=('L_Ring_3', 'L_Ring_4'), id=40, color=[255, 255, 255]), + 41: dict(link=('L_Wrist', 'L_Pinky_1'), id=41, color=[255, 255, 255]), + 42: + dict(link=('L_Pinky_1', 'L_Pinky_2'), id=42, color=[255, 255, 255]), + 43: + dict(link=('L_Pinky_2', 'L_Pinky_3'), id=43, color=[255, 255, 255]), + 44: + dict(link=('L_Pinky_3', 
'L_Pinky_4'), id=44, color=[255, 255, 255]), + 45: dict(link=('R_Wrist', 'R_Thumb_1'), id=45, color=[255, 255, 255]), + 46: + dict(link=('R_Thumb_1', 'R_Thumb_2'), id=46, color=[255, 255, 255]), + 47: + dict(link=('R_Thumb_2', 'R_Thumb_3'), id=47, color=[255, 255, 255]), + 48: + dict(link=('R_Thumb_3', 'R_Thumb_4'), id=48, color=[255, 255, 255]), + 49: dict(link=('R_Wrist', 'R_Index_1'), id=49, color=[255, 255, 255]), + 50: + dict(link=('R_Index_1', 'R_Index_2'), id=50, color=[255, 255, 255]), + 51: + dict(link=('R_Index_2', 'R_Index_3'), id=51, color=[255, 255, 255]), + 52: + dict(link=('R_Index_3', 'R_Index_4'), id=52, color=[255, 255, 255]), + 53: dict(link=('R_Wrist', 'R_Middle_1'), id=53, color=[255, 255, 255]), + 54: + dict(link=('R_Middle_1', 'R_Middle_2'), id=54, color=[255, 255, 255]), + 55: + dict(link=('R_Middle_2', 'R_Middle_3'), id=55, color=[255, 255, 255]), + 56: + dict(link=('R_Middle_3', 'R_Middle_4'), id=56, color=[255, 255, 255]), + 57: dict(link=('R_Wrist', 'R_Pinky_1'), id=57, color=[255, 255, 255]), + 58: + dict(link=('R_Pinky_1', 'R_Pinky_2'), id=58, color=[255, 255, 255]), + 59: + dict(link=('R_Pinky_2', 'R_Pinky_3'), id=59, color=[255, 255, 255]), + 60: + dict(link=('R_Pinky_3', 'R_Pinky_4'), id=60, color=[255, 255, 255]), }, joint_weights=[1.] * 137, sigmas=[]) diff --git a/docs/en/dataset_zoo/3d_body_keypoint.md b/docs/en/dataset_zoo/3d_body_keypoint.md index 82e21010fc..3a35e2443b 100644 --- a/docs/en/dataset_zoo/3d_body_keypoint.md +++ b/docs/en/dataset_zoo/3d_body_keypoint.md @@ -8,6 +8,7 @@ MMPose supported datasets: - [Human3.6M](#human36m) \[ [Homepage](http://vision.imar.ro/human3.6m/description.php) \] - [CMU Panoptic](#cmu-panoptic) \[ [Homepage](http://domedb.perception.cs.cmu.edu/) \] - [Campus/Shelf](#campus-and-shelf) \[ [Homepage](http://campar.in.tum.de/Chair/MultiHumanPose) \] +- [UBody](#ubody3d) \[ [Homepage](https://osx-ubody.github.io/) \] ## Human3.6M @@ -197,3 +198,100 @@ mmpose | ├── pred_shelf_maskrcnn_hrnet_coco.pkl | ├── actorsGT.mat ``` + +## UBody3d + +
+UBody (CVPR'2023)
+
+```bibtex
+@inproceedings{lin2023one,
+  title={One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer},
+  author={Lin, Jing and Zeng, Ailing and Wang, Haoqian and Zhang, Lei and Li, Yu},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  year={2023},
+}
+```
+
+
+
+For the [UBody](https://github.com/IDEA-Research/OSX) dataset, videos and annotations can be downloaded from the [OSX homepage](https://github.com/IDEA-Research/OSX).
+
+Download and extract them under $MMPOSE/data, and organize them as follows:
+
+```text
+mmpose
+├── mmpose
+├── docs
+├── tests
+├── tools
+├── configs
+`── data
+    │── UBody
+        ├── annotations
+        │   ├── ConductMusic
+        │   ├── Entertainment
+        │   ├── Fitness
+        │   ├── Interview
+        │   ├── LiveVlog
+        │   ├── Magic_show
+        │   ├── Movie
+        │   ├── Olympic
+        │   ├── Online_class
+        │   ├── SignLanguage
+        │   ├── Singing
+        │   ├── Speech
+        │   ├── TVShow
+        │   ├── TalkShow
+        │   └── VideoConference
+        ├── splits
+        │   ├── inter_scene_test_list.npy
+        │   └── intra_scene_test_list.npy
+        ├── videos
+        │   ├── ConductMusic
+        │   ├── Entertainment
+        │   ├── Fitness
+        │   ├── Interview
+        │   ├── LiveVlog
+        │   ├── Magic_show
+        │   ├── Movie
+        │   ├── Olympic
+        │   ├── Online_class
+        │   ├── SignLanguage
+        │   ├── Singing
+        │   ├── Speech
+        │   ├── TVShow
+        │   ├── TalkShow
+        │   └── VideoConference
+```
+
+Convert the videos to images, then split them into train/val sets:
+
+```shell
+python tools/dataset_converters/ubody_kpts_to_coco.py
+```
+
+Before generating 3D keypoints, you need to install the SMPL-X tools and download the human models. Please refer to [Github](https://github.com/vchoutas/smplx#installation) and [SMPLX](https://smpl-x.is.tue.mpg.de/download.php).
+
+```shell
+pip install smplx
+```
+
+The directory tree of the human models should look like this:
+
+```text
+human_model_path
+|── smplx
+    ├── SMPLX_NEUTRAL.npz
+    ├── SMPLX_NEUTRAL.pkl
+```
+
+After the above preparations are finished, execute the following script:
+
+```shell
+python tools/dataset_converters/ubody_smplx_to_coco.py --data-root {$MMPOSE/data/UBody} --human-model-path {$MMPOSE/data/human_model_path/}
+```

From b296a2aec8b6ba4d9a6dd80ef4cf33c376d65234 Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Tue, 19 Sep 2023 19:24:07 +0800
Subject: [PATCH 20/21] update keypoint convert

---
 mmpose/datasets/transforms/converting.py | 39 ++++++++----------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py
index d47829d06e..90536bd736 100644
--- a/mmpose/datasets/transforms/converting.py
+++ b/mmpose/datasets/transforms/converting.py
@@ -91,12 +91,10 @@ def transform(self, results: dict) -> dict:
         num_instances = results['keypoints'].shape[0]
 
         # Initialize output arrays
-        keypoints = np.zeros((num_instances, self.num_keypoints, 2))
+        keypoints = np.zeros((num_instances, self.num_keypoints, 3))
         keypoints_visible = np.zeros((num_instances, self.num_keypoints))
+        key = 'keypoints_3d' if 'keypoints_3d' in results else 'keypoints'
 
-        if 'keypoints_3d' in results:
-            keypoints_3d = np.zeros((num_instances, self.num_keypoints, 3),
-                                    dtype=np.float32)
         flip_indices = results.get('flip_indices', None)
 
         # Create a mask to weight visibility loss
@@ -106,43 +104,32 @@ def transform(self, results: dict) -> dict:
         # Interpolate keypoints if pairs of source indexes provided
         if self.interpolation:
             keypoints[:, self.target_index] = 0.5 * (
-                results['keypoints'][:, self.source_index] +
-                results['keypoints'][:, self.source_index2])
-
+                results[key][:, self.source_index] +
+                results[key][:, self.source_index2])
             keypoints_visible[:, self.target_index] = results[
-                'keypoints_visible'][:, self.source_index] * \
-                results['keypoints_visible'][:, self.source_index2]
-
-            if 'keypoints_3d' in results:
-                keypoints_3d[:, 
self.target_index] = 0.5 * ( - results['keypoints_3d'][:, self.source_index] + - results['keypoints_3d'][:, self.source_index2]) - + 'keypoints_visible'][:, self.source_index] * results[ + 'keypoints_visible'][:, self.source_index2] # Flip keypoints if flip_indices provided if flip_indices is not None: for i, (x1, x2) in enumerate( zip(self.source_index, self.source_index2)): - id = flip_indices[x1] if x1 == x2 else i - flip_indices[i] = id if id < self.num_keypoints else i + idx = flip_indices[x1] if x1 == x2 else i + flip_indices[i] = idx if idx < self.num_keypoints else i flip_indices = flip_indices[:len(self.source_index)] # Otherwise just copy from the source index else: - keypoints[:, - self.target_index] = results['keypoints'][:, self. - source_index] + keypoints[:, self.target_index] = results[key][:, + self.source_index] keypoints_visible[:, self.target_index] = results[ 'keypoints_visible'][:, self.source_index] - if 'keypoints_3d' in results: - keypoints_3d[:, self.target_index] = results[ - 'keypoints_3d'][:, self.source_index] # Update the results dict - results['keypoints'] = keypoints + results['keypoints'] = keypoints[..., :2] results['keypoints_visible'] = np.stack( [keypoints_visible, keypoints_visible_weights], axis=2) if 'keypoints_3d' in results: - results['keypoints_3d'] = keypoints_3d - results['lifting_target'] = keypoints_3d[results['target_idx']] + results['keypoints_3d'] = keypoints + results['lifting_target'] = keypoints[results['target_idx']] results['lifting_target_visible'] = keypoints_visible[ results['target_idx']] results['flip_indices'] = flip_indices From 693f087bfb28b7b4aa0b9bbf0b2c135e358dcbce Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 20 Sep 2023 10:31:56 +0800 Subject: [PATCH 21/21] fix 2d kpts --- mmpose/datasets/transforms/converting.py | 8 +++--- .../test_transforms/test_converting.py | 27 +++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py index 90536bd736..1906f16972 100644 --- a/mmpose/datasets/transforms/converting.py +++ b/mmpose/datasets/transforms/converting.py @@ -94,6 +94,7 @@ def transform(self, results: dict) -> dict: keypoints = np.zeros((num_instances, self.num_keypoints, 3)) keypoints_visible = np.zeros((num_instances, self.num_keypoints)) key = 'keypoints_3d' if 'keypoints_3d' in results else 'keypoints' + c = results[key].shape[-1] flip_indices = results.get('flip_indices', None) @@ -103,7 +104,7 @@ def transform(self, results: dict) -> dict: # Interpolate keypoints if pairs of source indexes provided if self.interpolation: - keypoints[:, self.target_index] = 0.5 * ( + keypoints[:, self.target_index, :c] = 0.5 * ( results[key][:, self.source_index] + results[key][:, self.source_index2]) keypoints_visible[:, self.target_index] = results[ @@ -118,8 +119,9 @@ def transform(self, results: dict) -> dict: flip_indices = flip_indices[:len(self.source_index)] # Otherwise just copy from the source index else: - keypoints[:, self.target_index] = results[key][:, - self.source_index] + keypoints[:, + self.target_index, :c] = results[key][:, + self.source_index] keypoints_visible[:, self.target_index] = results[ 'keypoints_visible'][:, self.source_index] diff --git a/tests/test_datasets/test_transforms/test_converting.py b/tests/test_datasets/test_transforms/test_converting.py index 5cce813b70..dc4376baf9 100644 --- a/tests/test_datasets/test_transforms/test_converting.py +++ 
b/tests/test_datasets/test_transforms/test_converting.py @@ -81,6 +81,33 @@ def test_transform(self): self.data_info['keypoints_visible'][:, source_index]).all()) + # check 3d keypoint + self.data_info['keypoints_3d'] = np.random.random((4, 17, 3)) + self.data_info['target_idx'] = [-1] + mapping = [(3, 0), (6, 1), (16, 2), (5, 3)] + transform = KeypointConverter(num_keypoints=5, mapping=mapping) + results = transform(self.data_info.copy()) + + # check shape + self.assertEqual(results['keypoints_3d'].shape[0], + self.data_info['keypoints_3d'].shape[0]) + self.assertEqual(results['keypoints_3d'].shape[1], 5) + self.assertEqual(results['keypoints_3d'].shape[2], 3) + self.assertEqual(results['keypoints_visible'].shape[0], + self.data_info['keypoints_visible'].shape[0]) + self.assertEqual(results['keypoints_visible'].shape[1], 5) + + # check value + for source_index, target_index in mapping: + self.assertTrue( + (results['keypoints_3d'][:, target_index] == + self.data_info['keypoints_3d'][:, source_index]).all()) + self.assertEqual(results['keypoints_visible'].ndim, 3) + self.assertEqual(results['keypoints_visible'].shape[2], 2) + self.assertTrue( + (results['keypoints_visible'][:, target_index, 0] == + self.data_info['keypoints_visible'][:, source_index]).all()) + def test_transform_sigmas(self): mapping = [(3, 0), (6, 1), (16, 2), (5, 3)]
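
For reference, the following is a minimal sketch (not part of the patch) showing how the updated `KeypointConverter` could be exercised on a sample that carries both 2D and 3D keypoints, mirroring the new test case above. It assumes an environment where MMPose with these changes is importable; the mapping indices and array shapes are illustrative only.

```python
import numpy as np

from mmpose.datasets.transforms import KeypointConverter

# One instance with 17 source keypoints; the mapping selects four of them as
# targets of a 5-keypoint skeleton (indices follow the test case above).
results = dict(
    keypoints=np.random.random((1, 17, 2)),
    keypoints_3d=np.random.random((1, 17, 3)),
    keypoints_visible=np.ones((1, 17)),
    target_idx=[-1],
)

transform = KeypointConverter(
    num_keypoints=5, mapping=[(3, 0), (6, 1), (16, 2), (5, 3)])
out = transform(results)

# 2D keypoints keep two channels, 3D keypoints keep all three, and the
# lifting target is selected via `target_idx`.
assert out['keypoints'].shape == (1, 5, 2)
assert out['keypoints_3d'].shape == (1, 5, 3)
assert out['lifting_target'].shape == (1, 5, 3)
assert out['keypoints_visible'].shape == (1, 5, 2)
```

The design change behind this: the transform now fills a single 3-channel buffer and slices `[..., :2]` for the 2D output, which is what removes the separate `keypoints_3d` branch of the earlier version.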