[Feature] Support ubody3d dataset (#2699)
xiexinch authored Sep 20, 2023
1 parent aa8ab14 commit 1d1350e
Showing 12 changed files with 2,002 additions and 15 deletions.
958 changes: 958 additions & 0 deletions configs/_base_/datasets/ubody3d.py

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions docs/en/dataset_zoo/3d_body_keypoint.md
@@ -8,6 +8,7 @@ MMPose supported datasets:
- [Human3.6M](#human36m) \[ [Homepage](http://vision.imar.ro/human3.6m/description.php) \]
- [CMU Panoptic](#cmu-panoptic) \[ [Homepage](http://domedb.perception.cs.cmu.edu/) \]
- [Campus/Shelf](#campus-and-shelf) \[ [Homepage](http://campar.in.tum.de/Chair/MultiHumanPose) \]
- [UBody](#ubody3d) \[ [Homepage](https://osx-ubody.github.io/) \]

## Human3.6M

@@ -197,3 +198,100 @@ mmpose
| ├── pred_shelf_maskrcnn_hrnet_coco.pkl
| ├── actorsGT.mat
```

## UBody3d

<details>
<summary align="right"><a href="https://arxiv.org/abs/2303.16160">UBody (CVPR'2023)</a></summary>

```bibtex
@inproceedings{lin2023one,
title={One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer},
author={Lin, Jing and Zeng, Ailing and Wang, Haoqian and Zhang, Lei and Li, Yu},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
year={2023},
}
```

</details>

<div align="center">
<img src="https://github.com/open-mmlab/mmpose/assets/15952744/0c97e43a-46a9-46a3-a5dd-b84bf9d6d6f2" height="300px">
</div>

For the [UBody](https://github.com/IDEA-Research/OSX) dataset, videos and annotations can be downloaded from the [OSX homepage](https://github.com/IDEA-Research/OSX).

Download and extract them under $MMPOSE/data, and make them look like this:

```text
mmpose
├── mmpose
├── docs
├── tests
├── tools
├── configs
`── data
│── UBody
├── annotations
│   ├── ConductMusic
│   ├── Entertainment
│   ├── Fitness
│   ├── Interview
│   ├── LiveVlog
│   ├── Magic_show
│   ├── Movie
│   ├── Olympic
│   ├── Online_class
│   ├── SignLanguage
│   ├── Singing
│   ├── Speech
│   ├── TVShow
│   ├── TalkShow
│   └── VideoConference
├── splits
│   ├── inter_scene_test_list.npy
│   └── intra_scene_test_list.npy
├── videos
│   ├── ConductMusic
│   ├── Entertainment
│   ├── Fitness
│   ├── Interview
│   ├── LiveVlog
│   ├── Magic_show
│   ├── Movie
│   ├── Olympic
│   ├── Online_class
│   ├── SignLanguage
│   ├── Singing
│   ├── Speech
│   ├── TVShow
│   ├── TalkShow
│   └── VideoConference
```

Convert the videos to images, then split them into train/val sets:

```shell
python tools/dataset_converters/ubody_kpts_to_coco.py
```
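
The converter script handles both steps. For reference, here is a minimal sketch of the video-to-image step, assuming OpenCV (`cv2`) is installed; the paths are illustrative, and the actual extraction and train/val split are done by `ubody_kpts_to_coco.py`:

```python
# A minimal sketch of the video-to-frames step, assuming OpenCV is
# installed. The paths are illustrative; the real conversion and the
# train/val split are done by ubody_kpts_to_coco.py.
import os

import cv2


def video_to_frames(video_path: str, out_dir: str) -> None:
    """Dump every frame of ``video_path`` into ``out_dir`` as PNG images."""
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        cv2.imwrite(os.path.join(out_dir, f'{idx:06d}.png'), frame)
        idx += 1
    cap.release()


video_to_frames('data/UBody/videos/Speech/example.mp4',
                'data/UBody/images/Speech/example')
```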

Before generating 3D keypoints, you need to install the SMPL-X tools and download the human models; please refer to the [smplx installation guide](https://github.com/vchoutas/smplx#installation) and the [SMPL-X download page](https://smpl-x.is.tue.mpg.de/download.php).

```shell
pip install smplx
```

The directory tree of the human models should look like this:

```text
human_model_path
│── smplx
├── SMPLX_NEUTRAL.npz
├── SMPLX_NEUTRAL.pkl
```
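
A quick way to verify the `smplx` install and the model files is to build a neutral SMPL-X model and run a forward pass. This is only a sanity-check sketch, with `human_model_path` pointing at the directory above:

```python
# Sanity-check sketch for the SMPL-X setup. `human_model_path` is the
# directory shown above; adjust the path to where you extracted the models.
import smplx
import torch

model = smplx.create(
    'human_model_path',  # parent directory that contains `smplx/`
    model_type='smplx',
    gender='neutral',
    use_pca=False)
output = model(betas=torch.zeros(1, 10))  # neutral shape, default pose
print(output.joints.shape)  # 3D joint locations of the posed template
```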

After the above preparations are finished, execute the following script:

```shell
python tools/dataset_converters/ubody_smplx_to_coco.py --data-root {$MMPOSE/data/UBody} --human-model-path {$MMPOSE/data/human_model_path/}
```
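
After conversion, `UBody3dDataset` can be used from a config like any other MMPose dataset. The sketch below is hypothetical; in particular, the annotation file name is a placeholder, not necessarily what the converter writes:

```python
# Hypothetical dataset config sketch; `ann_file` is a placeholder name.
train_dataset = dict(
    type='UBody3dDataset',
    data_root='data/UBody',
    ann_file='annotations/ubody3d_train.json',
    seq_len=1,
    causal=True,
    data_prefix=dict(img='images/'),
    pipeline=[],  # fill in with the usual 3D pose-lifting pipeline
)
```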
16 changes: 14 additions & 2 deletions mmpose/datasets/datasets/base/base_mocap_dataset.py
@@ -96,8 +96,7 @@ def __init__(self,
assert exists(_ann_file), (
f'Annotation file `{_ann_file}` does not exist.')

with get_local_path(_ann_file) as local_path:
self.ann_data = np.load(local_path)
self._load_ann_file(_ann_file)

self.camera_param_file = camera_param_file
if self.camera_param_file:
@@ -137,6 +136,19 @@ def __init__(self,
lazy_init=lazy_init,
max_refetch=max_refetch)

    def _load_ann_file(self, ann_file: str) -> None:
        """Load the annotation file into ``self.ann_data``.

        Subclasses may override this method to support other annotation
        formats, e.g. ``UBody3dDataset`` loads a COCO-style file.

        Args:
            ann_file (str): Annotation file path.
        """

with get_local_path(ann_file) as local_path:
self.ann_data = np.load(local_path)

@classmethod
def _load_metainfo(cls, metainfo: dict = None) -> dict:
"""Collect meta information from the dictionary of meta.
3 changes: 2 additions & 1 deletion mmpose/datasets/datasets/body3d/__init__.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .h36m_dataset import Human36mDataset
from .ubody3d_dataset import UBody3dDataset

__all__ = ['Human36mDataset']
__all__ = ['Human36mDataset', 'UBody3dDataset']
247 changes: 247 additions & 0 deletions mmpose/datasets/datasets/body3d/ubody3d_dataset.py
@@ -0,0 +1,247 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from collections import defaultdict
from typing import List, Tuple

import numpy as np
from mmengine.fileio import get_local_path
from xtcocotools.coco import COCO

from mmpose.datasets.datasets import BaseMocapDataset
from mmpose.registry import DATASETS


@DATASETS.register_module()
class UBody3dDataset(BaseMocapDataset):
"""Ubody3d dataset for 3D human pose estimation.
"One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer",
CVPR'2023. More details can be found in the `paper
<https://arxiv.org/abs/2303.16160>`__ .
Ubody3D keypoints::
0-24: 25 body keypoints,
25-64: 40 hand keypoints,
65-136: 72 face keypoints,
In total, we have 137 keypoints for wholebody 3D pose estimation.
Args:
ann_file (str): Annotation file path. Default: ''.
seq_len (int): Number of frames in a sequence. Default: 1.
multiple_target (int): If larger than 0, merge every
``multiple_target`` sequence together. Default: 0.
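        multiple_target_step (int): The interval for merging sequences. Only
            valid when ``multiple_target`` is larger than 0; if set to 0, it
            falls back to ``multiple_target``. Default: 0.
        seq_step (int): The interval for extracting frames from the video.
            Default: 1.
        pad_video_seq (bool): Whether to pad the video so that poses will be
            predicted for every frame in the video. Default: ``False``.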
causal (bool): If set to ``True``, the rightmost input frame will be
the target frame. Otherwise, the middle input frame will be the
target frame. Default: ``True``.
subset_frac (float): The fraction to reduce dataset size. If set to 1,
the dataset size is not reduced. Default: 1.
camera_param_file (str): Cameras' parameters file. Default: ``None``.
data_mode (str): Specifies the mode of data samples: ``'topdown'`` or
``'bottomup'``. In ``'topdown'`` mode, each data sample contains
one instance; while in ``'bottomup'`` mode, each data sample
            contains all instances in an image. Default: ``'topdown'``.
metainfo (dict, optional): Meta information for dataset, such as class
information. Default: ``None``.
data_root (str, optional): The root directory for ``data_prefix`` and
``ann_file``. Default: ``None``.
data_prefix (dict, optional): Prefix for training data.
Default: ``dict(img='')``.
        filter_cfg (dict, optional): Config for filtering data.
            Default: ``None``.
indices (int or Sequence[int], optional): Support using first few
data in annotation file to facilitate training/testing on a smaller
dataset. Default: ``None`` which means using all ``data_infos``.
serialize_data (bool, optional): Whether to hold memory using
serialized objects, when enabled, data loader workers can use
shared RAM from master process instead of making a copy.
Default: ``True``.
pipeline (list, optional): Processing pipeline. Default: [].
test_mode (bool, optional): ``test_mode=True`` means in test phase.
Default: ``False``.
        lazy_init (bool, optional): Whether to postpone loading annotations
            until they are actually needed. In some cases, such as
            visualization, only the meta information of the dataset is
            needed, so there is no need to load the annotation file.
            ``BaseDataset`` can skip loading annotations to save time by
            setting ``lazy_init=True``. Default: ``False``.
        max_refetch (int, optional): The maximum number of extra cycles to
            fetch a valid image if ``BaseDataset.prepare_data`` returns a
            ``None`` image. Default: 1000.
"""

def __init__(self,
multiple_target: int = 0,
multiple_target_step: int = 0,
seq_step: int = 1,
pad_video_seq: bool = False,
**kwargs):
self.seq_step = seq_step
self.pad_video_seq = pad_video_seq

if multiple_target > 0 and multiple_target_step == 0:
multiple_target_step = multiple_target
self.multiple_target_step = multiple_target_step

super().__init__(multiple_target=multiple_target, **kwargs)

METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py')

    def _load_ann_file(self, ann_file: str) -> None:
        """Load the COCO-format annotation file into ``self.ann_data``."""
        with get_local_path(ann_file) as local_path:
            self.ann_data = COCO(local_path)

def get_sequence_indices(self) -> List[List[int]]:
video_frames = defaultdict(list)
img_ids = self.ann_data.getImgIds()
for img_id in img_ids:
img_info = self.ann_data.loadImgs(img_id)[0]
subj, _, _ = self._parse_image_name(img_info['file_name'])
video_frames[subj].append(img_id)

sequence_indices = []
_len = (self.seq_len - 1) * self.seq_step + 1
_step = self.seq_step
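        # e.g. with seq_len=3 and seq_step=2, one sequence spans
        # _len = (3 - 1) * 2 + 1 = 5 raw frames, taking every 2nd frame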

if self.multiple_target:
for _, _img_ids in sorted(video_frames.items()):
n_frame = len(_img_ids)
_ann_ids = self.ann_data.getAnnIds(imgIds=_img_ids)
seqs_from_video = [
_ann_ids[i:(i + self.multiple_target):_step]
for i in range(0, n_frame, self.multiple_target_step)
][:(n_frame + self.multiple_target_step -
self.multiple_target) // self.multiple_target_step]
sequence_indices.extend(seqs_from_video)
else:
for _, _img_ids in sorted(video_frames.items()):
n_frame = len(_img_ids)
_ann_ids = self.ann_data.getAnnIds(imgIds=_img_ids)
if self.pad_video_seq:
# Pad the sequence so that every frame in the sequence will
# be predicted.
if self.causal:
frames_left = self.seq_len - 1
frames_right = 0
else:
frames_left = (self.seq_len - 1) // 2
frames_right = frames_left
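                        # e.g. causal=False with seq_len=5 keeps 2 context
                        # frames on each side of the target frame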
for i in range(n_frame):
pad_left = max(0, frames_left - i // _step)
pad_right = max(
0, frames_right - (n_frame - 1 - i) // _step)
start = max(i % _step, i - frames_left * _step)
end = min(n_frame - (n_frame - 1 - i) % _step,
i + frames_right * _step + 1)
sequence_indices.append([_ann_ids[0]] * pad_left +
_ann_ids[start:end:_step] +
[_ann_ids[-1]] * pad_right)
else:
seqs_from_video = [
_ann_ids[i:(i + _len):_step]
for i in range(0, n_frame - _len + 1, _step)
]
sequence_indices.extend(seqs_from_video)

# reduce dataset size if needed
subset_size = int(len(sequence_indices) * self.subset_frac)
start = np.random.randint(0, len(sequence_indices) - subset_size + 1)
end = start + subset_size

sequence_indices = sequence_indices[start:end]

return sequence_indices

    def _parse_image_name(self, image_path: str) -> Tuple[str, str, str]:
        """Parse an image path to get the video (trim) name, the frame id
        and the file suffix.

        Args:
            image_path (str): Image path of the form
                ``'.../<trim_name>/<frame_id>.<suffix>'``.

        Returns:
            tuple[str, str, str]: Video name, frame id and file suffix.
        """
        trim, file_name = image_path.split('/')[-2:]
        frame_id, suffix = file_name.split('.')
        return trim, frame_id, suffix

def _load_annotations(self):
"""Load data from annotations in COCO format."""
num_keypoints = self.metainfo['num_keypoints']
self._metainfo['CLASSES'] = self.ann_data.loadCats(
self.ann_data.getCatIds())

instance_list = []
image_list = []

for i, _ann_ids in enumerate(self.sequence_indices):
expected_num_frames = self.seq_len
if self.multiple_target:
expected_num_frames = self.multiple_target

            assert len(_ann_ids) == expected_num_frames, (
                f'Expected a sequence of {expected_num_frames} frames, but '
                f'got {len(_ann_ids)}')

anns = self.ann_data.loadAnns(_ann_ids)
img_ids = []
kpts = np.zeros((len(anns), num_keypoints, 2), dtype=np.float32)
kpts_3d = np.zeros((len(anns), num_keypoints, 3), dtype=np.float32)
keypoints_visible = np.zeros((len(anns), num_keypoints, 1),
dtype=np.float32)
for j, ann in enumerate(anns):
img_ids.append(ann['image_id'])
kpts[j] = np.array(ann['keypoints'], dtype=np.float32)
kpts_3d[j] = np.array(ann['keypoints_3d'], dtype=np.float32)
keypoints_visible[j] = np.array(
ann['keypoints_valid'], dtype=np.float32)
imgs = self.ann_data.loadImgs(img_ids)
keypoints_visible = keypoints_visible.squeeze(-1)

scales = np.zeros(len(imgs), dtype=np.float32)
centers = np.zeros((len(imgs), 2), dtype=np.float32)
img_paths = np.array([img['file_name'] for img in imgs])
factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32)

target_idx = [-1] if self.causal else [int(self.seq_len // 2)]
if self.multiple_target:
target_idx = list(range(self.multiple_target))

cam_param = anns[-1]['camera_param']
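            # fall back to a nominal resolution when the camera parameters
            # do not record the actual frame size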
if 'w' not in cam_param or 'h' not in cam_param:
cam_param['w'] = 1000
cam_param['h'] = 1000

instance_info = {
'num_keypoints': num_keypoints,
'keypoints': kpts,
'keypoints_3d': kpts_3d,
'keypoints_visible': keypoints_visible,
'scale': scales,
'center': centers,
'id': i,
'category_id': 1,
'iscrowd': 0,
'img_paths': list(img_paths),
'img_ids': [img['id'] for img in imgs],
'lifting_target': kpts_3d[target_idx],
'lifting_target_visible': keypoints_visible[target_idx],
'target_img_paths': img_paths[target_idx],
'camera_param': cam_param,
'factor': factors,
'target_idx': target_idx,
}

instance_list.append(instance_info)

for img_id in self.ann_data.getImgIds():
img = self.ann_data.loadImgs(img_id)[0]
            img.update({
                'img_id': img_id,
                'img_path': osp.join(self.data_prefix['img'],
                                     img['file_name']),
            })
image_list.append(img)

return instance_list, image_list