From 5a6e7269498d26e817a1330babd6d9a1115c9200 Mon Sep 17 00:00:00 2001
From: Peng Lu <penglu2097@gmail.com>
Date: Fri, 18 Nov 2022 14:19:28 +0800
Subject: [PATCH] [Enhance] Support saving predictions in topdown demos (#1814)

* support saving predictions in topdown demos

* update docs
---
 demo/docs/2d_animal_demo.md         | 14 ++++++++-
 demo/docs/2d_face_demo.md           |  4 ++-
 demo/docs/2d_hand_demo.md           |  4 ++-
 demo/docs/2d_human_pose_demo.md     |  8 ++++--
 demo/docs/2d_wholebody_pose_demo.md |  8 ++++--
 demo/topdown_demo_with_mmdet.py     | 44 ++++++++++++++++++++++++-----
 demo/topdown_face_demo.py           | 43 ++++++++++++++++++++++++----
 demo/webcam_cfg/pose_estimation.py  |  6 ++--
 mmpose/structures/__init__.py       |  4 +--
 mmpose/structures/utils.py          | 19 +++++++++++++
 10 files changed, 127 insertions(+), 27 deletions(-)

diff --git a/demo/docs/2d_animal_demo.md b/demo/docs/2d_animal_demo.md
index 38ee3078ea..38997acd5d 100644
--- a/demo/docs/2d_animal_demo.md
+++ b/demo/docs/2d_animal_demo.md
@@ -9,7 +9,7 @@ python demo/topdown_demo_with_mmdet.py \
     ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \
     ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \
     --input ${INPUT_PATH} --det-cat-id ${DET_CAT_ID} \
-    [--show] [--output-root ${OUTPUT_DIR}]  \
+    [--show] [--output-root ${OUTPUT_DIR}] [--save-predictions] \
     [--draw-heatmap ${DRAW_HEATMAP}] [--radius ${KPT_RADIUS}] \
     [--kpt-thr ${KPT_SCORE_THR}] [--bbox-thr ${BBOX_SCORE_THR}] \
     [--device ${GPU_ID or CPU}]
@@ -53,6 +53,18 @@ python demo/topdown_demo_with_mmdet.py \
     --output-root vis_results --draw-heatmap --det-cat-id=15
 ```
 
+To save predicted results on disk:
+
+```shell
+python demo/topdown_demo_with_mmdet.py \
+    demo/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py \
+    https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+    configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py \
+    https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth \
+    --input tests/data/animalpose/ca110.jpeg \
+    --output-root vis_results --save-predictions --draw-heatmap --det-cat-id=15
+```
+
 To run demos on CPU:
 
 ```shell
diff --git a/demo/docs/2d_face_demo.md b/demo/docs/2d_face_demo.md
index d72d33307b..13ff380b69 100644
--- a/demo/docs/2d_face_demo.md
+++ b/demo/docs/2d_face_demo.md
@@ -14,7 +14,7 @@ For more details, please refer to [face_recognition](https://github.com/ageitgey
 python demo/topdown_face_demo.py \
     ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \
     --input ${INPUT_PATH} [--output-root ${OUTPUT_DIR}] \
-    [--show] [--device ${GPU_ID or CPU}] \
+    [--show] [--device ${GPU_ID or CPU}] [--save-predictions] \
     [--draw-heatmap ${DRAW_HEATMAP}] [--radius ${KPT_RADIUS}] \
     [--kpt-thr ${KPT_SCORE_THR}]
 ```
@@ -46,6 +46,8 @@ python demo/topdown_face_demo.py \
     --draw-heatmap --output-root vis_results
 ```
 
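+To save the predicted results on disk, please specify `--save-predictions` together with `--output-root ${OUTPUT_DIR}`; the results are written to `${OUTPUT_DIR}/results_<input file name without extension>.json`.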
+
 To run demos on CPU:
 
 ```shell
diff --git a/demo/docs/2d_hand_demo.md b/demo/docs/2d_hand_demo.md
index fe1ba18692..d120108c54 100644
--- a/demo/docs/2d_hand_demo.md
+++ b/demo/docs/2d_hand_demo.md
@@ -11,7 +11,7 @@ python demo/topdown_demo_with_mmdet.py \
     ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \
     ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \
     --input ${INPUT_PATH} [--output-root ${OUTPUT_DIR}] \
-    [--show] [--device ${GPU_ID or CPU}] \
+    [--show] [--device ${GPU_ID or CPU}] [--save-predictions] \
     [--draw-heatmap ${DRAW_HEATMAP}] [--radius ${KPT_RADIUS}] \
     [--kpt-thr ${KPT_SCORE_THR}] [--bbox-thr ${BBOX_SCORE_THR}]
 
@@ -48,6 +48,8 @@ python demo/topdown_demo_with_mmdet.py \
     --output-root vis_results --show --draw-heatmap
 ```
 
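+To save the predicted results on disk, please specify `--save-predictions` together with `--output-root ${OUTPUT_DIR}`; the results are written to `${OUTPUT_DIR}/results_<input file name without extension>.json`.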
+
 To run demos on CPU:
 
 ```shell
diff --git a/demo/docs/2d_human_pose_demo.md b/demo/docs/2d_human_pose_demo.md
index 60f3c80bce..e57290f2ec 100644
--- a/demo/docs/2d_human_pose_demo.md
+++ b/demo/docs/2d_human_pose_demo.md
@@ -57,9 +57,9 @@ python demo/topdown_demo_with_mmdet.py \
     ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \
     ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \
     --input ${INPUT_PATH} \
-    --output-root ${OUTPUT_DIR} \
-    [--show --draw-heatmap --device ${GPU_ID or CPU}] \
-    [--bbox-thr ${BBOX_SCORE_THR} --kpt-thr ${KPT_SCORE_THR}]
+    [--output-root ${OUTPUT_DIR}] [--save-predictions] \
+    [--show] [--draw-heatmap] [--device ${GPU_ID or CPU}] \
+    [--bbox-thr ${BBOX_SCORE_THR}] [--kpt-thr ${KPT_SCORE_THR}]
 ```
 
 Example:
@@ -78,6 +78,8 @@ Visualization result:
 
 <img src="https://user-images.githubusercontent.com/87690686/187824368-1f1631c3-52bf-4b45-bf9a-a70cd6551e1a.jpg" height="500px" alt><br>
 
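+To save the predicted results on disk, please specify `--save-predictions` together with `--output-root ${OUTPUT_DIR}`; the results are written to `${OUTPUT_DIR}/results_<input file name without extension>.json`.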
+
 ### 2D Human Pose Top-Down Video Demo
 
 The above demo script can also take video as input, and run mmdet for human detection, and mmpose for pose estimation. The difference is, the `${INPUT_PATH}` for videos can be the local path or **URL** link to video file.
diff --git a/demo/docs/2d_wholebody_pose_demo.md b/demo/docs/2d_wholebody_pose_demo.md
index 8551388172..1615778429 100644
--- a/demo/docs/2d_wholebody_pose_demo.md
+++ b/demo/docs/2d_wholebody_pose_demo.md
@@ -47,9 +47,9 @@ python demo/topdown_demo_with_mmdet.py \
     ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \
     ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \
     --input ${INPUT_PATH} \
-    --output-root ${OUTPUT_DIR} \
-    [--show --draw-heatmap --device ${GPU_ID or CPU}] \
-    [--bbox-thr ${BBOX_SCORE_THR} --kpt-thr ${KPT_SCORE_THR}]
+    [--output-root ${OUTPUT_DIR}] [--save-predictions] \
+    [--show] [--draw-heatmap] [--device ${GPU_ID or CPU}] \
+    [--bbox-thr ${BBOX_SCORE_THR}] [--kpt-thr ${KPT_SCORE_THR}]
 ```
 
 Examples:
@@ -64,6 +64,8 @@ python demo/topdown_demo_with_mmdet.py \
     --output-root vis_results/ --show
 ```
 
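+To save the predicted results on disk, please specify `--save-predictions` together with `--output-root ${OUTPUT_DIR}`; the results are written to `${OUTPUT_DIR}/results_<input file name without extension>.json`.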
+
 ### 2D Human Whole-Body Pose Top-Down Video Demo
 
 The above demo script can also take video as input, and run mmdet for human detection, and mmpose for pose estimation.
diff --git a/demo/topdown_demo_with_mmdet.py b/demo/topdown_demo_with_mmdet.py
index 2708ea2358..601511e1b0 100644
--- a/demo/topdown_demo_with_mmdet.py
+++ b/demo/topdown_demo_with_mmdet.py
@@ -4,6 +4,7 @@
 import tempfile
 from argparse import ArgumentParser
 
+import json_tricks as json
 import mmcv
 import mmengine
 import numpy as np
@@ -12,7 +13,7 @@
 from mmpose.apis import init_model as init_pose_estimator
 from mmpose.evaluation.functional import nms
 from mmpose.registry import VISUALIZERS
-from mmpose.structures import merge_data_samples
+from mmpose.structures import merge_data_samples, split_instances
 from mmpose.utils import register_all_modules as register_mmpose_modules
 
 try:
@@ -23,9 +24,9 @@
     has_mmdet = False
 
 
-def visualize_img(args, img_path, detector, pose_estimator, visualizer,
-                  show_interval):
-    """Visualize predicted keypoints (and heatmaps) of one image."""
+def infer_and_visualize_image(args, img_path, detector, pose_estimator,
+                              visualizer, show_interval):
+    """Predict the keypoints of one image, and visualize the results."""
 
     # predict bbox
     register_mmdet_modules()
@@ -61,6 +62,8 @@ def visualize_img(args, img_path, detector, pose_estimator, visualizer,
         out_file=out_file,
         kpt_score_thr=args.kpt_thr)
 
+    return data_samples.pred_instances
+
 
 def main():
     """Visualize the demo images.
@@ -85,6 +88,11 @@ def main():
         default='',
         help='root of the output img file. '
         'Default not saving the visualization images.')
+    parser.add_argument(
+        '--save-predictions',
+        action='store_true',
+        default=False,
+        help='whether to save predicted results')
     parser.add_argument(
         '--device', default='cuda:0', help='Device used for inference')
     parser.add_argument(
@@ -132,6 +140,10 @@ def main():
     assert args.det_checkpoint is not None
     if args.output_root:
         mmengine.mkdir_or_exist(args.output_root)
+    if args.save_predictions:
+        assert args.output_root != ''
+        args.pred_save_path = f'{args.output_root}/results_' \
+            f'{os.path.splitext(os.path.basename(args.input))[0]}.json'
 
     # build detector
     register_mmdet_modules()
@@ -157,13 +169,18 @@ def main():
 
     input_type = mimetypes.guess_type(args.input)[0].split('/')[0]
     if input_type == 'image':
-        visualize_img(
+        pred_instances = infer_and_visualize_image(
             args,
             args.input,
             detector,
             pose_estimator,
             visualizer,
             show_interval=0)
+        if args.save_predictions:
+            with open(args.pred_save_path, 'w') as f:
+                json.dump(split_instances(pred_instances), f, indent='\t')
+            print(f'predictions have been saved at {args.pred_save_path}')
+
     elif input_type == 'video':
         tmp_folder = tempfile.TemporaryDirectory()
         video = mmcv.VideoReader(args.input)
@@ -171,8 +188,10 @@ def main():
         video.cvt2frames(tmp_folder.name, show_progress=False)
         output_root = args.output_root
         args.output_root = tmp_folder.name
-        for img_fname in os.listdir(tmp_folder.name):
-            visualize_img(
+        pred_instances_list = []
+
+        for frame_id, img_fname in enumerate(os.listdir(tmp_folder.name)):
+            pred_instances = infer_and_visualize_image(
                 args,
                 f'{tmp_folder.name}/{img_fname}',
                 detector,
@@ -180,6 +199,11 @@ def main():
                 visualizer,
                 show_interval=1)
             progressbar.update()
+            pred_instances_list.append(
+                dict(
+                    frame_id=frame_id,
+                    instances=split_instances(pred_instances)))
+
         if output_root:
             mmcv.frames2video(
                 tmp_folder.name,
@@ -188,6 +212,12 @@ def main():
                 fourcc='mp4v',
                 show_progress=False)
         tmp_folder.cleanup()
+
+        if args.save_predictions:
+            with open(args.pred_save_path, 'w') as f:
+                json.dump(pred_instances_list, f, indent='\t')
+            print(f'predictions have been saved at {args.pred_save_path}')
+
     else:
         raise ValueError(
             f'file {os.path.basename(args.input)} has invalid format.')
diff --git a/demo/topdown_face_demo.py b/demo/topdown_face_demo.py
index 1a170d191e..6eb9905f1e 100644
--- a/demo/topdown_face_demo.py
+++ b/demo/topdown_face_demo.py
@@ -4,6 +4,7 @@
 import tempfile
 from argparse import ArgumentParser
 
+import json_tricks as json
 import mmcv
 import mmengine
 import numpy as np
@@ -12,7 +13,7 @@
 from mmpose.apis import init_model as init_pose_estimator
 from mmpose.evaluation.functional import nms
 from mmpose.registry import VISUALIZERS
-from mmpose.structures import merge_data_samples
+from mmpose.structures import merge_data_samples, split_instances
 from mmpose.utils import register_all_modules as register_mmpose_modules
 
 try:
@@ -38,8 +39,9 @@ def process_face_det_results(face_det_results):
     return person_results
 
 
-def visualize_img(args, img_path, pose_estimator, visualizer, show_interval):
-    """Visualize predicted keypoints (and heatmaps) of one image."""
+def infer_and_visualize_image(args, img_path, pose_estimator, visualizer,
+                              show_interval):
+    """Predict the keypoints of one image, and visualize the results."""
 
     # predict bbox
     image = face_recognition.load_image_file(img_path)
@@ -73,6 +75,8 @@ def visualize_img(args, img_path, pose_estimator, visualizer, show_interval):
         out_file=out_file,
         kpt_score_thr=args.kpt_thr)
 
+    return data_samples.pred_instances
+
 
 def main():
     """Visualize the demo images.
@@ -95,6 +99,11 @@ def main():
         default='',
         help='root of the output img file. '
         'Default not saving the visualization images.')
+    parser.add_argument(
+        '--save-predictions',
+        action='store_true',
+        default=False,
+        help='whether to save predicted results')
     parser.add_argument(
         '--device', default='cuda:0', help='Device used for inference')
     parser.add_argument(
@@ -132,6 +141,10 @@ def main():
     assert args.input != ''
     if args.output_root:
         mmengine.mkdir_or_exist(args.output_root)
+    if args.save_predictions:
+        assert args.output_root != ''
+        args.pred_save_path = f'{args.output_root}/results_' \
+            f'{os.path.splitext(os.path.basename(args.input))[0]}.json'
 
     # build pose estimator
     register_mmpose_modules()
@@ -153,8 +166,13 @@ def main():
 
     input_type = mimetypes.guess_type(args.input)[0].split('/')[0]
     if input_type == 'image':
-        visualize_img(
+        pred_instances = infer_and_visualize_image(
             args, args.input, pose_estimator, visualizer, show_interval=0)
+        if args.save_predictions:
+            with open(args.pred_save_path, 'w') as f:
+                json.dump(split_instances(pred_instances), f, indent='\t')
+            print(f'predictions have been saved at {args.pred_save_path}')
+
     elif input_type == 'video':
         tmp_folder = tempfile.TemporaryDirectory()
         video = mmcv.VideoReader(args.input)
@@ -162,14 +180,21 @@ def main():
         video.cvt2frames(tmp_folder.name, show_progress=False)
         output_root = args.output_root
         args.output_root = tmp_folder.name
-        for img_fname in os.listdir(tmp_folder.name):
-            visualize_img(
+        pred_instances_list = []
+
+        for frame_id, img_fname in enumerate(os.listdir(tmp_folder.name)):
+            pred_instances = infer_and_visualize_image(
                 args,
                 f'{tmp_folder.name}/{img_fname}',
                 pose_estimator,
                 visualizer,
                 show_interval=1)
             progressbar.update()
+            pred_instances_list.append(
+                dict(
+                    frame_id=frame_id,
+                    instances=split_instances(pred_instances)))
+
         if output_root:
             mmcv.frames2video(
                 tmp_folder.name,
@@ -178,6 +203,12 @@ def main():
                 fourcc='mp4v',
                 show_progress=False)
         tmp_folder.cleanup()
+
+        if args.save_predictions:
+            with open(args.pred_save_path, 'w') as f:
+                json.dump(pred_instances_list, f, indent='\t')
+            print(f'predictions have been saved at {args.pred_save_path}')
+
     else:
         raise ValueError(
             f'file {os.path.basename(args.input)} has invalid format.')
diff --git a/demo/webcam_cfg/pose_estimation.py b/demo/webcam_cfg/pose_estimation.py
index 6f34ce64c6..2246a00779 100644
--- a/demo/webcam_cfg/pose_estimation.py
+++ b/demo/webcam_cfg/pose_estimation.py
@@ -85,9 +85,9 @@
             enable=False,
             input_buffer='vis',
             output_buffer='vis_sunglasses'),
-        # # 'BigeyeEffectNode':
-        # # This node draw the big-eye effetc in the frame image.
-        # # Pose results is needed.
+        # 'BigeyeEffectNode':
+        # This node draws the big-eye effect in the frame image.
+        # Pose results are needed.
         dict(
             type='BigeyeEffectNode',
             name='big-eye',
diff --git a/mmpose/structures/__init__.py b/mmpose/structures/__init__.py
index 28718c0b99..e4384af1cd 100644
--- a/mmpose/structures/__init__.py
+++ b/mmpose/structures/__init__.py
@@ -5,11 +5,11 @@
 from .keypoint import flip_keypoints
 from .multilevel_pixel_data import MultilevelPixelData
 from .pose_data_sample import PoseDataSample
-from .utils import merge_data_samples, revert_heatmap
+from .utils import merge_data_samples, revert_heatmap, split_instances
 
 __all__ = [
     'PoseDataSample', 'MultilevelPixelData', 'bbox_cs2xywh', 'bbox_cs2xyxy',
     'bbox_xywh2cs', 'bbox_xywh2xyxy', 'bbox_xyxy2cs', 'bbox_xyxy2xywh',
     'flip_bbox', 'get_udp_warp_matrix', 'get_warp_matrix', 'flip_keypoints',
-    'merge_data_samples', 'revert_heatmap'
+    'merge_data_samples', 'revert_heatmap', 'split_instances'
 ]
diff --git a/mmpose/structures/utils.py b/mmpose/structures/utils.py
index f75e3665bf..00399dd1f3 100644
--- a/mmpose/structures/utils.py
+++ b/mmpose/structures/utils.py
@@ -113,3 +113,22 @@ def revert_heatmap(heatmap, bbox_center, bbox_scale, img_shape):
         heatmap = heatmap.transpose(2, 0, 1)
 
     return heatmap
+
+
+def split_instances(instances: InstanceData):
+    """Convert instances into a list of per-instance dicts.
+
+    Each dict holds ``keypoints``/``keypoint_scores`` plus, if available,
+    ``bbox``/``bbox_score``, every value converted with ``tolist()``."""
+    results = []
+    for i in range(len(instances.keypoints)):
+        result = dict(
+            keypoints=instances.keypoints[i].tolist(),
+            keypoint_scores=instances.keypoint_scores[i].tolist())
+        if 'bboxes' in instances:
+            # no trailing comma here: it would wrap the box in a 1-tuple
+            result['bbox'] = instances.bboxes[i].tolist()
+            if 'bbox_scores' in instances:
+                result['bbox_score'] = instances.bbox_scores[i].tolist()
+        results.append(result)
+    return results