diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_w32_aic_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_w32_aic_512x512.py index 18232da1a6..d5cd34f8ee 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_w32_aic_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_w32_aic_512x512.py @@ -102,9 +102,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -112,6 +110,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_w32_aic_512x512_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_w32_aic_512x512_udp.py index b4ab95a9f7..cbe431a9a5 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_w32_aic_512x512_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_w32_aic_512x512_udp.py @@ -102,9 +102,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/hrnet_w32_aic_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/hrnet_w32_aic_512x512.py index b47d60d2d5..5178f2e634 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/hrnet_w32_aic_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/hrnet_w32_aic_512x512.py @@ -98,9 +98,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -108,6 +106,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_512x512.py index c90d963323..96428ceb45 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_512x512.py @@ -103,9 +103,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -113,6 +111,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_512x512_udp.py 
b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_512x512_udp.py index fce8a984e4..eb4a405552 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_512x512_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_512x512_udp.py @@ -103,9 +103,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -113,6 +111,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_640x640.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_640x640.py index be70c08c8f..9b1d291bdd 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_640x640.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_640x640.py @@ -103,9 +103,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -113,6 +111,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_640x640_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_640x640_udp.py index a188c49658..126bde9550 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_640x640_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w32_coco_640x640_udp.py @@ -103,9 +103,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -113,6 +111,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w48_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w48_coco_512x512.py index 9c2556dcea..6f75ab50a4 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w48_coco_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w48_coco_512x512.py @@ -103,9 +103,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -113,6 +111,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, 
tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w48_coco_512x512_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w48_coco_512x512_udp.py index 21e4e9e676..a0a6ffb3df 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w48_coco_512x512_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_w48_coco_512x512_udp.py @@ -103,9 +103,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -113,6 +111,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_512x512.py index af264c5d80..e6e090d391 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_512x512.py @@ -99,9 +99,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -109,6 +107,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_512x512_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_512x512_udp.py index fcce049a87..a5f70806c7 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_512x512_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_512x512_udp.py @@ -99,9 +99,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -109,6 +107,7 @@ with_heatmaps=[True], with_ae=[True], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_640x640.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_640x640.py index 3932853586..f3daa0411d 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_640x640.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_640x640.py @@ -99,9 +99,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -109,6 +107,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git 
a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_640x640_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_640x640_udp.py index d7bd592ddf..5a320c2f34 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_640x640_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w32_coco_640x640_udp.py @@ -99,9 +99,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -109,6 +107,7 @@ with_heatmaps=[True], with_ae=[True], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_512x512.py index 2f1d9c2b4c..1ccdf89e78 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_512x512.py @@ -99,9 +99,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -109,6 +107,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_512x512_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_512x512_udp.py index 8be8badf1a..f13132c114 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_512x512_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_512x512_udp.py @@ -99,9 +99,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -109,6 +107,7 @@ with_heatmaps=[True], with_ae=[True], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_640x640.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_640x640.py index 3696316448..4fe7b44f05 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_640x640.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_640x640.py @@ -99,9 +99,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -109,6 +107,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_640x640_udp.py 
b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_640x640_udp.py index c1dc7849f9..5d24f6f969 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_640x640_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_w48_coco_640x640_udp.py @@ -99,9 +99,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -109,6 +107,7 @@ with_heatmaps=[True], with_ae=[True], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/mobilenetv2_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/mobilenetv2_coco_512x512.py index 3f5dc99686..0c14892d25 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/mobilenetv2_coco_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/mobilenetv2_coco_512x512.py @@ -68,9 +68,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -78,6 +76,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res101_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res101_coco_512x512.py index 025daa8266..4d4a34007e 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res101_coco_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res101_coco_512x512.py @@ -68,9 +68,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -78,6 +76,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res101_coco_640x640.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res101_coco_640x640.py index 8d0ae90a12..3512603970 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res101_coco_640x640.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res101_coco_640x640.py @@ -68,9 +68,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -78,6 +76,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res152_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res152_coco_512x512.py index 4cb0aeb580..e2f8ab7f2d 100644 --- 
a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res152_coco_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res152_coco_512x512.py @@ -68,9 +68,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -78,6 +76,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res152_coco_640x640.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res152_coco_640x640.py index d75f69e511..b2d43d4e9e 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res152_coco_640x640.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res152_coco_640x640.py @@ -68,9 +68,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -78,6 +76,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res50_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res50_coco_512x512.py index 058cac49ef..a510c5aaa4 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res50_coco_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res50_coco_512x512.py @@ -69,9 +69,7 @@ with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0], )), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -79,6 +77,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res50_coco_640x640.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res50_coco_640x640.py index 68e1716705..100635bd07 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res50_coco_640x640.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/res50_coco_640x640.py @@ -68,9 +68,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -78,6 +76,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_512x512.py index 4746ae594f..5bffcbc8fd 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_512x512.py +++ 
b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_512x512.py @@ -102,9 +102,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -112,6 +110,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_512x512_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_512x512_udp.py index 7538300d2c..9c54b293a1 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_512x512_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_512x512_udp.py @@ -102,9 +102,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -112,6 +110,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_640x640.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_640x640.py index ece8ccbb9b..478d739270 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_640x640.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_640x640.py @@ -102,9 +102,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -112,6 +110,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_640x640_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_640x640_udp.py index 24c33b3d60..eef885cb39 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_640x640_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w32_crowdpose_640x640_udp.py @@ -102,9 +102,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -112,6 +110,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git 
a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w48_crowdpose_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w48_crowdpose_512x512.py index 8addf87d6c..a45f73129e 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w48_crowdpose_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w48_crowdpose_512x512.py @@ -102,9 +102,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -112,6 +110,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w48_crowdpose_512x512_udp.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w48_crowdpose_512x512_udp.py index 0ee83bba23..d31c636d60 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w48_crowdpose_512x512_udp.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_w48_crowdpose_512x512_udp.py @@ -102,9 +102,7 @@ pull_loss_factor=[0.001, 0.001], with_heatmaps_loss=[True, True], heatmaps_loss_factor=[1.0, 1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -112,6 +110,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=False, + align_corners=True, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/mobilenetv2_crowdpose_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/mobilenetv2_crowdpose_512x512.py index 2f035e99e6..b4037a6871 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/mobilenetv2_crowdpose_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/mobilenetv2_crowdpose_512x512.py @@ -67,9 +67,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -77,6 +75,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res101_crowdpose_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res101_crowdpose_512x512.py index 8fb3959670..65c32641fa 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res101_crowdpose_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res101_crowdpose_512x512.py @@ -67,9 +67,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ 
-77,6 +75,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res152_crowdpose_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res152_crowdpose_512x512.py index 45d740e699..0680370f20 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res152_crowdpose_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res152_crowdpose_512x512.py @@ -67,9 +67,7 @@ pull_loss_factor=[0.001], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -77,6 +75,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res50_crowdpose_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res50_crowdpose_512x512.py index aa161e3fe2..bece913c60 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res50_crowdpose_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/res50_crowdpose_512x512.py @@ -68,9 +68,7 @@ with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0], )), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -78,6 +76,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/mhp/hrnet_w48_mhp_512x512.py b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/mhp/hrnet_w48_mhp_512x512.py index 27ce967f17..5296a3136f 100644 --- a/configs/body/2d_kpt_sview_rgb_img/associative_embedding/mhp/hrnet_w48_mhp_512x512.py +++ b/configs/body/2d_kpt_sview_rgb_img/associative_embedding/mhp/hrnet_w48_mhp_512x512.py @@ -97,9 +97,7 @@ pull_loss_factor=[0.01], with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])), - train_cfg=dict( - num_joints=channel_cfg['dataset_joints'], - img_size=data_cfg['image_size']), + train_cfg=dict(), test_cfg=dict( num_joints=channel_cfg['dataset_joints'], max_num_people=30, @@ -107,6 +105,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/hrnet_w32_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/hrnet_w32_coco_512x512.py new file mode 100644 index 0000000000..24d6b7d1b8 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/hrnet_w32_coco_512x512.py @@ -0,0 +1,213 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs 
= 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + with_bg=True, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + ), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + in_channels=32, + out_channels=17, + num_deconv_layers=0, + extra=dict( + final_conv_kernel=1, + num_conv_layers=1, + num_conv_kernels=(1, )), + loss_keypoint=dict(type='MaskedMSELoss', )), + ], + paf_heads_cfg=[ + dict( + type='DeconvHead', + in_channels=32, + out_channels=38, + num_deconv_layers=0, + extra=dict( + final_conv_kernel=1, + num_conv_layers=1, + num_conv_kernels=(1, )), + loss_keypoint=dict(type='MaskedMSELoss', )), + ], + heatmap_index=[0], + paf_index=[0], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0, 1], + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + 
+test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368.py new file mode 100644 index 0000000000..c391cb7322 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368.py @@ -0,0 +1,179 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=False) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/mobilenetv1_sgd_68.848.pth', + backbone=dict( + type='LightweightOpenPoseNetwork', + in_channels=3, + out_channels_cm=17, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=2), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + heatmap_index=[0, 1], + paf_index=[2, 3], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + 
dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 2 + [1] * 2, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck.py new file mode 100644 index 0000000000..af29f3d682 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck.py @@ -0,0 +1,183 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True, + add_neck=True, +) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/mobilenetv1_sgd_68.848.pth', + backbone=dict( + type='LightweightOpenPoseNetwork', + in_channels=3, + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=2), 
+ keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + heatmap_index=[0, 1], + paf_index=[2, 3], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'], + add_neck=data_cfg['add_neck'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + add_neck=data_cfg['add_neck'], + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 2 + [1] * 2, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_bgr_meanstd.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_bgr_meanstd.py new file mode 100644 index 0000000000..32c2800b07 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_bgr_meanstd.py @@ -0,0 +1,177 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + 
type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True, + add_neck=True, +) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/mobilenetv1_sgd_68.848.pth', + backbone=dict( + type='LightweightOpenPoseNetwork', + in_channels=3, + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=2), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + heatmap_index=[0, 1], + paf_index=[2, 3], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'], + add_neck=data_cfg['add_neck'])) + +train_pipeline = [ + dict(type='LoadImageFromFile', channel_order='bgr'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict(type='NormalizeTensor', mean=[0.5, 0.5, 0.5], std=[1., 1., 1.]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + add_neck=data_cfg['add_neck'], + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 2 + [1] * 2, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile', channel_order='bgr'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', mean=[0.5, 0.5, 0.5], std=[1., 1., + 1.]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + 
pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_bgr_meanstd_lr015.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_bgr_meanstd_lr015.py new file mode 100644 index 0000000000..eb4c160478 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_bgr_meanstd_lr015.py @@ -0,0 +1,177 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True, + add_neck=True, +) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/mobilenetv1_sgd_68.848.pth', + backbone=dict( + type='LightweightOpenPoseNetwork', + in_channels=3, + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=2), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + heatmap_index=[0, 1], + paf_index=[2, 3], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'], + add_neck=data_cfg['add_neck'])) + +train_pipeline = [ + dict(type='LoadImageFromFile', channel_order='bgr'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict(type='NormalizeTensor', mean=[0.5, 0.5, 0.5], std=[1., 1., 1.]), + dict( 
+ type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + add_neck=data_cfg['add_neck'], + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 2 + [1] * 2, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile', channel_order='bgr'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', mean=[0.5, 0.5, 0.5], std=[1., 1., + 1.]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_lr00015.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_lr00015.py new file mode 100644 index 0000000000..7e91e6f004 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_lr00015.py @@ -0,0 +1,182 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.00015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + gamma=0.333, + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True, + add_neck=True, +) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/mobilenetv1_sgd_68.848.pth', + backbone=dict( + type='LightweightOpenPoseNetwork', + in_channels=3, + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=2), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + 
num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + heatmap_index=[0, 1], + paf_index=[2, 3], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'], + add_neck=data_cfg['add_neck'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + add_neck=data_cfg['add_neck'], + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 2 + [1] * 2, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_lr015.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_lr015.py new file mode 100644 index 0000000000..00f11dd14e --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/lightweight_openpose_coco_368x368_withbg_addneck_lr015.py @@ -0,0 +1,181 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + 
policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True, + add_neck=True, +) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/mobilenetv1_sgd_68.848.pth', + backbone=dict( + type='LightweightOpenPoseNetwork', + in_channels=3, + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=2), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 2, + heatmap_index=[0, 1], + paf_index=[2, 3], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'], + add_neck=data_cfg['add_neck'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + add_neck=data_cfg['add_neck'], + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 2 + [1] * 2, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + 
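+        # validation and test both read the COCO val2017 keypoint annotations.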
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_sn_coco_368x368.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_sn_coco_368x368.py new file mode 100644 index 0000000000..3bf0c84b69 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_sn_coco_368x368.py @@ -0,0 +1,182 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=False) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='mmcls://vgg19_bn', + backbone=dict( + type='OpenPoseNetworkV2', + in_channels=3, + out_channels_cm=17, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=5, + stage_types=('PAF', 'PAF', 'PAF', 'PAF', 'CM'), + num_blocks=(8, 8, 8, 8, 5), + block_channels=(128, 128, 128, 256, 256)), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 1, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 4, + heatmap_index=[4], + paf_index=[0, 1, 2, 3], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + 
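+                    # Gaussian sigma of the keypoint targets, defined on the
+                    # 46x46 heatmap grid (data_cfg['heatmap_size']).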
with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] + [1] * 4, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v1_coco_368x368.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v1_coco_368x368.py new file mode 100644 index 0000000000..7e7156e8ca --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v1_coco_368x368.py @@ -0,0 +1,179 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=False) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='mmcls://vgg19_bn', + backbone=dict( + type='OpenPoseNetworkV1', + in_channels=3, + out_channels_cm=17, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=6), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 6, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 6, + heatmap_index=[0, 1, 2, 3, 4, 5], + 
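+        # assumed output layout of OpenPoseNetworkV1: 12 stage outputs, the first
+        # six being confidence maps and the last six PAFs, routed to the heads above.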
paf_index=[6, 7, 8, 9, 10, 11], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 6 + [1] * 6, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v1_coco_368x368_withbg.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v1_coco_368x368_withbg.py new file mode 100644 index 0000000000..5d10a5b72b --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v1_coco_368x368_withbg.py @@ -0,0 +1,180 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='mmcls://vgg19_bn', + backbone=dict( + type='OpenPoseNetworkV1', + in_channels=3, + # additional channel for bg + out_channels_cm=17 + 1, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=6), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 6, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 6, + heatmap_index=[0, 1, 2, 3, 4, 5], + paf_index=[6, 7, 8, 9, 10, 11], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 6 + [1] * 6, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368.py 
b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368.py new file mode 100644 index 0000000000..fcda6a04f5 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368.py @@ -0,0 +1,182 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=False) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='mmcls://vgg19_bn', + backbone=dict( + type='OpenPoseNetworkV2', + in_channels=3, + out_channels_cm=17, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=6, + stage_types=('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM'), + num_blocks=5, + block_channels=96), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 1, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 5, + heatmap_index=[5], + paf_index=[0, 1, 2, 3, 4], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] + [1] * 5, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + 
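+                    # standard ImageNet statistics, matching the
+                    # ImageNet-pretrained VGG backbone.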
std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg.py new file mode 100644 index 0000000000..0b1020d6bd --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg.py @@ -0,0 +1,183 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='mmcls://vgg19_bn', + backbone=dict( + type='OpenPoseNetworkV2', + in_channels=3, + # additional channel for bg + out_channels_cm=17 + 1, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=6, + stage_types=('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM'), + num_blocks=5, + block_channels=96), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 1, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 5, + heatmap_index=[5], + paf_index=[0, 1, 2, 3, 4], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + 
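+        # decoding options shared with the associative-embedding configs;
+        # detection_threshold is assumed to filter low-confidence heatmap peaks
+        # before grouping.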
ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] + [1] * 5, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg_addneck.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg_addneck.py new file mode 100644 index 0000000000..aa3f470da0 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg_addneck.py @@ -0,0 +1,187 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True, + add_neck=True +) + +# model 
settings +model = dict( + type='PartAffinityField', + pretrained='mmcls://vgg19_bn', + backbone=dict( + type='OpenPoseNetworkV2', + in_channels=3, + # additional channel for bg + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=6, + stage_types=('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM'), + num_blocks=5, + block_channels=96), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 1, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 5, + heatmap_index=[5], + paf_index=[0, 1, 2, 3, 4], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'], + add_neck=data_cfg['add_neck'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + add_neck=data_cfg['add_neck'], + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] + [1] * 5, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg_sgd.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg_sgd.py new file mode 100644 index 0000000000..a33e52098d --- /dev/null +++ 
b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/openpose_v2_coco_368x368_withbg_sgd.py @@ -0,0 +1,183 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='SGD', + lr=0.01, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=368, + base_size=256, + base_sigma=2, + heatmap_size=[46], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='mmcls://vgg19_bn', + backbone=dict( + type='OpenPoseNetworkV2', + in_channels=3, + # additional channel for bg + out_channels_cm=17 + 1, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=6, + stage_types=('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM'), + num_blocks=5, + block_channels=96), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 1, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 5, + heatmap_index=[5], + paf_index=[0, 1, 2, 3, 4], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] + [1] * 5, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 
'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/res50_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/res50_coco_512x512.py new file mode 100644 index 0000000000..239901be51 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/part_affinity_field/coco/res50_coco_512x512.py @@ -0,0 +1,181 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='torchvision://resnet50', + backbone=dict(type='ResNet', depth=50), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + in_channels=2048, + out_channels=17, + extra=dict( + final_conv_kernel=1, + num_conv_layers=1, + num_conv_kernels=(1, )), + loss_keypoint=dict(type='MaskedMSELoss', )), + ], + paf_heads_cfg=[ + dict( + type='DeconvHead', + in_channels=2048, + out_channels=38, + extra=dict( + final_conv_kernel=1, + num_conv_layers=1, + num_conv_kernels=(1, )), + loss_keypoint=dict(type='MaskedMSELoss', )), + ], + heatmap_index=[0], + paf_index=[0], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + 
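+        # random rotation within +/-30 degrees and rescaling of the short side by
+        # 0.75-1.5x; trans_factor controls translation jitter (assumed to be in
+        # pixels).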
trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0, 1], + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w32_coco_wholebody_512x512.py b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w32_coco_wholebody_512x512.py index db89340076..c4060f7c4d 100644 --- a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w32_coco_wholebody_512x512.py +++ b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w32_coco_wholebody_512x512.py @@ -113,6 +113,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w32_coco_wholebody_640x640.py b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w32_coco_wholebody_640x640.py index 4483e6eb81..09c1562b5a 100644 --- a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w32_coco_wholebody_640x640.py +++ b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w32_coco_wholebody_640x640.py @@ -113,6 +113,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w48_coco_wholebody_512x512.py b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w48_coco_wholebody_512x512.py index b05b0ad812..0ecbe2f070 100644 --- a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w48_coco_wholebody_512x512.py +++ 
b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w48_coco_wholebody_512x512.py @@ -113,6 +113,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w48_coco_wholebody_640x640.py b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w48_coco_wholebody_640x640.py index 38a6938ab3..b406b94b86 100644 --- a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w48_coco_wholebody_640x640.py +++ b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_w48_coco_wholebody_640x640.py @@ -113,6 +113,7 @@ with_heatmaps=[True, True], with_ae=[True, False], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w32_coco_wholebody_512x512.py b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w32_coco_wholebody_512x512.py index 06ff2edf81..2cee3f056a 100644 --- a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w32_coco_wholebody_512x512.py +++ b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w32_coco_wholebody_512x512.py @@ -109,6 +109,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w32_coco_wholebody_640x640.py b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w32_coco_wholebody_640x640.py index 595ecb0052..40df05bc05 100644 --- a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w32_coco_wholebody_640x640.py +++ b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w32_coco_wholebody_640x640.py @@ -109,6 +109,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w48_coco_wholebody_512x512.py b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w48_coco_wholebody_512x512.py index a9a567b48f..d62412a6fe 100644 --- a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w48_coco_wholebody_512x512.py +++ b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w48_coco_wholebody_512x512.py @@ -109,6 +109,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w48_coco_wholebody_640x640.py b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w48_coco_wholebody_640x640.py index c320d625b2..2b0f7cfb92 100644 --- a/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w48_coco_wholebody_640x640.py +++ b/configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_w48_coco_wholebody_640x640.py @@ -109,6 +109,7 @@ with_heatmaps=[True], with_ae=[True], project2image=True, + 
align_corners=False, nms_kernel=5, nms_padding=2, tag_per_joint=True, diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/hrnet_w32_coco_wholebody_480x480.py b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/hrnet_w32_coco_wholebody_480x480.py new file mode 100644 index 0000000000..80d3768834 --- /dev/null +++ b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/hrnet_w32_coco_wholebody_480x480.py @@ -0,0 +1,210 @@ +log_level = 'INFO' +load_from = None +resume_from = 'work_dirs/hrnet_w32_coco_wholebody_480x480/epoch_150.pth' +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +channel_cfg = dict( + dataset_joints=133, + dataset_channel=[ + list(range(133)), + ], + inference_channel=list(range(133))) + +data_cfg = dict( + image_size=480, + base_size=256, + base_sigma=2, + heatmap_size=[120], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=False) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + ), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + in_channels=32, + out_channels=133, + num_deconv_layers=0, + extra=dict( + final_conv_kernel=1, + num_conv_layers=1, + num_conv_kernels=(1, )), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=True)), + ], + paf_heads_cfg=[ + dict( + type='DeconvHead', + in_channels=32, + out_channels=270, + num_deconv_layers=0, + extra=dict( + final_conv_kernel=1, + num_conv_layers=1, + num_conv_kernels=(1, )), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=True)), + ], + heatmap_index=[0], + paf_index=[0], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[0.5, 1, 1.5, 2, 2.5], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + 
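+    # same bottom-up augmentation recipe as the COCO body configs: random affine,
+    # horizontal flip, tensor conversion and ImageNet normalization.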
dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0, 1], + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[0.5, 1, 1.5, 2, 2.5]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/hrnet_w32_coco_wholebody_480x480_withbg.py b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/hrnet_w32_coco_wholebody_480x480_withbg.py new file mode 100644 index 0000000000..2aa4f9ed0e --- /dev/null +++ b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/hrnet_w32_coco_wholebody_480x480_withbg.py @@ -0,0 +1,210 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + gamma=0.333, + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +channel_cfg = dict( + dataset_joints=133, + dataset_channel=[ + list(range(133)), + ], + inference_channel=list(range(133))) + +data_cfg = dict( + image_size=480, + base_size=256, + base_sigma=2, + heatmap_size=[120], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/hrnet_w32_coco_512x512/best.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + 
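+            # HRNet-W32: the high-resolution branch stays at 32 channels from
+            # stage2 onward, which is why the heads below use in_channels=32.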
num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + ), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + in_channels=32, + out_channels=134, + num_deconv_layers=0, + extra=dict( + final_conv_kernel=1, + num_conv_layers=1, + num_conv_kernels=(1, )), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=False)), + ], + paf_heads_cfg=[ + dict( + type='DeconvHead', + in_channels=32, + out_channels=270, + num_deconv_layers=0, + extra=dict( + final_conv_kernel=1, + num_conv_layers=1, + num_conv_kernels=(1, )), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=False)), + ], + heatmap_index=[0], + paf_index=[0], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[0.5, 1, 1.5, 2, 2.5], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0, 1], + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[0.5, 1, 1.5, 2, 2.5]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v1_coco_wholebody_480x480.py 
b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v1_coco_wholebody_480x480.py new file mode 100644 index 0000000000..cd34eb9f3a --- /dev/null +++ b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v1_coco_wholebody_480x480.py @@ -0,0 +1,177 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=133, + dataset_channel=[ + list(range(133)), + ], + inference_channel=list(range(133))) + +data_cfg = dict( + image_size=480, + base_size=256, + base_sigma=2, + heatmap_size=[60], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=False) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='mmcls://vgg16_bn', + backbone=dict( + type='OpenPoseNetworkV1', + in_channels=3, + out_channels_cm=133, + out_channels_paf=270, + stem_feat_channels=128, + num_stages=6), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 6, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict(type='MaskedMSELoss', )), + ] * 6, + heatmap_index=[0, 1, 2, 3, 4, 5], + paf_index=[6, 7, 8, 9, 10, 11], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[0.5, 1, 1.5, 2, 2.5], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 6 + [1] * 6, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[0.5, 1, 1.5, 2, 2.5]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + 
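+        # meta_keys below carry the multi-scale sizes and flip_index that the
+        # bottom-up decoder needs at test time (five test scales, flip_test=True
+        # in test_cfg).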
keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v1_coco_wholebody_480x480_withbg.py b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v1_coco_wholebody_480x480_withbg.py new file mode 100644 index 0000000000..2fd5fc4d28 --- /dev/null +++ b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v1_coco_wholebody_480x480_withbg.py @@ -0,0 +1,180 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.00015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + gamma=0.333, + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=133, + dataset_channel=[ + list(range(133)), + ], + inference_channel=list(range(133))) + +data_cfg = dict( + image_size=480, + base_size=256, + base_sigma=2, + heatmap_size=[60], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/openpose_v1_coco_368x368/best.pth', + backbone=dict( + type='OpenPoseNetworkV1', + in_channels=3, + out_channels_cm=134, + out_channels_paf=270, + stem_feat_channels=128, + num_stages=6), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=False)), + ] * 6, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=False)), + ] * 6, + heatmap_index=[0, 1, 2, 3, 4, 5], + paf_index=[6, 7, 8, 9, 10, 11], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[0.5, 1, 1.5, 2, 2.5], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, 
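# Rough sanity check of the `_withbg' variant above: with data_cfg['with_bg']=True the
# heatmap target presumably gains one background map, which is why out_channels_cm is
# 134 here instead of 133 (the v2 withbg config below notes the same extra bg channel).
num_joints = 133
with_bg = True
out_channels_cm = num_joints + (1 if with_bg else 0)
assert out_channels_cm == 134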
+ flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] * 6 + [1] * 6, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[0.5, 1, 1.5, 2, 2.5]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v2_coco_wholebody_480x480.py b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v2_coco_wholebody_480x480.py new file mode 100644 index 0000000000..f283c2fbfc --- /dev/null +++ b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v2_coco_wholebody_480x480.py @@ -0,0 +1,181 @@ +log_level = 'INFO' +load_from = None +resume_from = 'work_dirs/openpose_v2_coco_wholebody_480x480/epoch_150.pth' +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +channel_cfg = dict( + dataset_joints=133, + dataset_channel=[ + list(range(133)), + ], + inference_channel=list(range(133))) + +data_cfg = dict( + image_size=480, + base_size=256, + base_sigma=2, + heatmap_size=[60], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=False) + +# model settings +model = dict( + 
type='PartAffinityField', + pretrained='mmcls://vgg16_bn', + backbone=dict( + type='OpenPoseNetworkV2', + in_channels=3, + out_channels_cm=133, + out_channels_paf=270, + stem_feat_channels=128, + num_stages=6, + stage_types=('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM'), + num_blocks=5, + block_channels=96), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=True)), + ] * 1, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=True)), + ] * 5, + heatmap_index=[5], + paf_index=[0, 1, 2, 3, 4], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[0.5, 1, 1.5, 2, 2.5], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] + [1] * 5, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[0.5, 1, 1.5, 2, 2.5], + max_input_size=1800), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=16, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v2_coco_wholebody_480x480_withbg.py b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v2_coco_wholebody_480x480_withbg.py new file mode 100644 index 0000000000..1ea565cb54 --- /dev/null +++ 
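# In the OpenPose v2 config above, the backbone runs five PAF refinement stages followed
# by a single confidence-map stage, so backbone outputs 0-4 are PAFs and output 5 is the
# heatmap stage; heatmap_index/paf_index and pipeline_indices simply mirror that order.
stage_types = ('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM')
paf_index = [i for i, t in enumerate(stage_types) if t == 'PAF']
heatmap_index = [i for i, t in enumerate(stage_types) if t == 'CM']
assert paf_index == [0, 1, 2, 3, 4] and heatmap_index == [5]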
b/configs/wholebody/2d_kpt_sview_rgb_img/part_affinity_field/coco-wholebody/openpose_v2_coco_wholebody_480x480_withbg.py @@ -0,0 +1,183 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=50) +evaluation = dict(interval=50, metric='mAP', key_indicator='AP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + gamma=0.333, + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +channel_cfg = dict( + dataset_joints=133, + dataset_channel=[ + list(range(133)), + ], + inference_channel=list(range(133))) + +data_cfg = dict( + image_size=480, + base_size=256, + base_sigma=2, + heatmap_size=[60], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, + with_bg=True) + +# model settings +model = dict( + type='PartAffinityField', + pretrained='models/openpose_v2_coco_368x368/best.pth', + backbone=dict( + type='OpenPoseNetworkV2', + in_channels=3, + # additional channel for bg + out_channels_cm=134, + out_channels_paf=270, + stem_feat_channels=128, + num_stages=6, + stage_types=('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM'), + num_blocks=5, + block_channels=96), + keypoint_head=dict( + type='PAFHead', + heatmap_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=False)), + ] * 1, + paf_heads_cfg=[ + dict( + type='DeconvHead', + num_deconv_layers=0, + extra=dict(final_conv_kernel=0), + loss_keypoint=dict( + type='MaskedMSELoss', supervise_empty=False)), + ] * 5, + heatmap_index=[5], + paf_index=[0, 1, 2, 3, 4], + ), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[0.5, 1, 1.5, 2, 2.5], + with_heatmaps=[True], + with_pafs=[True], + project2image=True, + align_corners=False, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True, + with_bg=data_cfg['with_bg'])) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=2, + with_bg=data_cfg['with_bg']) + ], + [dict( + type='BottomUpGeneratePAFTarget', + limb_width=1, + )], + ], + pipeline_indices=[0] + [1] * 5, + keys=['targets', 'masks']), + dict(type='Collect', keys=['img', 'targets', 'masks'], meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[0.5, 1, 1.5, 2, 2.5], + max_input_size=1800), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + 
type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'skeleton' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=16, + workers_per_gpu=2, + train=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/mmpose/.mim/configs b/mmpose/.mim/configs new file mode 120000 index 0000000000..5992d109cb --- /dev/null +++ b/mmpose/.mim/configs @@ -0,0 +1 @@ +../../configs \ No newline at end of file diff --git a/mmpose/.mim/demo b/mmpose/.mim/demo new file mode 120000 index 0000000000..bf71256cd3 --- /dev/null +++ b/mmpose/.mim/demo @@ -0,0 +1 @@ +../../demo \ No newline at end of file diff --git a/mmpose/.mim/model-index.yml b/mmpose/.mim/model-index.yml new file mode 120000 index 0000000000..a18c0b389b --- /dev/null +++ b/mmpose/.mim/model-index.yml @@ -0,0 +1 @@ +../../model-index.yml \ No newline at end of file diff --git a/mmpose/.mim/tools b/mmpose/.mim/tools new file mode 120000 index 0000000000..31941e941d --- /dev/null +++ b/mmpose/.mim/tools @@ -0,0 +1 @@ +../../tools \ No newline at end of file diff --git a/mmpose/apis/inference.py b/mmpose/apis/inference.py index bcb6c73a61..0cd989c18e 100644 --- a/mmpose/apis/inference.py +++ b/mmpose/apis/inference.py @@ -460,18 +460,40 @@ def inference_bottom_up_pose_model(model, test_pipeline = Compose(test_pipeline) # prepare data - data = { - 'img_or_path': img_or_path, - 'dataset': 'coco', - 'ann_info': { - 'image_size': - cfg.data_cfg['image_size'], - 'num_joints': - cfg.data_cfg['num_joints'], - 'flip_index': - [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15], + if cfg.data_cfg.get('add_neck', False): + data = { + 'img_or_path': img_or_path, + 'dataset': 'coco', + 'ann_info': { + 'image_size': + cfg.data_cfg['image_size'], + 'num_joints': + cfg.data_cfg['num_joints'], + 'flip_index': + [0, 1, 5, 6, 7, 2, 3, 4, 11, 12, 13, 8, 9, 10, 15, 14, 17, 16], + 'skeleton': [[1, 8], [8, 9], [9, 10], [1, 11], [11, 12], + [12, 13], [1, 2], [2, 3], [3, 4], [2, 16], [1, 5], + [5, 6], [6, 7], [5, 17], [1, 0], [0, 14], [0, 15], + [14, 16], [15, 17]] + } + } + else: + data = { + 'img_or_path': img_or_path, + 'dataset': 'coco', + 'ann_info': { + 'image_size': + cfg.data_cfg['image_size'], + 'num_joints': + cfg.data_cfg['num_joints'], + 'flip_index': + [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15], + 'skeleton': [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [4, 6]] + } } - } data = test_pipeline(data) data = collate([data], samples_per_gpu=1) @@ -496,7 +518,10 @@ def inference_bottom_up_pose_model(model, returned_outputs.append(h.layer_outputs) + order_map = [0, 15, 14, 17, 16, 5, 2, 6, 3, 7, 4, 11, 8, 12, 9, 13, 10] for idx, pred in enumerate(result['preds']): + if cfg.data_cfg['add_neck']: + pred = pred[order_map] area = 
(np.max(pred[:, 0]) - np.min(pred[:, 0])) * ( np.max(pred[:, 1]) - np.min(pred[:, 1])) pose_results.append({ diff --git a/mmpose/core/evaluation/__init__.py b/mmpose/core/evaluation/__init__.py index f22953ab84..2c2984f313 100644 --- a/mmpose/core/evaluation/__init__.py +++ b/mmpose/core/evaluation/__init__.py @@ -1,5 +1,6 @@ -from .bottom_up_eval import (aggregate_results, get_group_preds, - get_multi_stage_outputs) +from .bottom_up_eval import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, flip_part_affinity_fields, + get_group_preds, split_ae_outputs) from .eval_hooks import DistEvalHook, EvalHook from .mesh_eval import compute_similarity_transform from .pose3d_eval import keypoint_mpjpe @@ -12,8 +13,8 @@ __all__ = [ 'EvalHook', 'DistEvalHook', 'pose_pck_accuracy', 'keypoints_from_heatmaps', 'keypoints_from_regression', 'keypoint_pck_accuracy', 'keypoint_auc', - 'keypoint_epe', 'get_group_preds', 'get_multi_stage_outputs', - 'aggregate_results', 'compute_similarity_transform', 'post_dark_udp', - 'keypoint_mpjpe', 'keypoints_from_heatmaps3d', - 'multilabel_classification_accuracy' + 'keypoint_epe', 'get_group_preds', 'split_ae_outputs', 'flip_feature_maps', + 'aggregate_stage_flip', 'aggregate_scale', 'compute_similarity_transform', + 'post_dark_udp', 'keypoint_mpjpe', 'keypoints_from_heatmaps3d', + 'multilabel_classification_accuracy', 'flip_part_affinity_fields' ] diff --git a/mmpose/core/evaluation/bottom_up_eval.py b/mmpose/core/evaluation/bottom_up_eval.py index 5d90027044..98b6daaace 100644 --- a/mmpose/core/evaluation/bottom_up_eval.py +++ b/mmpose/core/evaluation/bottom_up_eval.py @@ -5,130 +5,285 @@ warp_affine_joints) -def get_multi_stage_outputs(outputs, - outputs_flip, - num_joints, - with_heatmaps, - with_ae, - tag_per_joint=True, - flip_index=None, - project2image=True, - size_projected=None, - align_corners=False): - """Inference the model to get multi-stage outputs (heatmaps & tags), and - resize them to base sizes. +def split_ae_outputs(outputs, num_joints, with_heatmaps, with_ae): + """Split multi-stage outputs into heatmaps & tags. Args: outputs (list(torch.Tensor)): Outputs of network - outputs_flip (list(torch.Tensor)): Flip outputs of network num_joints (int): Number of joints with_heatmaps (list[bool]): Option to output heatmaps for different stages. with_ae (list[bool]): Option to output ae tags for different stages. - tag_per_joint (bool): Option to use one tag map per joint. - flip_index (list[int]): Keypoint flip index. - project2image (bool): Option to resize to base scale. - size_projected ([w, h]): Base size of heatmaps. - align_corners (bool): Align corners when performing interpolation. - Returns: tuple: A tuple containing multi-stage outputs. - - - outputs (list(torch.Tensor)): List of simple outputs and - flip outputs. - - heatmaps (torch.Tensor): Multi-stage heatmaps that are resized to - the base size. - - tags (torch.Tensor): Multi-stage tags that are resized to - the base size. + - heatmaps (list(torch.Tensor)): multi-stage heatmaps. + - tags (list(torch.Tensor)): multi-stage tags. 
""" - heatmaps_avg = 0 - num_heatmaps = 0 heatmaps = [] tags = [] - flip_test = outputs_flip is not None - # aggregate heatmaps from different stages for i, output in enumerate(outputs): - if i != len(outputs) - 1: - output = torch.nn.functional.interpolate( - output, - size=(outputs[-1].size(2), outputs[-1].size(3)), - mode='bilinear', - align_corners=align_corners) - # staring index of the associative embeddings offset_feat = num_joints if with_heatmaps[i] else 0 - if with_heatmaps[i]: - heatmaps_avg += output[:, :num_joints] - num_heatmaps += 1 - + heatmaps.append(output[:, :num_joints]) if with_ae[i]: tags.append(output[:, offset_feat:]) + return heatmaps, tags + + +def flip_feature_maps(feature_maps, flip_index=None): + """Flip the feature maps and swap the channels. + + Args: + feature_maps (list(torch.Tensor)): Feature maps. + flip_index (list(int) | None): Channel-flip indexes. If None, + do not flip channels. + Returns: + flipped_feature_maps (list(torch.Tensor)): Flipped feature_maps. + """ + flipped_feature_maps = [] + for feature_map in feature_maps: + feature_map = torch.flip(feature_map, [3]) + if flip_index is not None: + flipped_feature_maps.append(feature_map[:, flip_index, :, :]) + else: + flipped_feature_maps.append(feature_map) - if num_heatmaps > 0: - heatmaps.append(heatmaps_avg / num_heatmaps) + return flipped_feature_maps - if flip_test and flip_index: - # perform flip testing - heatmaps_avg = 0 - num_heatmaps = 0 - for i, output in enumerate(outputs_flip): - if i != len(outputs_flip) - 1: - output = torch.nn.functional.interpolate( - output, - size=(outputs_flip[-1].size(2), outputs_flip[-1].size(3)), - mode='bilinear', - align_corners=align_corners) - output = torch.flip(output, [3]) - outputs.append(output) +def flip_part_affinity_fields(pafs, flip_index, skeleton): + """Flip the part affinity fields and swap the channels. - offset_feat = num_joints if with_heatmaps[i] else 0 + Args: + pafs (list(torch.Tensor)): Part-affinity fields. + flip_index (list(int) | None): Channel-flip indexes. If None, + do not flip channels. + skeleton (list(list(int, int))): Pairs of linked keypoints. + Returns: + flipped_pafs (list(torch.Tensor)): Flipped pafs. + """ + flipped_skeleton = [[flip_index[a], flip_index[b]] for a, b in skeleton] + + flip_index_paf = [] + flip_x_index = [] + for sk_id, sk in enumerate(flipped_skeleton): + try: + # found flip-pairs + ind = skeleton.index(sk) + flip_x_index.append(sk_id * 2) + except ValueError: + try: + # unidirectional edge + ind = skeleton.index([sk[1], sk[0]]) + except ValueError: + raise ValueError('The skeleton should be symmetric.') + + flip_index_paf.append(2 * ind) + flip_index_paf.append(2 * ind + 1) + + flipped_pafs = [] + for paf in pafs: + paf = torch.flip(paf, [3]) + # flip the x-axis direction + paf[:, flip_x_index, :, :] *= -1 + # flip channels + if flip_index is not None: + flipped_pafs.append(paf[:, flip_index_paf, :, :]) + else: + flipped_pafs.append(paf) + + return flipped_pafs + + +def _resize_average(feature_maps, align_corners, index=-1, resize_size=None): + """Resize the feature maps and compute the average. + + Args: + feature_maps (list(torch.Tensor)): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size ([w, h]): The target size. + Returns: + feature_maps_avg (list(torch.Tensor)): Averaged feature_maps. 
+ """ - if with_heatmaps[i]: - heatmaps_avg += output[:, :num_joints][:, flip_index, :, :] - num_heatmaps += 1 + if feature_maps is None: + return None + feature_maps_avg = 0 - if with_ae[i]: - tags.append(output[:, offset_feat:]) - if tag_per_joint: - tags[-1] = tags[-1][:, flip_index, :, :] + feature_map_list = _resize_concate( + feature_maps, align_corners, index=index, resize_size=resize_size) + for feature_map in feature_map_list: + feature_maps_avg += feature_map - heatmaps.append(heatmaps_avg / num_heatmaps) + feature_maps_avg /= len(feature_map_list) + return [feature_maps_avg] - if project2image and size_projected: - heatmaps = [ - torch.nn.functional.interpolate( - hms, - size=(size_projected[1], size_projected[0]), - mode='bilinear', - align_corners=align_corners) for hms in heatmaps - ] - tags = [ - torch.nn.functional.interpolate( - tms, - size=(size_projected[1], size_projected[0]), +def _resize_unsqueeze_concat(feature_maps, + align_corners, + index=-1, + resize_size=None): + """Resize, unsqueeze and concatenate the feature_maps. + + Args: + feature_maps (list(torch.Tensor)): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size ([w, h]): The target size. + Returns: + output_feature_maps (list(torch.Tensor)): Averaged feature_maps. + """ + if feature_maps is None: + return None + feature_map_list = _resize_concate( + feature_maps, align_corners, index=index, resize_size=resize_size) + + feat_dim = len(feature_map_list[0].shape) - 1 + output_feature_maps = torch.cat( + [torch.unsqueeze(fmap, dim=feat_dim + 1) for fmap in feature_map_list], + dim=feat_dim + 1) + return [output_feature_maps] + + +def _resize_concate(feature_maps, align_corners, index=-1, resize_size=None): + """Resize and concatenate the feature_maps. + + Args: + feature_maps (list(torch.Tensor)): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size ([w, h]): The target size. + Returns: + feature_map_list (list(torch.Tensor)): Averaged feature_maps. + """ + if feature_maps is None: + return None + + feature_map_list = [] + + if index < 0: + index += len(feature_maps) + + if resize_size is None: + resize_size = (feature_maps[index].size(2), + feature_maps[index].size(3)) + + for feature_map in feature_maps: + ori_size = (feature_map.size(2), feature_map.size(3)) + if ori_size != resize_size: + feature_map = torch.nn.functional.interpolate( + feature_map, + size=resize_size, mode='bilinear', - align_corners=align_corners) for tms in tags - ] + align_corners=align_corners) - return outputs, heatmaps, tags + feature_map_list.append(feature_map) + return feature_map_list -def aggregate_results(scale, - aggregated_heatmaps, - tags_list, - heatmaps, - tags, - test_scale_factor, - project2image, - flip_test, - align_corners=False): + +def aggregate_stage_flip(feature_maps, + feature_maps_flip, + index=-1, + project2image=True, + size_projected=None, + align_corners=False, + aggregate_stage='concat', + aggregate_flip='average'): + """Inference the model to get multi-stage outputs (heatmaps & tags), and + resize them to base sizes. + + Args: + feature_maps (list(torch.Tensor)): feature_maps can be heatmaps, + tags, and pafs. + feature_maps_flip (list(torch.Tensor) | None): flipped feature_maps. 
+ feature maps can be heatmaps, tags, and pafs. + project2image (bool): Option to resize to base scale. + size_projected ([w, h]): Base size of heatmaps. + align_corners (bool): Align corners when performing interpolation. + aggregate_stage (str): Methods to aggregate multi-stage feature maps. + Options: 'concat', 'average'. + 'concat': Concatenate the feature maps of the different stages. + 'average': Get the average of the feature maps of the different + stages. + Default: 'concat'. + aggregate_flip (str): Methods to aggregate the original and + the flipped feature maps. + Options: 'concat', 'average', 'none'. + 'concat': Concatenate the original and the flipped feature maps. + 'average': Get the average of the original and the flipped + feature maps. + 'none': no flipped feature maps. + Default: 'average'. + + Returns: + - output_feature_maps (List(torch.Tensor[NxKxWxH])): + Aggregated feature maps. + """ + + if feature_maps_flip is None: + aggregate_flip = 'none' + + output_feature_maps = [] + + if aggregate_stage == 'average': + _aggregate_stage_func = _resize_average + elif aggregate_stage == 'concat': + _aggregate_stage_func = _resize_concate + else: + raise NotImplementedError() + + if project2image and size_projected: + _origin = _aggregate_stage_func( + feature_maps, + align_corners, + index=index, + resize_size=(size_projected[1], size_projected[0])) + + _flipped = _aggregate_stage_func( + feature_maps_flip, + align_corners, + index=index, + resize_size=(size_projected[1], size_projected[0])) + else: + _origin = _aggregate_stage_func( + feature_maps, align_corners, index=index, resize_size=None) + _flipped = _aggregate_stage_func( + feature_maps_flip, align_corners, index=index, resize_size=None) + + if aggregate_flip == 'average': + assert feature_maps_flip is not None + for _ori, _fli in zip(_origin, _flipped): + output_feature_maps.append((_ori + _fli) / 2.0) + + elif aggregate_flip == 'concat': + assert feature_maps_flip is not None + output_feature_maps.append(*_origin) + output_feature_maps.append(*_flipped) + + elif aggregate_flip == 'none': + if isinstance(_origin, list): + output_feature_maps.append(*_origin) + else: + output_feature_maps.append(_origin) + else: + raise NotImplementedError() + + return output_feature_maps + + +def aggregate_scale(feature_maps_list, + align_corners=False, + aggregate_scale='average'): """Aggregate multi-scale outputs. Note: @@ -138,50 +293,30 @@ def aggregate_results(scale, heatmap height: H Args: - scale (int): current scale - aggregated_heatmaps (torch.Tensor | None): Aggregated heatmaps. - tags_list (list(torch.Tensor)): Tags list of previous scale. - heatmaps (List(torch.Tensor[NxKxWxH])): A batch of heatmaps. - tags (List(torch.Tensor[NxKxWxH])): A batch of tag maps. - test_scale_factor (List(int)): Multi-scale factor for testing. + feature_maps_list (list(torch.Tensor)): Aggregated feature maps. - project2image (bool): Option to resize to base scale. - flip_test (bool): Option to use flip test. align_corners (bool): Align corners when performing interpolation. + aggregate_scale (str): Methods to aggregate multi-scale feature maps. + Options: 'average', 'unsqueeze_concat'. + 'average': Get the average of the feature maps. + 'unsqueeze_concat': Concatenate the feature maps along new axis. + Default: 'average'. Return: - tuple: a tuple containing aggregated results. - - - aggregated_heatmaps (torch.Tensor): Heatmaps with multi scale. - - tags_list (list(torch.Tensor)): Tag list of multi scale.
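# Hypothetical flip-test usage of aggregate_stage_flip, the way the bottom-up test
# pipeline in this patch would drive it: the original and the channel-flipped heatmaps
# are resized to the projected image size and averaged. The flip_index here is an
# identity placeholder.
import torch
from mmpose.core.evaluation import aggregate_stage_flip, flip_feature_maps

heatmaps = [torch.rand(1, 17, 128, 128)]
heatmaps_flip = flip_feature_maps(heatmaps, flip_index=list(range(17)))
aggregated = aggregate_stage_flip(
    heatmaps,
    heatmaps_flip,
    index=-1,
    project2image=True,
    size_projected=(512, 512),   # (w, h) of the resized input image
    aggregate_stage='average',
    aggregate_flip='average')
assert aggregated[0].shape == (1, 17, 512, 512)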
+ - output_feature_maps (torch.Tensor): Aggregated feature maps. """ - if scale == 1 or len(test_scale_factor) == 1: - if aggregated_heatmaps is not None and not project2image: - tags = [ - torch.nn.functional.interpolate( - tms, - size=(aggregated_heatmaps.size(2), - aggregated_heatmaps.size(3)), - mode='bilinear', - align_corners=align_corners) for tms in tags - ] - for tms in tags: - tags_list.append(torch.unsqueeze(tms, dim=4)) - - heatmaps_avg = (heatmaps[0] + - heatmaps[1]) / 2.0 if flip_test else heatmaps[0] - - if aggregated_heatmaps is None: - aggregated_heatmaps = heatmaps_avg - elif project2image: - aggregated_heatmaps += heatmaps_avg + + if aggregate_scale == 'average': + output_feature_maps = _resize_average( + feature_maps_list, align_corners, index=0, resize_size=None) + + elif aggregate_scale == 'unsqueeze_concat': + output_feature_maps = _resize_unsqueeze_concat( + feature_maps_list, align_corners, index=0, resize_size=None) else: - aggregated_heatmaps += torch.nn.functional.interpolate( - heatmaps_avg, - size=(aggregated_heatmaps.size(2), aggregated_heatmaps.size(3)), - mode='bilinear', - align_corners=align_corners) + raise NotImplementedError() - return aggregated_heatmaps, tags_list + return output_feature_maps[0] def get_group_preds(grouped_joints, @@ -204,6 +339,9 @@ def get_group_preds(grouped_joints, Returns: list: List of the pose result for each person. """ + if len(grouped_joints) == 0: + return [] + if use_udp: if grouped_joints[0].shape[0] > 0: heatmap_size_t = np.array(heatmap_size, dtype=np.float32) - 1.0 diff --git a/mmpose/core/post_processing/group.py b/mmpose/core/post_processing/group.py index 9a870d6ce3..2d443205c8 100644 --- a/mmpose/core/post_processing/group.py +++ b/mmpose/core/post_processing/group.py @@ -1,7 +1,4 @@ -# ------------------------------------------------------------------------------ -# Adapted from https://github.com/princeton-vl/pose-ae-train/ -# Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License. -# ------------------------------------------------------------------------------ +from abc import ABCMeta, abstractmethod import numpy as np import torch @@ -117,19 +114,20 @@ def _match_by_tag(inp, params): class _Params: - """A class of parameter. + """A class of parameters for keypoint grouping. Args: cfg(Config): config. """ def __init__(self, cfg): - self.num_joints = cfg['num_joints'] + self.add_neck = cfg.get('add_neck', False) + if self.add_neck: + self.num_joints = cfg['num_joints'] + 1 + else: + self.num_joints = cfg['num_joints'] self.max_num_people = cfg['max_num_people'] self.detection_threshold = cfg['detection_threshold'] - self.tag_threshold = cfg['tag_threshold'] - self.use_detection_val = cfg['use_detection_val'] self.ignore_too_much = cfg['ignore_too_much'] if self.num_joints == 17: @@ -141,12 +139,11 @@ def __init__(self, cfg): self.joint_order = list(np.arange(self.num_joints)) -class HeatmapParser: - """The heatmap parser for post processing.""" +class BaseBottomUpParser(metaclass=ABCMeta): + """The base bottom-up parser for post processing.""" def __init__(self, cfg): self.params = _Params(cfg) - self.tag_per_joint = cfg['tag_per_joint'] self.pool = torch.nn.MaxPool2d(cfg['nms_kernel'], 1, cfg['nms_padding']) self.use_udp = cfg.get('use_udp', False) @@ -167,28 +164,9 @@ def nms(self, heatmaps): return heatmaps - def match(self, tag_k, loc_k, val_k): - """Group keypoints to human poses in a batch.
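# Sketch of aggregate_scale defined above: 'average' fuses the multi-scale maps into a
# single NxKxHxW tensor, while 'unsqueeze_concat' stacks them along a trailing axis
# (the NxKxHxWxL layout the associative-embedding tag grouping consumes). Shapes below
# are illustrative only.
import torch
from mmpose.core.evaluation import aggregate_scale

maps = [torch.rand(1, 17, 256, 256), torch.rand(1, 17, 256, 256)]
avg = aggregate_scale(maps, aggregate_scale='average')
stacked = aggregate_scale(maps, aggregate_scale='unsqueeze_concat')
assert avg.shape == (1, 17, 256, 256)
assert stacked.shape == (1, 17, 256, 256, 2)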
- - Args: - tag_k (np.ndarray[NxKxMxL]): tag corresponding to the - top k values of feature map per keypoint. - loc_k (np.ndarray[NxKxMx2]): top k locations of the - feature maps for keypoint. - val_k (np.ndarray[NxKxM]): top k value of the - feature maps per keypoint. - - Returns: - list - """ - - def _match(x): - return _match_by_tag(x, self.params) - - return list(map(_match, zip(tag_k, loc_k, val_k))) - - def top_k(self, heatmaps, tags): - """Find top_k values in an image. + @staticmethod + def top_k_value(feature_maps, M): + """Find top_k values in the feature_maps. Note: batch size: N @@ -196,86 +174,21 @@ def top_k(self, heatmaps, tags): heatmap height: H heatmap width: W max number of people: M - dim of tags: L - If use flip testing, L=2; else L=1. Args: - heatmaps (torch.Tensor[NxKxHxW]) - tags (torch.Tensor[NxKxHxWxL]) + feature_maps (torch.Tensor[NxKxHxW]) Return: - dict: A dict containing top_k values. - - - tag_k (np.ndarray[NxKxMxL]): - tag corresponding to the top k values of - feature map per keypoint. - - loc_k (np.ndarray[NxKxMx2]): - top k location of feature map per keypoint. - - val_k (np.ndarray[NxKxM]): + - val_k (torch.Tensor[NxKxM]): top k value of feature map per keypoint. + - ind_k (torch.Tensor[NxKxM]): + index of the selected locations. """ - heatmaps = self.nms(heatmaps) - N, K, H, W = heatmaps.size() - heatmaps = heatmaps.view(N, K, -1) - val_k, ind = heatmaps.topk(self.params.max_num_people, dim=2) - - tags = tags.view(tags.size(0), tags.size(1), W * H, -1) - if not self.tag_per_joint: - tags = tags.expand(-1, self.params.num_joints, -1, -1) - - tag_k = torch.stack( - [torch.gather(tags[..., i], 2, ind) for i in range(tags.size(3))], - dim=3) - - x = ind % W - y = ind // W - - ind_k = torch.stack((x, y), dim=3) - - ans = { - 'tag_k': tag_k.cpu().numpy(), - 'loc_k': ind_k.cpu().numpy(), - 'val_k': val_k.cpu().numpy() - } - - return ans - - @staticmethod - def adjust(ans, heatmaps): - """Adjust the coordinates for better accuracy. + N, K, H, W = feature_maps.size() + feature_maps = feature_maps.view(N, K, -1) + val_k, ind_k = feature_maps.topk(M, dim=2) - Note: - batch size: N - number of keypoints: K - heatmap height: H - heatmap width: W - - Args: - ans (list(np.ndarray)): Keypoint predictions. - heatmaps (torch.Tensor[NxKxHxW]): Heatmaps. - """ - _, _, H, W = heatmaps.shape - for batch_id, people in enumerate(ans): - for people_id, people_i in enumerate(people): - for joint_id, joint in enumerate(people_i): - if joint[2] > 0: - x, y = joint[0:2] - xx, yy = int(x), int(y) - tmp = heatmaps[batch_id][joint_id] - if tmp[min(H - 1, yy + 1), xx] > tmp[max(0, yy - 1), - xx]: - y += 0.25 - else: - y -= 0.25 - - if tmp[yy, min(W - 1, xx + 1)] > tmp[yy, - max(0, xx - 1)]: - x += 0.25 - else: - x -= 0.25 - ans[batch_id][people_id, joint_id, - 0:2] = (x + 0.5, y + 0.5) - return ans + return val_k, ind_k @staticmethod def refine(heatmap, tag, keypoints, use_udp=False): @@ -356,7 +269,170 @@ def refine(heatmap, tag, keypoints, use_udp=False): return keypoints - def parse(self, heatmaps, tags, adjust=True, refine=True): + @staticmethod + def adjust(ans, heatmaps, use_udp=False): + """Adjust the coordinates for better accuracy. + + Note: + batch size: N + number of person: M + number of keypoints: K + heatmap height: H + heatmap width: W + + Args: + ans (list(np.array([M,K,3+]))): Keypoint predictions. + heatmaps (torch.Tensor[NxKxHxW]): Heatmaps. 
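# Small numeric check of the top_k_value helper above plus the flat-index decoding the
# parsers use (x = ind % W, y = ind // W); the 1x1 heatmap batch below is hypothetical.
import torch
from mmpose.core.post_processing.group import BaseBottomUpParser

heatmap = torch.zeros(1, 1, 4, 5)
heatmap[0, 0, 2, 3] = 1.0                       # peak at row y=2, column x=3
val_k, ind_k = BaseBottomUpParser.top_k_value(heatmap, M=1)
x, y = ind_k % 5, ind_k // 5
assert val_k[0, 0, 0] == 1.0 and x[0, 0, 0] == 3 and y[0, 0, 0] == 2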
+ """ + + if use_udp: + for i in range(len(ans)): + if ans[i].shape[0] > 0: + ans[i][..., :2] = post_dark_udp(ans[i][..., :2].copy(), + heatmaps[i:i + 1, :]) + else: + _, _, H, W = heatmaps.shape + for batch_id, people in enumerate(ans): + for people_id, people_i in enumerate(people): + for joint_id, joint in enumerate(people_i): + if joint[2] > 0: + x, y = joint[0:2] + xx, yy = int(x), int(y) + tmp = heatmaps[batch_id][joint_id] + if tmp[min(H - 1, yy + 1), + xx] > tmp[max(0, yy - 1), xx]: + y += 0.25 + else: + y -= 0.25 + + if tmp[yy, min(W - 1, xx + + 1)] > tmp[yy, max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + ans[batch_id][people_id, joint_id, + 0:2] = (x + 0.5, y + 0.5) + return ans + + def filter_pose(self, ans, kpt_num_thr=3, mean_score_thr=0.2): + """Filter out the poses with #keypoints < kpt_num_thr, and those with + keypoint score < mean_score_thr. + + Note: + number of person: M + number of keypoints: K + + Args: + filtered_ans (list(np.array([M,K,3+]))): Keypoint predictions. + """ + filtered_ans = [] + for i in range(len(ans[0])): + score = ans[0][i, :, 2] + if sum(score > 0) < kpt_num_thr or (score[score > 0].mean() < + mean_score_thr): + continue + filtered_ans.append(ans[0][i]) + filtered_ans = np.asarray(filtered_ans) + + return [filtered_ans] + + @abstractmethod + def parse(self, *args, **kwargs): + """Group keypoints into poses.""" + + +class HeatmapParser(BaseBottomUpParser): + """The associative embedding parser. + + Paper ref: Alejandro Newell et al. "Associative Embedding: + End-to-end Learning for Joint Detection and Grouping." (NeurIPS'2017) + + Adapted from https://github.com/princeton-vl/pose-ae-train/ + Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License. + """ + + def __init__(self, cfg): + super().__init__(cfg) + self.tag_per_joint = cfg['tag_per_joint'] + + self.params.tag_threshold = cfg['tag_threshold'] + self.params.use_detection_val = cfg['use_detection_val'] + + def match(self, tag_k, loc_k, val_k): + """Group keypoints to human poses in a batch. + + Args: + tag_k (np.ndarray[NxKxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[NxKxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[NxKxM]): top k value of the + feature maps per keypoint. + + Returns: + list + """ + + def _match(x): + return _match_by_tag(x, self.params) + + return list(map(_match, zip(tag_k, loc_k, val_k))) + + def top_k(self, heatmaps, tags): + """Find top_k values in the feature maps. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + max number of people: M + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]) + tags (torch.Tensor[NxKxHxWxL]) + + Return: + dict: A dict containing top_k values. + + - tag_k (np.ndarray[NxKxMxL]): + tag corresponding to the top k values of + feature map per keypoint. + - loc_k (np.ndarray[NxKxMx2]): + top k location of feature map per keypoint. + - val_k (np.ndarray[NxKxM]): + top k value of feature map per keypoint. 
+ """ + + heatmaps = self.nms(heatmaps) + N, K, H, W = heatmaps.size() + val_k, ind_k = self.top_k_value(heatmaps, self.params.max_num_people) + + x = ind_k % W + y = ind_k // W + + loc_k = torch.stack((x, y), dim=3) + + tags = tags.view(tags.size(0), tags.size(1), W * H, -1) + if not self.tag_per_joint: + tags = tags.expand(-1, self.params.num_joints, -1, -1) + + tag_k = torch.stack([ + torch.gather(tags[..., i], 2, ind_k) for i in range(tags.size(3)) + ], + dim=3) + + ans = { + 'tag_k': tag_k.cpu().numpy(), + 'loc_k': loc_k.cpu().numpy(), + 'val_k': val_k.cpu().numpy() + } + + return ans + + def parse(self, heatmaps, tags, adjust=True, refine=True, filter=False): """Group keypoints into poses given heatmap and tag. Note: @@ -379,14 +455,14 @@ def parse(self, heatmaps, tags, adjust=True, refine=True): """ ans = self.match(**self.top_k(heatmaps, tags)) + if len(ans) == 0: + return [], [] + + if filter: + ans = self.filter_pose(ans) + if adjust: - if self.use_udp: - for i in range(len(ans)): - if ans[i].shape[0] > 0: - ans[i][..., :2] = post_dark_udp( - ans[i][..., :2].copy(), heatmaps[i:i + 1, :]) - else: - ans = self.adjust(ans, heatmaps) + ans = self.adjust(ans, heatmaps, self.use_udp) scores = [i[:, 2].mean() for i in ans[0]] @@ -404,3 +480,420 @@ def parse(self, heatmaps, tags, adjust=True, refine=True): ans = [ans] return ans, scores + + +class PAFParser(BaseBottomUpParser): + """The part-affinity field parser. + + Paper ref: Cao, Zhe, et al. "OpenPose: realtime multi-person 2D pose + estimation using Part Affinity Fields." (TPAMI'2019) + + Adapted from 'https://github.com/Daniil-Osokin/ + lightweight-human-pose-estimation.pytorch' + + Original licence: Copyright 2018, under Apache License 2.0. + """ + + def __init__(self, cfg): + super().__init__(cfg) + + self.paf_thr = 0.05 + self.add_neck = cfg.get('add_neck', False) + + def output_format(self, all_keypoints, pose_entries): + """Format transform. + + Note: + batch size: N + number of people: M + number of keypoints: K + number of detected keypoints in the image: P + + Args: + all_keypoints (np.ndarray(P, 4)): Each keypoint contains + (x, y, score, keypoint id) + pose_entries (np.ndarray(M, K + 2)): For each person, + it contains K keypoint id, the human score, and + the number of detected keypoints. + + Returns: + ans (list(np.ndarray)): Pose results. + """ + ans = [] + + if len(pose_entries) > 0: + for person in pose_entries: + ans_person = np.zeros( + (self.params.num_joints, all_keypoints.shape[1]), + np.float32) + for j in range(self.params.num_joints): + joint_id = int(person[j]) + if joint_id < 0: + continue + ans_person[j] = all_keypoints[joint_id] + ans.append(ans_person) + return [np.stack(ans)] + else: + return [] + + def connections_nms(self, a_idx, b_idx, affinity_scores): + """From all retrieved connections that share the same starting/ending + keypoints leave only the top-scoring ones. + + Args: + a_idx (list(int)): index of the starting keypoints. + b_idx (list(int)): index of the ending keypoints. + affinity_scores (list(float)): affinity scores. + + Returns: + a_idx (list(int)): index of the starting keypoints. + b_idx (list(int)): index of the ending keypoints. + affinity_scores (list(float)): affinity scores. 
+ """ + order = affinity_scores.argsort()[::-1] + affinity_scores = affinity_scores[order] + a_idx = a_idx[order] + b_idx = b_idx[order] + idx = [] + has_kpt_a = set() + has_kpt_b = set() + for t, (i, j) in enumerate(zip(a_idx, b_idx)): + if i not in has_kpt_a and j not in has_kpt_b: + idx.append(t) + has_kpt_a.add(i) + has_kpt_b.add(j) + idx = np.asarray(idx, dtype=np.int32) + return a_idx[idx], b_idx[idx], affinity_scores[idx] + + def group_keypoints(self, all_keypoints_by_type, pafs): + """Group keypoints based on part-affinity fields. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + + Args: + all_keypoints_by_type (list(tuple)): list of tuples + containing keypoint detection results (x, y, score, id). + pafs (np.ndarray[W, H, C]): part-affinity fields + + Returns: + - ans (list(np.array([M, K, 3+]))): Keypoint predictions. + - scores (list): Score of people. + """ + pose_entries = [] + all_keypoints = np.array( + [item for sublist in all_keypoints_by_type for item in sublist]) + points_per_limb = 10 + grid = np.arange(points_per_limb, dtype=np.float32).reshape(1, -1, 1) + all_keypoints_by_type = [ + np.array(keypoints, np.float32) + for keypoints in all_keypoints_by_type + ] + for part_id in range(len(self.limb2paf)): + part_pafs = pafs[:, :, self.limb2paf[part_id]] + kpts_a = all_keypoints_by_type[self.limb2joint[part_id][0]] + kpts_b = all_keypoints_by_type[self.limb2joint[part_id][1]] + n = len(kpts_a) + m = len(kpts_b) + if n == 0 or m == 0: + continue + + # Get vectors between all pairs of keypoints, + # i.e. candidate limb vectors. + a = kpts_a[:, :2] + a = np.broadcast_to(a[None], (m, n, 2)) + b = kpts_b[:, :2] + vec_raw = (b[:, None, :] - a).reshape(-1, 1, 2) + + # Sample points along every candidate limb vector. + steps = (1 / (points_per_limb - 1) * vec_raw) + points = steps * grid + a.reshape(-1, 1, 2) + points = points.round().astype(dtype=np.int32) + x = points[..., 0].ravel() + y = points[..., 1].ravel() + + # Compute affinity score between candidate + # limb vectors and part affinity field. + field = part_pafs[y, x].reshape(-1, points_per_limb, 2) + vec_norm = np.linalg.norm(vec_raw, ord=2, axis=-1, keepdims=True) + vec = vec_raw / (vec_norm + 1e-6) + affinity_scores = (field * + vec).sum(-1).reshape(-1, points_per_limb) + valid_affinity_scores = affinity_scores > self.paf_thr + valid_num = valid_affinity_scores.sum(1) + affinity_scores = (affinity_scores * + valid_affinity_scores).sum(1) / ( + valid_num + 1e-6) + success_ratio = valid_num / points_per_limb + + # Get a list of limbs according to the obtained affinity score. + valid_limbs = np.where( + np.logical_and(affinity_scores > 0, success_ratio > 0.8))[0] + if len(valid_limbs) == 0: + continue + b_idx, a_idx = np.divmod(valid_limbs, n) + affinity_scores = affinity_scores[valid_limbs] + + # Suppress incompatible connections. 
+ a_idx, b_idx, affinity_scores = self.connections_nms( + a_idx, b_idx, affinity_scores) + connections = list( + zip(kpts_a[a_idx, 3].astype(np.int32), + kpts_b[b_idx, 3].astype(np.int32), affinity_scores)) + if len(connections) == 0: + continue + + if part_id == 0: + pose_entries = [ + np.ones(self.params.num_joints + 2) * -1 + for _ in range(len(connections)) + ] + for i in range(len(connections)): + pose_entries[i][self.limb2joint[0][0]] = connections[i][0] + pose_entries[i][self.limb2joint[0][1]] = connections[i][1] + pose_entries[i][-1] = 2 + pose_entries[i][-2] = np.sum( + all_keypoints[connections[i][0:2], + 2]) + connections[i][2] + else: + kpt_a_id = self.limb2joint[part_id][0] + kpt_b_id = self.limb2joint[part_id][1] + for i in range(len(connections)): + found_pose_list = [] + for j in range(len(pose_entries)): + if pose_entries[j][kpt_a_id] == connections[i][ + 0] and pose_entries[j][kpt_b_id] == -1: + pose_entries[j][kpt_b_id] = connections[i][1] + pose_entries[j][-1] += 1 + pose_entries[j][-2] += all_keypoints[ + connections[i][1], 2] + connections[i][2] + found_pose_list.append( + (j, all_keypoints[connections[i][1], 2] + + connections[i][2])) + + if pose_entries[j][kpt_b_id] == connections[i][ + 1] and pose_entries[j][kpt_a_id] == -1: + pose_entries[j][kpt_a_id] = connections[i][0] + pose_entries[j][-1] += 1 + pose_entries[j][-2] += all_keypoints[ + connections[i][1], 2] + connections[i][2] + found_pose_list.append( + (j, all_keypoints[connections[i][1], 2] + + connections[i][2])) + + if len(found_pose_list) == 0: + pose_entry = np.ones(self.params.num_joints + 2) * -1 + pose_entry[kpt_a_id] = connections[i][0] + pose_entry[kpt_b_id] = connections[i][1] + pose_entry[-1] = 2 + pose_entry[-2] = np.sum( + all_keypoints[connections[i][0:2], + 2]) + connections[i][2] + pose_entries.append(pose_entry) + + elif len(found_pose_list) == 2: + # merge two pose entries + found_pose_list.sort(key=lambda x: x[0], reverse=True) + pose_entry = np.ones(self.params.num_joints + 2) * -1 + + entry_id1, score1 = found_pose_list[0] + entry_id2, score2 = found_pose_list[1] + assert score1 == score2 + + pose_entry1 = pose_entries.pop(entry_id1) + pose_entry2 = pose_entries.pop(entry_id2) + + num_kpt = 0 + score = pose_entry1[-2] + pose_entry2[-2] - score1 + + for j in range(self.params.num_joints): + kpt_id1 = int(pose_entry1[j]) + kpt_id2 = int(pose_entry2[j]) + + if kpt_id1 == -1 and kpt_id2 == -1: + continue + elif kpt_id1 == -1 and kpt_id2 != -1: + pose_entry[j] = kpt_id2 + num_kpt += 1 + elif kpt_id2 == -1 and kpt_id1 != -1: + pose_entry[j] = kpt_id1 + num_kpt += 1 + else: + # both have the same joint-id, + # choose the one with higher score. + if all_keypoints[kpt_id1, + 2] > all_keypoints[kpt_id2, + 2]: + pose_entry[j] = kpt_id1 + else: + pose_entry[j] = kpt_id2 + num_kpt += 1 + + pose_entry[-2] = score + pose_entry[-1] = num_kpt + + pose_entries.append(pose_entry) + + ans = self.output_format(all_keypoints, pose_entries) + scores = [person[-2] for person in pose_entries] + + return ans, scores + + def get_keypoints(self, heatmaps): + """Extract keypoints from heatmaps. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + + Args: + heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps. + + Returns: + list(tuple): list of tuples containing keypoint detection + results (x, y, score, id). 
+ """ + + keypoint_num = 0 + all_keypoints_by_type = [[] for _ in range(heatmaps.size(1))] + + heatmaps = self.nms(heatmaps) + N, K, H, W = heatmaps.size() + val_k, ind_k = self.top_k_value(heatmaps, self.params.max_num_people) + + x = ind_k % W + y = ind_k // W + + loc_k = torch.stack((x, y), dim=3) + + for kpt_idx in range(self.params.num_joints): + for m in range(self.params.max_num_people): + if val_k[0][kpt_idx][m] < self.params.detection_threshold: + break + else: + x = loc_k[0][kpt_idx][m][0].item() + y = loc_k[0][kpt_idx][m][1].item() + score = val_k[0][kpt_idx][m].item() + all_keypoints_by_type[kpt_idx].append( + (x, y, score, keypoint_num)) + keypoint_num += 1 + + return all_keypoints_by_type + + def define_limb(self, skeleton): + if self.add_neck: + # Heatmap indices to find each limb (joint connection). + self.limb2joint = [[1, 2], [1, 5], [2, 3], [3, 4], [5, 6], [6, 7], + [1, 8], [8, 9], [9, 10], [1, 11], [11, 12], + [12, 13], [1, 0], [0, 14], [14, 16], [0, 15], + [15, 17], [2, 16], [5, 17]] + + # PAF indices containing the x and y coordinates of the PAF for a + # given limb. + self.limb2paf = [[12, 13], [20, 21], [14, 15], [16, 17], [22, 23], + [24, 25], [0, 1], [2, 3], [4, 5], [6, 7], [8, 9], + [10, 11], [28, 29], [30, 31], [34, 35], [32, 33], + [36, 37], [18, 19], [26, 27]] + + elif skeleton is None: + # Heatmap indices to find each limb (joint connection). + self.limb2joint = [[15, 13], [13, 11], [16, 14], [14, 12], + [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], + [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], + [1, 3], [2, 4], [3, 5], [4, 6]] + + # PAF indices containing the x and y coordinates of the PAF for a + # given limb. + self.limb2paf = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], + [12, 13], [14, 15], [16, 17], [18, 19], [20, 21], + [22, 23], [24, 25], [26, 27], [28, 29], [30, 31], + [32, 33], [34, 35], [36, 37]] + + else: + # Heatmap indices to find each limb (joint connection). + self.limb2joint = skeleton + + # PAF indices containing the x and y coordinates of the PAF for a + # given limb. + self.limb2paf = np.array(range(len(self.limb2joint * + 2))).reshape(-1, 2).tolist() + + self.NUM_LIMBS = len(self.limb2joint) + + def parse(self, + heatmaps, + pafs, + skeleton=None, + adjust=True, + refine=True, + filter=False): + """Group keypoints into poses given heatmap and paf. + + Note: + batch size: N (currently we only support N==1) + number of people: M + number of keypoints: K + number of paf maps: P + heatmap height: H + heatmap width: W + + Args: + heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps. + pafs (torch.Tensor[NxPxHxW]): model output pafs. + + Returns: + tuple: A tuple containing keypoint grouping results. + + - ans (list(np.array([M,K,4]))): Keypoint predictions. + - scores (list): Score of people. + """ + + assert heatmaps.shape[0] == 1, 'The batch size is ' \ + f'{heatmaps.shape[0]}, but we only support batch size==1.' 
+ + self.define_limb(skeleton) + + all_keypoints_by_type = self.get_keypoints(heatmaps) + pafs_np = np.transpose(pafs.detach().cpu().numpy()[0], [1, 2, 0]) + ans, scores = self.group_keypoints(all_keypoints_by_type, pafs_np) + + if len(ans) == 0: + return [], [] + + if filter: + ans = self.filter_pose(ans) + + if adjust: + if self.use_udp: + for i in range(len(ans)): + if ans[i].shape[0] > 0: + ans[i][..., :2] = post_dark_udp( + ans[i][..., :2].copy(), heatmaps[i:i + 1, :]) + else: + ans = self.adjust(ans, heatmaps) + + if refine: + ans = ans[0] + # for every detected person + for i in range(len(ans)): + heatmap_numpy = heatmaps[0].cpu().numpy() + _, image_height, image_width = heatmap_numpy.shape + y_coords = 2.0 * np.repeat( + np.arange(image_height)[:, None], image_width, + axis=1) / (image_height - 1.0) - 1.0 + x_coords = 2.0 * np.repeat( + np.arange(image_width)[None, :], image_height, + axis=0) / (image_width - 1.0) - 1.0 + coord_numpy = np.tile( + np.stack([x_coords, y_coords], axis=-1), + (self.params.num_joints, 1, 1, 1)) + ans[i] = self.refine( + heatmap_numpy, coord_numpy, ans[i], use_udp=self.use_udp) + ans = [ans] + + return ans, scores diff --git a/mmpose/core/post_processing/post_transforms.py b/mmpose/core/post_processing/post_transforms.py index ba6594f778..96091daa3e 100644 --- a/mmpose/core/post_processing/post_transforms.py +++ b/mmpose/core/post_processing/post_transforms.py @@ -186,7 +186,7 @@ def transform_preds(coords, center, scale, output_size, use_udp=False): scale_x = scale[0] / output_size[0] scale_y = scale[1] / output_size[1] - target_coords = np.ones_like(coords) + target_coords = coords.copy() target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py b/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py index 3941f25a39..f26c59f683 100644 --- a/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py @@ -44,6 +44,8 @@ def __init__(self, self.base_sigma = data_cfg['base_sigma'] self.int_sigma = False + self.ann_info['add_neck'] = data_cfg.get('add_neck', False) + self.ann_info['image_size'] = np.array(data_cfg['image_size']) self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) self.ann_info['num_joints'] = data_cfg['num_joints'] @@ -52,6 +54,7 @@ def __init__(self, self.ann_info['inference_channel'] = data_cfg['inference_channel'] self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + self.ann_info['add_neck'] = data_cfg.get('add_neck', False) self.use_nms = data_cfg.get('use_nms', False) self.soft_nms = data_cfg.get('soft_nms', True) diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py b/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py index 02fe73a816..99354b84d7 100644 --- a/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py @@ -57,24 +57,47 @@ def __init__(self, test_mode=False): super().__init__(ann_file, img_prefix, data_cfg, pipeline, test_mode) - self.ann_info['flip_index'] = [ - 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15 - ] - self.ann_info['use_different_joint_weights'] = False - self.ann_info['joint_weights'] = np.array( - [ - 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, - 1.2, 1.5, 1.5 - ], - dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) - - # joint index starts 
from 1 - self.ann_info['skeleton'] = [[16, 14], [14, 12], [17, 15], [15, 13], - [12, 13], [6, 12], [7, 13], [6, 7], - [6, 8], [7, 9], [8, 10], [9, 11], [2, 3], - [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], - [5, 7]] + + if self.ann_info['add_neck']: + self.ann_info['flip_index'] = [ + 0, 1, 5, 6, 7, 2, 3, 4, 11, 12, 13, 8, 9, 10, 15, 14, 17, 16 + ] + + # joint index starts from 0 + self.ann_info['skeleton'] = [[1, 8], [8, 9], [9, 10], [1, 11], + [11, 12], [12, 13], [1, 2], [2, 3], + [3, 4], [2, 16], [1, 5], [5, 6], + [6, 7], [5, 17], [1, 0], [0, 14], + [0, 15], [14, 16], [15, 17]] + + self.ann_info['joint_weights'] = np.array( + [ + 1., 1., 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, + 1.5, 1., 1., 1., 1. + ], + dtype=np.float32).reshape((self.ann_info['num_joints'] + 1, 1)) + + else: + self.ann_info['flip_index'] = [ + 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15 + ] + + # joint index starts from 0 + self.ann_info['skeleton'] = [[15, 13], [13, 11], [16, + 14], [14, 12], + [11, 12], [5, 11], [6, 12], [5, 6], + [5, 7], [6, 8], [7, 9], [8, + 10], [1, 2], + [0, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [4, 6]] + + self.ann_info['joint_weights'] = np.array( + [ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., + 1.2, 1.2, 1.5, 1.5 + ], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) # 'https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/' # 'pycocotools/cocoeval.py#L523' @@ -144,10 +167,40 @@ def _get_single(self, idx): mask = self._get_mask(anno, idx) anno = [ - obj for obj in anno + obj.copy() for obj in anno if obj['iscrowd'] == 0 or obj['num_keypoints'] > 0 ] + if self.ann_info['add_neck']: + reorder_map = [ + 0, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3 + ] + for obj in anno: + keypoints = np.array(obj['keypoints']).reshape( + self.ann_info['num_joints'], 3) + converted_keypoints = np.zeros( + (self.ann_info['num_joints'] + 1, 3)) + for i, idx in enumerate(reorder_map): + if i == 0: + converted_keypoints[i] = keypoints[idx] + else: + converted_keypoints[i + 1] = keypoints[idx] + + # Add neck as a mean of shoulders + converted_keypoints[1, 0] = (keypoints[5, 0] + + keypoints[6, 0]) / 2 + converted_keypoints[1, 1] = (keypoints[5, 1] + + keypoints[6, 1]) / 2 + if keypoints[5][2] == 2 and keypoints[6][2] == 2: + converted_keypoints[1][2] = 2 + elif keypoints[5][2] == 0 or keypoints[6][2] == 0: + converted_keypoints[1][2] = 0 + else: + converted_keypoints[1][2] = 1 + + keypoints = list(converted_keypoints.reshape(-1)) + obj['keypoints'] = keypoints + joints = self._get_joints(anno) mask_list = [mask.copy() for _ in range(self.ann_info['num_scales'])] joints_list = [ @@ -168,14 +221,24 @@ def _get_joints(self, anno): num_people = len(anno) if self.ann_info['scale_aware_sigma']: - joints = np.zeros((num_people, self.ann_info['num_joints'], 4), - dtype=np.float32) + if self.ann_info['add_neck']: + joints = np.zeros( + (num_people, self.ann_info['num_joints'] + 1, 4), + dtype=np.float32) + else: + joints = np.zeros((num_people, self.ann_info['num_joints'], 4), + dtype=np.float32) else: - joints = np.zeros((num_people, self.ann_info['num_joints'], 3), - dtype=np.float32) + if self.ann_info['add_neck']: + joints = np.zeros( + (num_people, self.ann_info['num_joints'] + 1, 3), + dtype=np.float32) + else: + joints = np.zeros((num_people, self.ann_info['num_joints'], 3), + dtype=np.float32) for i, obj in enumerate(anno): - joints[i, :self.ann_info['num_joints'], :3] = \ + joints[i, :, :3] = \ np.array(obj['keypoints']).reshape([-1, 3]) if 
self.ann_info['scale_aware_sigma']: # get person box @@ -255,11 +318,14 @@ def evaluate(self, outputs, res_folder, metric='mAP', **kwargs): kpts = defaultdict(list) # iterate over images + order_map = [0, 15, 14, 17, 16, 5, 2, 6, 3, 7, 4, 11, 8, 12, 9, 13, 10] for idx, _preds in enumerate(preds): str_image_path = image_paths[idx] image_id = self.name2id[os.path.basename(str_image_path)] # iterate over people for idx_person, kpt in enumerate(_preds): + if self.ann_info['add_neck']: + kpt = kpt[order_map] # use bbox area area = (np.max(kpt[:, 0]) - np.min(kpt[:, 0])) * ( np.max(kpt[:, 1]) - np.min(kpt[:, 1])) diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py b/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py index 8abccabbba..0aac6f1c38 100644 --- a/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py @@ -49,6 +49,43 @@ def __init__(self, self.ann_info['flip_index'] = self.get_flip_index_from_flip_pairs( self.ann_info['flip_pairs']) + # joint index starts from 0 + skeleton_body = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [4, 6]] + skeleton_foot = [[15, 17], [15, 18], [15, 19], [16, 20], [16, 21], + [16, 22]] + skeleton_face = [[0, 53], [53, 52], [52, 51], [51, 50], [50, 65], + [65, 66], [66, 67], [67, 68], [65, 70], [70, 69], + [50, 62], [62, 61], [61, 60], [60, 59], [62, 63], + [63, 64], [50, 45], [45, 46], [46, 47], [47, 48], + [48, 49], [50, 44], [44, 43], [43, 42], [42, 41], + [41, 40], [53, 56], [56, 57], [57, 58], [56, 55], + [55, 54], [56, 74], [74, 75], [75, 76], [76, 77], + [74, 73], [73, 72], [72, 71], [74, 85], [85, 86], + [86, 87], [85, 84], [84, 83], [85, 89], [89, 88], + [89, 90], [89, 80], [80, 79], [79, 78], [80, 81], + [81, 82], [80, 31], [31, 32], [32, 33], [33, 34], + [34, 35], [35, 36], [36, 37], [37, 38], [38, 39], + [31, 30], [30, 29], [29, 28], [28, 27], [27, 26], + [26, 25], [25, 24], [24, 23]] + skeleton_lefthand = [[9, 91], [91, 92], [92, 93], [93, 94], [94, 95], + [91, 96], [96, 97], [97, 98], [98, 99], [91, 100], + [100, 101], [101, 102], [102, 103], [91, 104], + [104, 105], [105, 106], [106, 107], [91, 108], + [108, 109], [109, 110], [110, 111]] + skeleton_righthand = [[10, 112], [112, 113], [113, 114], [114, 115], + [115, 116], [112, 117], [117, 118], [118, 119], + [119, 120], [112, 121], [121, 122], [122, 123], + [123, 124], [112, 125], [125, 126], [126, 127], + [127, 128], [112, 129], [129, 130], [130, 131], + [131, 132]] + + self.ann_info['skeleton'] = ( + skeleton_body + skeleton_foot + skeleton_face + skeleton_lefthand + + skeleton_righthand) + self.ann_info['use_different_joint_weights'] = False self.ann_info['joint_weights'] = \ np.ones((self.ann_info['num_joints'], 1), dtype=np.float32) @@ -166,6 +203,22 @@ def _get_joints(self, anno): return joints + def _get_part_score(self, keypoints): + """Get part score for new evaluation tools.""" + kpt_score = 0 + valid_num = 0 + num_joints = int(len(keypoints) / 3) + for n_jt in range(0, num_joints): + t_s = keypoints[n_jt * 3 + 2] + if t_s > 0.2: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + part_score = kpt_score + + return float(part_score) + def _coco_keypoint_results_one_category_kernel(self, data_pack): """Get coco keypoint results.""" cat_id = data_pack['cat_id'] @@ -211,6 +264,18 @@ def 
_coco_keypoint_results_one_category_kernel(self, data_pack): key_point[cuts[4]:cuts[5]].tolist(), 'score': img_kpt['score'], + # 'score': + # self._get_part_score(key_point[cuts[0]:cuts[1]]), + # 'foot_score': + # self._get_part_score(key_point[cuts[1]:cuts[2]]), + # 'face_score': + # self._get_part_score(key_point[cuts[2]:cuts[3]]), + # 'lefthand_score': + # self._get_part_score(key_point[cuts[3]:cuts[4]]), + # 'righthand_score': + # self._get_part_score(key_point[cuts[4]:cuts[5]]), + # 'wholebody_score': + # img_kpt['score'], 'bbox': [left_top[0], left_top[1], w, h] }) diff --git a/mmpose/datasets/pipelines/bottom_up_transform.py b/mmpose/datasets/pipelines/bottom_up_transform.py index de43945e42..461959483d 100644 --- a/mmpose/datasets/pipelines/bottom_up_transform.py +++ b/mmpose/datasets/pipelines/bottom_up_transform.py @@ -138,13 +138,26 @@ class HeatmapGenerator: Unbiased Data Processing for Human Pose Estimation (CVPR 2020). """ - def __init__(self, output_size, num_joints, sigma=-1, use_udp=False): + def __init__(self, + output_size, + num_joints, + sigma=-1, + use_udp=False, + add_neck=False, + with_bg=False): self.output_size = output_size self.num_joints = num_joints + self.add_neck = add_neck + self.with_bg = with_bg + + if self.add_neck: + self.num_joints += 1 + if sigma < 0: sigma = self.output_size / 64 self.sigma = sigma size = 6 * sigma + 3 + self.use_udp = use_udp if use_udp: self.x = np.arange(0, size, 1, np.float32) @@ -157,8 +170,15 @@ def __init__(self, output_size, num_joints, sigma=-1, use_udp=False): def __call__(self, joints): """Generate heatmaps.""" - hms = np.zeros((self.num_joints, self.output_size, self.output_size), - dtype=np.float32) + if self.with_bg: + hms = np.zeros( + (self.num_joints + 1, self.output_size, self.output_size), + dtype=np.float32) + else: + hms = np.zeros( + (self.num_joints, self.output_size, self.output_size), + dtype=np.float32) + sigma = self.sigma for p in joints: for idx, pt in enumerate(p): @@ -189,6 +209,8 @@ def __call__(self, joints): hms[idx, aa:bb, cc:dd] = np.maximum(hms[idx, aa:bb, cc:dd], g[a:b, c:d]) + if self.with_bg: + hms[-1] = 1 - np.max(hms[:-1], axis=0) return hms @@ -283,17 +305,20 @@ def _accumulate_paf_map_(self, pafs, src, dst, count): min_y = max(np.floor(min(src[1], dst[1]) - self.limb_width), 0) max_y = min( np.ceil(max(src[1], dst[1]) + self.limb_width), - self.output_size + 1) + self.output_size - 1) range_x = list(range(int(min_x), int(max_x + 1), 1)) range_y = list(range(int(min_y), int(max_y + 1), 1)) - xx, yy = np.meshgrid(range_x, range_y) - delta_x = xx - src[0] - delta_y = yy - src[1] - dist = np.abs(delta_x * unit_limb_vec[1] - delta_y * unit_limb_vec[0]) - mask_local = (dist < self.limb_width) + mask = np.zeros_like(count, dtype=bool) - mask[xx, yy] = mask_local + if len(range_x) > 0 and len(range_y) > 0: + xx, yy = np.meshgrid(range_x, range_y) + delta_x = xx - src[0] + delta_y = yy - src[1] + dist = np.abs(delta_x * unit_limb_vec[1] - + delta_y * unit_limb_vec[0]) + mask_local = (dist < self.limb_width) + mask[yy, xx] = mask_local pafs[0, mask] += unit_limb_vec[0] pafs[1, mask] += unit_limb_vec[1] @@ -312,8 +337,8 @@ def __call__(self, joints): dtype=np.float32) for p in joints: - src = p[sk[0] - 1] - dst = p[sk[1] - 1] + src = p[sk[0]] + dst = p[sk[1]] if src[2] > 0 and dst[2] > 0: self._accumulate_paf_map_(pafs[2 * idx:2 * idx + 2], src[:2], dst[:2], count) @@ -347,9 +372,9 @@ def __call__(self, results): assert len(mask) == len(self.output_size) if np.random.random() < self.flip_prob: - image 
= image[:, ::-1] - np.zeros_like(image) + image = image[:, ::-1].copy() - np.zeros_like(image) for i, _output_size in enumerate(self.output_size): - mask[i] = mask[i][:, ::-1] + mask[i] = mask[i][:, ::-1].copy() joints[i] = joints[i][:, self.flip_index] joints[i][:, :, 0] = _output_size - joints[i][:, :, 0] - 1 results['img'], results['mask'], results[ @@ -517,14 +542,17 @@ class BottomUpGenerateHeatmapTarget: Unbiased Data Processing for Human Pose Estimation (CVPR 2020). """ - def __init__(self, sigma, use_udp=False): + def __init__(self, sigma, use_udp=False, add_neck=False, with_bg=False): self.sigma = sigma self.use_udp = use_udp + self.add_neck = add_neck + self.with_bg = with_bg def _generate(self, num_joints, heatmap_size): """Get heatmap generator.""" heatmap_generator = [ - HeatmapGenerator(output_size, num_joints, self.sigma, self.use_udp) + HeatmapGenerator(output_size, num_joints, self.sigma, self.use_udp, + self.add_neck, self.with_bg) for output_size in heatmap_size ] return heatmap_generator @@ -535,19 +563,22 @@ def __call__(self, results): self._generate(results['ann_info']['num_joints'], results['ann_info']['heatmap_size']) target_list = list() - joints_list = results['joints'] + mask_list, joints_list = results['mask'], results['joints'] for scale_id in range(results['ann_info']['num_scales']): heatmaps = heatmap_generator[scale_id](joints_list[scale_id]) target_list.append(heatmaps.astype(np.float32)) - results['target'] = target_list + mask_list[scale_id] = mask_list[scale_id].astype(np.float32) + + results['targets'] = target_list + results['masks'] = mask_list return results @PIPELINES.register_module() class BottomUpGenerateTarget: - """Generate multi-scale heatmap target for bottom-up. + """Generate multi-scale heatmap target for associate embedding. Args: sigma (int): Sigma of heatmap Gaussian @@ -625,21 +656,20 @@ def __call__(self, results): if self.skeleton is None: assert results['ann_info']['skeleton'] is not None self.skeleton = results['ann_info']['skeleton'] - else: - assert np.array( - self.skeleton).max() < results['ann_info']['num_joints'] paf_generator = \ self._generate(results['ann_info']['heatmap_size'], self.skeleton) target_list = list() - joints_list = results['joints'] + mask_list, joints_list = results['mask'], results['joints'] for scale_id in range(results['ann_info']['num_scales']): pafs = paf_generator[scale_id](joints_list[scale_id]) target_list.append(pafs.astype(np.float32)) + mask_list[scale_id] = mask_list[scale_id].astype(np.float32) - results['target'] = target_list + results['targets'] = target_list + results['masks'] = mask_list return results @@ -651,16 +681,19 @@ class BottomUpGetImgSize: `results['ann_info']['image_size']×current_scale`. Args: - test_scale_factor (List[float]): Multi scale + test_scale_factor (List[float]): Multi scale. + max_input_size (int): Constraint of the max input size. current_scale (int): default 1 use_udp (bool): To use unbiased data processing. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
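The PAF targets generated above encode, for every limb, the unit vector from the source joint at all pixels lying within limb_width of the joint-to-joint segment. A small NumPy sketch of that rasterisation; the real pipeline additionally restricts the mask to the segment's bounding box and averages overlapping limbs, and the shapes and names here are illustrative:

import numpy as np

def paf_for_limb(src, dst, size=64, limb_width=1.0):
    # Rasterise one limb into a 2-channel PAF of shape (2, size, size).
    paf = np.zeros((2, size, size), dtype=np.float32)
    limb_vec = np.asarray(dst, np.float32) - np.asarray(src, np.float32)
    norm = np.linalg.norm(limb_vec)
    if norm == 0:
        return paf
    unit = limb_vec / norm
    xx, yy = np.meshgrid(np.arange(size), np.arange(size))
    # distance of every pixel to the line through src along the limb direction
    dist = np.abs((xx - src[0]) * unit[1] - (yy - src[1]) * unit[0])
    mask = dist < limb_width
    paf[0][mask] = unit[0]
    paf[1][mask] = unit[1]
    return paf

print(paf_for_limb((10, 10), (40, 10)).shape)  # (2, 64, 64)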
""" - def __init__(self, test_scale_factor, current_scale=1, use_udp=False): + def __init__(self, test_scale_factor, max_input_size=None, + current_scale=1, use_udp=False): self.test_scale_factor = test_scale_factor self.min_scale = min(test_scale_factor) + self.max_input_size = max_input_size self.current_scale = current_scale self.use_udp = use_udp @@ -701,7 +734,18 @@ def __call__(self, results): center = (scale_w / 2.0, scale_h / 2.0) else: center = np.array([round(w / 2.0), round(h / 2.0)]) - results['ann_info']['test_scale_factor'] = self.test_scale_factor + + # calculate the test scale factor + if self.max_input_size is not None: + test_scale_factor = np.array(self.test_scale_factor) + accept_scale_w = (test_scale_factor * w_resized) < self.max_input_size + accept_scale_h = (test_scale_factor * h_resized) < self.max_input_size + keep = (accept_scale_w * accept_scale_h) > 0 + test_scale_factor = test_scale_factor[keep].tolist() + else: + test_scale_factor = self.test_scale_factor + + results['ann_info']['test_scale_factor'] = test_scale_factor results['ann_info']['base_size'] = (w_resized, h_resized) results['ann_info']['center'] = center results['ann_info']['scale'] = np.array([scale_w, scale_h]) diff --git a/mmpose/datasets/pipelines/shared_transform.py b/mmpose/datasets/pipelines/shared_transform.py index 15e837c9a8..352418e331 100644 --- a/mmpose/datasets/pipelines/shared_transform.py +++ b/mmpose/datasets/pipelines/shared_transform.py @@ -414,29 +414,31 @@ class MultitaskGatherTarget: pipeline_indices (list[int]): Pipeline index of each head. """ - def __init__(self, pipeline_list, pipeline_indices): + def __init__(self, + pipeline_list, + pipeline_indices=None, + keys=('target', 'target_weight')): + self.keys = keys self.pipelines = [] for pipeline in pipeline_list: self.pipelines.append(Compose(pipeline)) - self.pipeline_indices = pipeline_indices + if pipeline_indices is None: + self.pipeline_indices = list(range(len(pipeline_list))) + else: + self.pipeline_indices = pipeline_indices def __call__(self, results): # generate target and target weights using all pipelines - _target, _target_weight = [], [] + pipeline_outputs = [] for pipeline in self.pipelines: - results_head = pipeline(results) - _target.append(results_head['target']) - _target_weight.append(results_head['target_weight']) - - # reorganize generated target, target_weights according - # to self.pipelines_indices - target, target_weight = [], [] - for ind in self.pipeline_indices: - target.append(_target[ind]) - target_weight.append(_target_weight[ind]) - - results['target'] = target - results['target_weight'] = target_weight + pipeline_output = pipeline(results) + pipeline_outputs.append(pipeline_output.copy()) + + for key in self.keys: + result_key = [] + for ind in self.pipeline_indices: + result_key.append(pipeline_outputs[ind].get(key, None)) + results[key] = result_key return results diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index eb3959d495..45bb35fb24 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -2,9 +2,12 @@ from .cpm import CPM from .hourglass import HourglassNet from .hrnet import HRNet +from .lightweight_openpose import LightweightOpenPoseNetwork from .mobilenet_v2 import MobileNetV2 from .mobilenet_v3 import MobileNetV3 from .mspn import MSPN +from .openpose_v1 import OpenPoseNetworkV1 +from .openpose_v2 import OpenPoseNetworkV2 from .regnet import RegNet from .resnest import ResNeSt from .resnet import 
ResNet, ResNetV1d @@ -22,5 +25,6 @@ 'AlexNet', 'HourglassNet', 'HRNet', 'MobileNetV2', 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 'MSPN', 'ResNeSt', 'VGG', - 'TCN' + 'TCN', 'OpenPoseNetworkV1', 'OpenPoseNetworkV2', + 'LightweightOpenPoseNetwork' ] diff --git a/mmpose/models/backbones/cpm.py b/mmpose/models/backbones/cpm.py index 3aa09523ed..1659589bac 100644 --- a/mmpose/models/backbones/cpm.py +++ b/mmpose/models/backbones/cpm.py @@ -14,22 +14,34 @@ class CpmBlock(nn.Module): """CpmBlock for Convolutional Pose Machine. - Generate module recursively and use BasicBlock as the base unit. - Args: in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. + channels (list): Output channels of each conv module. + kernels (list): Kernel sizes of each conv module. """ - def __init__(self, in_channels, out_channels, norm_cfg=None): + def __init__(self, + in_channels, + channels=(128, 128, 128), + kernels=(11, 11, 11), + norm_cfg=None): super().__init__() - self.model = nn.Sequential( - ConvModule( - in_channels, out_channels, 11, padding=5, norm_cfg=norm_cfg), - ConvModule( - out_channels, out_channels, 11, padding=5, norm_cfg=norm_cfg), - ConvModule( - out_channels, out_channels, 11, padding=5, norm_cfg=norm_cfg)) + + assert len(channels) == len(kernels) + layers = [] + for i in range(len(channels)): + if i == 0: + input_channels = in_channels + else: + input_channels = channels[i - 1] + layers.append( + ConvModule( + input_channels, + channels[i], + kernels[i], + padding=(kernels[i] - 1) // 2, + norm_cfg=norm_cfg)) + self.model = nn.Sequential(*layers) def forward(self, x): """Model forward function.""" @@ -107,8 +119,11 @@ def __init__(self, nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) self.cpm_stages = nn.ModuleList([ - CpmBlock(middle_channels + out_channels, feat_channels, norm_cfg) - for _ in range(num_stages - 1) + CpmBlock( + middle_channels + out_channels, + channels=[feat_channels, feat_channels, feat_channels], + kernels=[11, 11, 11], + norm_cfg=norm_cfg) for _ in range(num_stages - 1) ]) self.middle_conv = nn.ModuleList([ diff --git a/mmpose/models/backbones/lightweight_openpose.py b/mmpose/models/backbones/lightweight_openpose.py new file mode 100644 index 0000000000..3ca73ab22b --- /dev/null +++ b/mmpose/models/backbones/lightweight_openpose.py @@ -0,0 +1,487 @@ +import copy + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint + + +class CpmLayer(nn.Module): + """A CPM-type layer. + + Args: + in_channels (int): The input channels. + out_channels (int): The output channels. 
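CpmLayer below builds its body from 3x3 depthwise plus 1x1 pointwise ELU conv pairs around a residual sum. A minimal plain-PyTorch sketch of one such pair (the channel count is illustrative):

import torch
import torch.nn as nn

class DepthwiseSeparable(nn.Module):
    # One 3x3 depthwise + 1x1 pointwise conv pair with ELU activations.
    def __init__(self, channels):
        super().__init__()
        self.depthwise = nn.Conv2d(channels, channels, 3, padding=1,
                                   groups=channels, bias=False)
        self.pointwise = nn.Conv2d(channels, channels, 1, bias=False)
        self.act = nn.ELU(inplace=True)

    def forward(self, x):
        return self.act(self.pointwise(self.act(self.depthwise(x))))

x = torch.rand(1, 128, 46, 46)
print(DepthwiseSeparable(128)(x).shape)  # torch.Size([1, 128, 46, 46])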
+ """ + + def __init__(self, in_channels, out_channels): + super().__init__() + self.pre_conv = ConvModule( + in_channels, out_channels, 1, padding=0, norm_cfg=None, bias=True) + self.feat = nn.Sequential( + ConvModule( + out_channels, + out_channels, + 3, + padding=1, + groups=out_channels, + bias=False, + act_cfg=dict(type='ELU'), + norm_cfg=None), + ConvModule( + out_channels, + out_channels, + 1, + padding=0, + bias=False, + act_cfg=dict(type='ELU'), + norm_cfg=None), + ConvModule( + out_channels, + out_channels, + 3, + padding=1, + groups=out_channels, + bias=False, + act_cfg=dict(type='ELU'), + norm_cfg=None), + ConvModule( + out_channels, + out_channels, + 1, + padding=0, + bias=False, + act_cfg=dict(type='ELU'), + norm_cfg=None), + ConvModule( + out_channels, + out_channels, + 3, + padding=1, + groups=out_channels, + bias=False, + act_cfg=dict(type='ELU'), + norm_cfg=None), + ConvModule( + out_channels, + out_channels, + 1, + padding=0, + bias=False, + act_cfg=dict(type='ELU'), + norm_cfg=None)) + self.out_conv = ConvModule( + out_channels, out_channels, 3, padding=1, norm_cfg=None, bias=True) + + def forward(self, x): + x = self.pre_conv(x) + x = self.out_conv(x + self.feat(x)) + return x + + +class InitialStage(nn.Module): + """The initial stage. + + Args: + in_channels (int): The input channels. + mid_channels (int): The middle-layer channels. + out_channels_cm (int): The output channels for CM ( + confidence map, or heatmap). + out_channels_paf (int): The output channels for PAF ( + part-affinity field). + """ + + def __init__(self, in_channels, mid_channels, out_channels_cm, + out_channels_paf): + super().__init__() + self.feat = nn.Sequential( + ConvModule( + in_channels, + in_channels, + 3, + padding=1, + norm_cfg=None, + bias=True), + ConvModule( + in_channels, + in_channels, + 3, + padding=1, + norm_cfg=None, + bias=True), + ConvModule( + in_channels, + in_channels, + 3, + padding=1, + norm_cfg=None, + bias=True)) + self.cm_out_conv = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + kernel_size=1, + padding=0, + norm_cfg=None, + bias=True), + ConvModule( + mid_channels, + out_channels_cm, + kernel_size=1, + padding=0, + norm_cfg=None, + act_cfg=None, + bias=True)) + self.paf_out_conv = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + kernel_size=1, + padding=0, + norm_cfg=None, + bias=True), + ConvModule( + mid_channels, + out_channels_paf, + kernel_size=1, + padding=0, + norm_cfg=None, + act_cfg=None, + bias=True)) + + def forward(self, x): + features = self.feat(x) + cm_output = self.cm_out_conv(features) + paf_output = self.paf_out_conv(features) + return [cm_output, paf_output] + + +class RefinementStageBlock(nn.Module): + """The block for the refinement stage. + + Args: + in_channels (int): The input channels. + out_channels (int): The output channels. + norm_cfg (dict): Dictionary to construct and config norm layer. 
+ """ + + def __init__(self, + in_channels, + out_channels, + norm_cfg=dict(type='BN', requires_grad=True)): + super().__init__() + self.pre_conv = ConvModule( + in_channels, out_channels, 1, padding=0, norm_cfg=None, bias=True) + self.feat = nn.Sequential( + ConvModule( + out_channels, + out_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + bias=True), + ConvModule( + out_channels, + out_channels, + 3, + dilation=2, + padding=2, + norm_cfg=norm_cfg, + bias=True)) + + def forward(self, x): + pre_features = self.pre_conv(x) + features = self.feat(pre_features) + return pre_features + features + + +class RefinementStage(nn.Module): + """The refinement stage. + + Args: + in_channels (int): The input channels. + mid_channels (int): The middle-layer channels. + out_channels_cm (int): The output channels for CM ( + confidence map, or heatmap). + out_channels_paf (int): The output channels for PAF ( + part-affinity field). + norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + in_channels, + mid_channels, + out_channels_cm, + out_channels_paf, + norm_cfg=dict(type='BN', requires_grad=True)): + super().__init__() + self.feat = nn.Sequential( + RefinementStageBlock(in_channels, mid_channels, norm_cfg), + RefinementStageBlock(mid_channels, mid_channels, norm_cfg), + RefinementStageBlock(mid_channels, mid_channels, norm_cfg), + RefinementStageBlock(mid_channels, mid_channels, norm_cfg), + RefinementStageBlock(mid_channels, mid_channels, norm_cfg)) + self.cm_out_conv = nn.Sequential( + ConvModule( + mid_channels, + mid_channels, + 1, + padding=0, + norm_cfg=None, + bias=True), + ConvModule( + mid_channels, + out_channels_cm, + 1, + padding=0, + norm_cfg=None, + act_cfg=None, + bias=True)) + self.paf_out_conv = nn.Sequential( + ConvModule( + mid_channels, + mid_channels, + 1, + padding=0, + norm_cfg=None, + bias=True), + ConvModule( + mid_channels, + out_channels_paf, + 1, + padding=0, + norm_cfg=None, + act_cfg=None, + bias=True)) + + def forward(self, x): + features = self.feat(x) + cm_output = self.cm_out_conv(features) + paf_output = self.paf_out_conv(features) + return [cm_output, paf_output] + + +@BACKBONES.register_module() +class LightweightOpenPoseNetwork(BaseBackbone): + """Lightweight OpenPose backbone Network. + + Real-time 2D Multi-Person Pose Estimation on + CPU: Lightweight OpenPose + + More details can be found in the `paper + `__ . + + Args: + in_channels (int): The input channels. + out_channels_cm (int): The output channels for CM ( + confidence map, or heatmap). + out_channels_paf (int): The output channels for PAF ( + part-affinity field). + stem_feat_channels (int): Feature channel of the stem network. + num_stages (int): Number of stages. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import LightweightOpenPoseNetwork + >>> import torch + >>> self = LightweightOpenPoseNetwork(3, 19, 38) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 368, 368) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 19, 46, 46) + (1, 19, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + """ + + def __init__(self, + in_channels, + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=2, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == 3 + + self.num_stages = num_stages + assert self.num_stages >= 1 + + self.features = nn.Sequential( + ConvModule( + in_channels, + 32, + 3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + bias=False), + # conv_dw(32, 64) + ConvModule( + 32, 32, 3, padding=1, groups=32, bias=False, + norm_cfg=norm_cfg), + ConvModule(32, 64, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(64, 128, stride=2) + ConvModule( + 64, + 64, + 3, + stride=2, + padding=1, + groups=64, + bias=False, + norm_cfg=norm_cfg), + ConvModule(64, 128, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(128, 128) + ConvModule( + 128, + 128, + 3, + padding=1, + groups=128, + bias=False, + norm_cfg=norm_cfg), + ConvModule(128, 128, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(128, 256, stride=2) + ConvModule( + 128, + 128, + 3, + stride=2, + padding=1, + groups=128, + bias=False, + norm_cfg=norm_cfg), + ConvModule(128, 256, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(256, 256) + ConvModule( + 256, + 256, + 3, + padding=1, + groups=256, + bias=False, + norm_cfg=norm_cfg), + ConvModule(256, 256, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(256, 512) + ConvModule( + 256, + 256, + 3, + padding=1, + groups=256, + bias=False, + norm_cfg=norm_cfg), + ConvModule(256, 512, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(512, 512, dilation=2, padding=2) + ConvModule( + 512, + 512, + 3, + padding=2, + dilation=2, + groups=512, + bias=False, + norm_cfg=norm_cfg), + ConvModule(512, 512, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(512, 512) + ConvModule( + 512, + 512, + 3, + padding=1, + groups=512, + bias=False, + norm_cfg=norm_cfg), + ConvModule(512, 512, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(512, 512) + ConvModule( + 512, + 512, + 3, + padding=1, + groups=512, + bias=False, + norm_cfg=norm_cfg), + ConvModule(512, 512, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(512, 512) + ConvModule( + 512, + 512, + 3, + padding=1, + groups=512, + bias=False, + norm_cfg=norm_cfg), + ConvModule(512, 512, 1, padding=0, bias=False, norm_cfg=norm_cfg), + # conv_dw(512, 512) + ConvModule( + 512, + 512, + 3, + padding=1, + groups=512, + bias=False, + norm_cfg=norm_cfg), + ConvModule(512, 512, 1, padding=0, bias=False, norm_cfg=norm_cfg)) + + self.cpm = CpmLayer(512, stem_feat_channels) + + self.initial_stage = InitialStage(stem_feat_channels, 512, + out_channels_cm, out_channels_paf) + self.refinement_stages = nn.ModuleList() + for idx in range(num_stages - 1): + self.refinement_stages.append( + RefinementStage( + stem_feat_channels + out_channels_cm + out_channels_paf, + stem_feat_channels, + out_channels_cm, + out_channels_paf, + norm_cfg=norm_cfg)) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
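The init_weights methods added in this patch follow the usual convention: convolutions get a small normal initialisation and norm layers are set to identity before an optional checkpoint is loaded. An equivalent plain-PyTorch sketch (the patch itself uses the mmcv normal_init/constant_init helpers):

import torch.nn as nn

def init_pose_backbone(module):
    for m in module.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.normal_(m.weight, std=0.001)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

net = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.BatchNorm2d(32))
init_pose_backbone(net)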
+ """ + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + stem_feat = self.cpm(self.features(x)) + + cm_outputs = [] + paf_outputs = [] + + cm_output, paf_output = self.initial_stage(stem_feat) + cm_outputs.append(cm_output) + paf_outputs.append(paf_output) + + for refinement_stage in self.refinement_stages: + cm_output, paf_output = refinement_stage( + torch.cat([stem_feat, cm_outputs[-1], paf_outputs[-1]], dim=1)) + cm_outputs.append(cm_output) + paf_outputs.append(paf_output) + + return [*cm_outputs, *paf_outputs] diff --git a/mmpose/models/backbones/openpose_v1.py b/mmpose/models/backbones/openpose_v1.py new file mode 100644 index 0000000000..3a49fbc0ef --- /dev/null +++ b/mmpose/models/backbones/openpose_v1.py @@ -0,0 +1,186 @@ +import copy + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .cpm import CpmBlock +from .utils import load_checkpoint + + +@BACKBONES.register_module() +class OpenPoseNetworkV1(BaseBackbone): + """OpenPose backbone Network. + + Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields. + More details can be found in the `paper + `__ . + + Based on the officially released model + 'https://github.com/CMU-Perceptual-Computing-Lab/openpose/ + blob/master/models/pose/coco/pose_deploy_linevec.prototxt' + + Args: + in_channels (int): The input channels. + out_channels_cm (int): The output channels for CM ( + confidence map, or heatmap). + out_channels_paf (int): The output channels for PAF ( + part-affinity field). + stem_feat_channels (int): Feature channel of the stem network. + num_stages (int): Number of stages. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import OpenPoseNetworkV1 + >>> import torch + >>> self = OpenPoseNetwork(3, 19, 38) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 368, 368) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 19, 46, 46) + (1, 19, 46, 46) + (1, 19, 46, 46) + (1, 19, 46, 46) + (1, 19, 46, 46) + (1, 19, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + """ + + def __init__(self, + in_channels, + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=6, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == 3 + + self.num_stages = num_stages + assert self.num_stages >= 1 + + self.features = nn.Sequential( + ConvModule( + in_channels, 64, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(64, 64, 3, padding=1, norm_cfg=norm_cfg, bias=True), + nn.MaxPool2d(kernel_size=2, stride=2, padding=0), + ConvModule(64, 128, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(128, 128, 3, padding=1, norm_cfg=norm_cfg, bias=True), + nn.MaxPool2d(kernel_size=2, stride=2, padding=0), + ConvModule(128, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(256, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(256, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(256, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + nn.MaxPool2d(kernel_size=2, stride=2, padding=0), + ConvModule(256, 512, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(512, 512, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(512, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule( + 256, + stem_feat_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + bias=True)) + + # stage 0 + self.cm_stages = nn.ModuleList([ + CpmBlock(stem_feat_channels, [ + stem_feat_channels, stem_feat_channels, stem_feat_channels, 512 + ], [3, 3, 3, 1], norm_cfg) + ]) + self.paf_stages = nn.ModuleList([ + CpmBlock(stem_feat_channels, [ + stem_feat_channels, stem_feat_channels, stem_feat_channels, 512 + ], [3, 3, 3, 1], norm_cfg) + ]) + + # stage 1 to n-1 + for _ in range(1, self.num_stages): + self.cm_stages.append( + CpmBlock( + stem_feat_channels + out_channels_cm + out_channels_paf, [ + stem_feat_channels, stem_feat_channels, + stem_feat_channels, stem_feat_channels, + stem_feat_channels, stem_feat_channels + ], [7, 7, 7, 7, 7, 1], norm_cfg)) + self.paf_stages.append( + CpmBlock( + stem_feat_channels + out_channels_cm + out_channels_paf, [ + stem_feat_channels, stem_feat_channels, + stem_feat_channels, stem_feat_channels, + stem_feat_channels, stem_feat_channels + ], [7, 7, 7, 7, 7, 1], norm_cfg)) + + self.cm_out_convs = nn.ModuleList() + self.paf_out_convs = nn.ModuleList() + + for i in range(self.num_stages): + if i == 0: + input_channels = 512 + else: + input_channels = stem_feat_channels + self.cm_out_convs.append( + ConvModule(input_channels, out_channels_cm, 1, act_cfg=None)) + self.paf_out_convs.append( + ConvModule(input_channels, out_channels_paf, 1, act_cfg=None)) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
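The forward pass of OpenPoseNetworkV1 that follows feeds every stage after the first with the stem features concatenated with the previous stage's CM and PAF predictions. A stripped-down sketch of that wiring, with 1x1 convs standing in for the CpmBlock stages and channel counts matching the defaults above:

import torch
import torch.nn as nn

stem_ch, cm_ch, paf_ch = 128, 19, 38
stem = torch.rand(1, stem_ch, 46, 46)

first_cm = nn.Conv2d(stem_ch, cm_ch, 1)
first_paf = nn.Conv2d(stem_ch, paf_ch, 1)
refine_cm = nn.Conv2d(stem_ch + cm_ch + paf_ch, cm_ch, 1)
refine_paf = nn.Conv2d(stem_ch + cm_ch + paf_ch, paf_ch, 1)

cm, paf = first_cm(stem), first_paf(stem)
for _ in range(5):  # stages 1..5 refine the previous predictions
    x = torch.cat([stem, cm, paf], dim=1)
    cm, paf = refine_cm(x), refine_paf(x)
print(cm.shape, paf.shape)  # (1, 19, 46, 46) (1, 38, 46, 46)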
+ """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + stem_feat = self.features(x) + out_feats = [] + out_feats.append(stem_feat) + + cm_outputs = [] + paf_outputs = [] + + for ind in range(self.num_stages): + cm_stage = self.cm_stages[ind] + paf_stage = self.paf_stages[ind] + + cm_out_conv = self.cm_out_convs[ind] + paf_out_conv = self.paf_out_convs[ind] + + cm_output = cm_out_conv(cm_stage(out_feats[-1])) + cm_outputs.append(cm_output) + paf_output = paf_out_conv(paf_stage(out_feats[-1])) + paf_outputs.append(paf_output) + + out_feat = torch.cat([stem_feat, cm_output, paf_output], 1) + + out_feats.append(out_feat) + + return [*cm_outputs, *paf_outputs] diff --git a/mmpose/models/backbones/openpose_v2.py b/mmpose/models/backbones/openpose_v2.py new file mode 100644 index 0000000000..4166c698f7 --- /dev/null +++ b/mmpose/models/backbones/openpose_v2.py @@ -0,0 +1,303 @@ +import copy + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint + + +class MconvBlock(nn.Module): + """MconvBlock for replacing convolutions of 7x7 kernel. + + Args: + in_channels (int): Input channels of this block. + channels (list): Output channels of each conv module. + kernels (list): Kernel sizes of each conv module. + """ + + def __init__(self, + in_channels, + channels=(96, 96, 96), + kernels=(3, 3, 3), + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU')): + super().__init__() + + assert len(channels) == len(kernels) + + self.num_layers = len(channels) + + self.model = nn.ModuleList() + for i in range(self.num_layers): + if i == 0: + input_channels = in_channels + else: + input_channels = channels[i - 1] + self.model.append( + ConvModule( + input_channels, + channels[i], + kernels[i], + padding=(kernels[i] - 1) // 2, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + """Model forward function.""" + feat = [] + feat.append(x) + for i in range(self.num_layers): + feat.append(self.model[i](feat[-1])) + out = torch.cat([*feat[1:]], 1) + return out + + +class MconvStage(nn.Module): + """MconvStage. + + Args: + in_channels (int): Input channels of this block. + channels (list): Output channels of each conv module. + kernels (list): Kernel sizes of each conv module. 
+ """ + + def __init__(self, + in_channels, + out_channels, + num_blocks=5, + block_channels=(96, 96, 96), + block_kernels=(3, 3, 3), + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU')): + super().__init__() + + layers = [] + for i in range(num_blocks): + + if i == 0: + input_channels = in_channels + else: + input_channels = sum(block_channels) + + layers.append( + MconvBlock( + input_channels, + channels=block_channels, + kernels=block_kernels, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU'))) + + layers.append( + ConvModule( + sum(block_channels), + out_channels, + kernel_size=1, + padding=0, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.model = nn.Sequential(*layers) + + def forward(self, x): + """Model forward function.""" + out = self.model(x) + return out + + +@BACKBONES.register_module() +class OpenPoseNetworkV2(BaseBackbone): + """OpenPose backbone Network. + + Open{P}ose: realtime multi-person 2{D} pose estimation + using {P}art {A}ffinity {F}ields. + More details can be found in the `paper + `__ . + + Based on the officially released model + 'https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/' + 'openpose/master/models/pose/body_25/pose_deploy.prototxt' + + Args: + in_channels (int): The input channels. + out_channels_cm (int): The output channels for CM ( + confidence map, or heatmap). + out_channels_paf (int): The output channels for PAF ( + part-affinity field). + stem_feat_channels (int): Feature channel of the stem network. + num_stages (int): Number of stages. + stage_types (list): Types can be 'CM' or 'PAF'. + num_blocks (int|list): Number of blocks in each stage. If + `num_blocks' is int, the same `num_blocks' will be used + for all stages. + block_channels (int|list): Number of block channels in each + stage. If `block_channels' is int, the same `block_channels' + will be used for all stages. + norm_cfg (dict): Dictionary to construct and config norm layer. + act_cfg (dict): Config dict for activation layer. + + Example: + >>> from mmpose.models import OpenPoseNetworkV2 + >>> import torch + >>> self = OpenPoseNetworkV2(3) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 368, 368) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 38, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + (1, 38, 46, 46) + (1, 19, 46, 46) + """ + + def __init__(self, + in_channels, + out_channels_cm=19, + out_channels_paf=38, + stem_feat_channels=128, + num_stages=6, + stage_types=('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM'), + num_blocks=5, + block_channels=96, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU')): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == 3 + assert num_stages == len(stage_types) + + if isinstance(num_blocks, int): + num_blocks = [num_blocks] * num_stages + if isinstance(block_channels, int): + block_channels = [block_channels] * num_stages + + self.num_stages = num_stages + assert self.num_stages >= 1 + + self.features = nn.Sequential( + ConvModule( + in_channels, 64, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(64, 64, 3, padding=1, norm_cfg=norm_cfg, bias=True), + nn.MaxPool2d(kernel_size=2, stride=2, padding=0), + ConvModule(64, 128, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(128, 128, 3, padding=1, norm_cfg=norm_cfg, bias=True), + nn.MaxPool2d(kernel_size=2, stride=2, padding=0), + ConvModule(128, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(256, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(256, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule(256, 256, 3, padding=1, norm_cfg=norm_cfg, bias=True), + nn.MaxPool2d(kernel_size=2, stride=2, padding=0), + ConvModule(256, 512, 3, padding=1, norm_cfg=norm_cfg, bias=True), + ConvModule( + 512, + 512, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=True), + ConvModule( + 512, + 256, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=True), + ConvModule( + 256, + stem_feat_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=True)) + + self.stages = nn.ModuleList() + self.out_convs = nn.ModuleList() + + for i, stage_type in enumerate(stage_types): + if i == 0: + input_channels = stem_feat_channels + else: + if stage_types[i - 1] == 'CM': + input_channels = stem_feat_channels + out_channels_cm + else: + # stage_types[i-1] == 'PAF': + input_channels = stem_feat_channels + out_channels_paf + + if stage_type.upper() == 'CM': + self.stages.append( + MconvStage( + input_channels, + 256, + num_blocks=num_blocks[i], + block_channels=[block_channels[i]] * num_blocks[i], + block_kernels=[3] * num_blocks[i], + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU'))) + self.out_convs.append( + ConvModule(256, out_channels_cm, 1, act_cfg=None)) + + elif stage_type.upper() == 'PAF': + self.stages.append( + MconvStage( + input_channels, + 256, + num_blocks=num_blocks[i], + block_channels=[block_channels[i]] * num_blocks[i], + block_kernels=[3] * num_blocks[i], + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU'))) + self.out_convs.append( + ConvModule(256, out_channels_paf, 1, act_cfg=None)) + + else: + raise ValueError("stage_type should be either 'CM' or 'PAF'.") + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
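For clarity, how the per-stage input channels of OpenPoseNetworkV2 work out with the defaults above: stage 0 sees only the stem features, and every later stage sees the stem concatenated with the previous stage's output (PAF or CM, depending on stage_types):

stem_feat_channels, out_cm, out_paf = 128, 19, 38
stage_types = ('PAF', 'PAF', 'PAF', 'PAF', 'PAF', 'CM')

in_channels = []
for i, _ in enumerate(stage_types):
    if i == 0:
        in_channels.append(stem_feat_channels)
    elif stage_types[i - 1] == 'CM':
        in_channels.append(stem_feat_channels + out_cm)
    else:
        in_channels.append(stem_feat_channels + out_paf)
print(in_channels)  # [128, 166, 166, 166, 166, 166]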
+ """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + stem_feat = self.features(x) + out_feats = [] + out_feats.append(stem_feat) + + outputs = [] + + for ind in range(self.num_stages): + single_stage = self.stages[ind] + single_out_conv = self.out_convs[ind] + output = single_out_conv(single_stage(out_feats[-1])) + outputs.append(output) + + out_feat = torch.cat([stem_feat, output], 1) + out_feats.append(out_feat) + + return [*outputs] diff --git a/mmpose/models/detectors/__init__.py b/mmpose/models/detectors/__init__.py index 5420dfd1c6..cfa964e181 100644 --- a/mmpose/models/detectors/__init__.py +++ b/mmpose/models/detectors/__init__.py @@ -2,10 +2,11 @@ from .interhand_3d import Interhand3D from .mesh import ParametricMesh from .multi_task import MultiTask +from .paf import PartAffinityField from .pose_lifter import PoseLifter from .top_down import TopDown __all__ = [ 'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask', - 'PoseLifter', 'Interhand3D' + 'PoseLifter', 'Interhand3D', 'PartAffinityField' ] diff --git a/mmpose/models/detectors/associative_embedding.py b/mmpose/models/detectors/associative_embedding.py index 30c07a66f4..078fe1cdc8 100644 --- a/mmpose/models/detectors/associative_embedding.py +++ b/mmpose/models/detectors/associative_embedding.py @@ -5,8 +5,9 @@ from mmcv.image import imwrite from mmcv.visualization.image import imshow -from mmpose.core.evaluation import (aggregate_results, get_group_preds, - get_multi_stage_outputs) +from mmpose.core.evaluation import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, get_group_preds, + split_ae_outputs) from mmpose.core.post_processing.group import HeatmapParser from mmpose.core.visualization import imshow_keypoints from .. import builder @@ -48,7 +49,6 @@ def __init__(self, self.backbone = builder.build_backbone(backbone) if keypoint_head is not None: - if 'loss_keypoint' not in keypoint_head and loss_pose is not None: warnings.warn( '`loss_pose` for BottomUp is deprecated, ' @@ -98,11 +98,11 @@ def forward(self, heatmaps height: H max_num_people: M Args: - img(torch.Tensor[NxCximgHximgW]): Input image. - targets(List(torch.Tensor[NxKxHxW])): Multi-scale target heatmaps. - masks(List(torch.Tensor[NxHxW])): Masks of multi-scale target + img (torch.Tensor[NxCximgHximgW]): Input image. + targets (List(torch.Tensor[NxKxHxW])): Multi-scale target heatmaps. + masks (List(torch.Tensor[NxHxW])): Masks of multi-scale target heatmaps - joints(List(torch.Tensor[NxMxKx2])): Joints of multi-scale target + joints (List(torch.Tensor[NxMxKx2])): Joints of multi-scale target heatmaps for ae loss img_metas(dict):Information about val&test By default this includes: @@ -144,13 +144,13 @@ def forward_train(self, img, targets, masks, joints, img_metas, **kwargs): max_num_people: M Args: - img(torch.Tensor[NxCximgHximgW]): Input image. - targets(List(torch.Tensor[NxKxHxW])): Multi-scale target heatmaps. - masks(List(torch.Tensor[NxHxW])): Masks of multi-scale target + img (torch.Tensor[NxCximgHximgW]): Input image. + targets (List(torch.Tensor[NxKxHxW])): Multi-scale target heatmaps. 
+ masks (List(torch.Tensor[NxHxW])): Masks of multi-scale target heatmaps - joints(List(torch.Tensor[NxMxKx2])): Joints of multi-scale target + joints (List(torch.Tensor[NxMxKx2])): Joints of multi-scale target heatmaps for ae loss - img_metas(dict):Information about val&test + img_metas (dict):Information about val&test By default this includes: - "image_file": image path - "aug_data": input @@ -225,8 +225,9 @@ def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): result = {} - aggregated_heatmaps = None - tags_list = [] + scale_heatmaps_list = [] + scale_tags_list = [] + for idx, s in enumerate(sorted(test_scale_factor, reverse=True)): image_resized = aug_data[idx].to(img.device) @@ -234,47 +235,82 @@ def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): if self.with_keypoint: outputs = self.keypoint_head(features) + heatmaps, tags = split_ae_outputs(outputs, + self.test_cfg['num_joints'], + self.test_cfg['with_heatmaps'], + self.test_cfg['with_ae']) + if self.test_cfg.get('flip_test', True): # use flip test features_flipped = self.backbone( torch.flip(image_resized, [3])) if self.with_keypoint: outputs_flipped = self.keypoint_head(features_flipped) + + heatmaps_flipped, tags_flipped = split_ae_outputs( + outputs_flipped, self.test_cfg['num_joints'], + self.test_cfg['with_heatmaps'], self.test_cfg['with_ae']) + + heatmaps_flipped = flip_feature_maps( + heatmaps_flipped, flip_index=img_metas['flip_index']) + if self.test_cfg['tag_per_joint']: + tags_flipped = flip_feature_maps( + tags_flipped, flip_index=img_metas['flip_index']) + else: + tags_flipped = flip_feature_maps( + tags_flipped, flip_index=None, flip_output=True) + else: - outputs_flipped = None - - _, heatmaps, tags = get_multi_stage_outputs( - outputs, - outputs_flipped, - self.test_cfg['num_joints'], - self.test_cfg['with_heatmaps'], - self.test_cfg['with_ae'], - self.test_cfg['tag_per_joint'], - img_metas['flip_index'], - self.test_cfg['project2image'], - base_size, - align_corners=self.use_udp) - - aggregated_heatmaps, tags_list = aggregate_results( - s, - aggregated_heatmaps, - tags_list, + heatmaps_flipped = None + tags_flipped = None + + aggregated_heatmaps = aggregate_stage_flip( heatmaps, + heatmaps_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='average', + aggregate_flip='average') + + aggregated_tags = aggregate_stage_flip( tags, - test_scale_factor, - self.test_cfg['project2image'], - self.test_cfg.get('flip_test', True), - align_corners=self.use_udp) + tags_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='concat', + aggregate_flip='concat') + + if s == 1 or len(test_scale_factor) == 1: + if isinstance(aggregated_tags, list): + scale_tags_list.extend(aggregated_tags) + else: + scale_tags_list.append(aggregated_tags) + + if isinstance(aggregated_heatmaps, list): + scale_heatmaps_list.extend(aggregated_heatmaps) + else: + scale_heatmaps_list.append(aggregated_heatmaps) + + aggregated_heatmaps = aggregate_scale( + scale_heatmaps_list, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_scale='average') - # average heatmaps of different scales - aggregated_heatmaps = aggregated_heatmaps / float( - len(test_scale_factor)) - tags = torch.cat(tags_list, dim=4) + aggregated_tags = aggregate_scale( + scale_tags_list, + 
align_corners=self.test_cfg.get('align_corners', True), + aggregate_scale='unsqueeze_concat') # perform grouping - grouped, scores = self.parser.parse(aggregated_heatmaps, tags, + grouped, scores = self.parser.parse(aggregated_heatmaps, + aggregated_tags, self.test_cfg['adjust'], - self.test_cfg['refine']) + self.test_cfg['refine'], + self.test_cfg.get('filter', False)) preds = get_group_preds( grouped, diff --git a/mmpose/models/detectors/paf.py b/mmpose/models/detectors/paf.py new file mode 100644 index 0000000000..2d06fb41f8 --- /dev/null +++ b/mmpose/models/detectors/paf.py @@ -0,0 +1,400 @@ +import warnings + +import mmcv +import torch +from mmcv.image import imwrite +from mmcv.visualization.image import imshow + +from mmpose.core.evaluation import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, + flip_part_affinity_fields, get_group_preds) +from mmpose.core.post_processing.group import PAFParser +from mmpose.core.visualization import imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class PartAffinityField(BasePose): + """Bottom-up PAF (part affinity field) pose detectors. + + Paper ref: Cao, Zhe, et al. "OpenPose: realtime multi-person + 2D pose estimation using Part Affinity Fields." (TPAMI'2019) + + Args: + backbone (dict): Backbone modules to extract feature. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. + """ + + def __init__(self, + backbone, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + if keypoint_head is not None: + if 'loss_keypoint' not in keypoint_head and loss_pose is not None: + warnings.warn( + '`loss_pose` for BottomUp is deprecated, ' + 'use `loss_keypoint` for heads instead. See ' + 'https://github.com/open-mmlab/mmpose/pull/382' + ' for more information.', DeprecationWarning) + keypoint_head['loss_keypoint'] = loss_pose + + self.keypoint_head = builder.build_head(keypoint_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.use_udp = test_cfg.get('use_udp', False) + self.parser = PAFParser(self.test_cfg) + self.init_weights(pretrained=pretrained) + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_keypoint: + self.keypoint_head.init_weights() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img=None, + targets=None, + masks=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss is True. + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps weight: W + heatmaps height: H + max_num_people: M + Args: + img(torch.Tensor[NxCximgHximgW]): Input image. 
+ targets (list(list)): List of heatmaps + and pafs, each of which multi-scale targets. + masks (list(list(torch.Tensor[NxHxW]))): Masks of multi-scale + target heatmaps. + img_metas(dict):Information about val&test + By default this includes: + - "image_file": image path + - "aug_data": input + - "test_scale_factor": test scale factor + - "base_size": base size of input + - "center": center of image + - "scale": scale of image + - "flip_index": flip index of keypoints + + return loss(bool): Option to 'return_loss'. 'return_loss=True' for + training, 'return_loss=False' for validation & test + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if 'return_loss' is true, then return losses. + Otherwise, return predicted poses, scores, image + paths and heatmaps. + """ + + if return_loss: + return self.forward_train(img, targets, masks, img_metas, **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, targets, masks, img_metas, **kwargs): + """Forward the bottom-up model and calculate the loss. + + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps weight: W + heatmaps height: H + max_num_people: M + + Args: + img (torch.Tensor[NxCximgHximgW]): Input image. + targets (list(list)): List of heatmaps + and pafs, each of which multi-scale targets. + masks (list(list(torch.Tensor[NxHxW]))): Masks of multi-scale + target heatmaps. + img_metas (dict):Information about val&test + By default this includes: + - "image_file": image path + - "aug_data": input + - "test_scale_factor": test scale factor + - "base_size": base size of input + - "center": center of image + - "scale": scale of image + - "flip_index": flip index of keypoints + + Returns: + dict: The total loss for bottom-up + """ + + output = self.backbone(img) + + if self.with_keypoint: + output = self.keypoint_head(output) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, targets, masks) + losses.update(keypoint_losses) + + return losses + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Outputs. + """ + output = self.backbone(img) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Inference the bottom-up model. 
+ + Note: + Batchsize = N (currently support batchsize = 1) + num_img_channel: C + img_width: imgW + img_height: imgH + + Args: + flip_index (List(int)): + aug_data (List(Tensor[NxCximgHximgW])): Multi-scale image + test_scale_factor (List(float)): Multi-scale factor + base_size (Tuple(int)): Base size of image when scale is 1 + center (np.ndarray): center of image + scale (np.ndarray): the scale of image + """ + assert img.size(0) == 1 + assert len(img_metas) == 1 + + img_metas = img_metas[0] + + aug_data = img_metas['aug_data'] + + test_scale_factor = img_metas['test_scale_factor'] + base_size = img_metas['base_size'] + center = img_metas['center'] + scale = img_metas['scale'] + + result = {} + + scale_heatmaps_list = [] + scale_pafs_list = [] + + for idx, s in enumerate(sorted(test_scale_factor, reverse=True)): + image_resized = aug_data[idx].to(img.device) + + features = self.backbone(image_resized) + if self.with_keypoint: + outputs = self.keypoint_head(features) + # ignore back-ground confidence maps + heatmaps = [ + hm[:, :-1] if self.test_cfg['with_bg'] else hm + for hm in outputs['heatmaps'][-1] + ] + pafs = outputs['pafs'][-1] + + if self.test_cfg.get('flip_test', True): + # use flip test + features_flipped = self.backbone( + torch.flip(image_resized, [3])) + if self.with_keypoint: + outputs_flipped = self.keypoint_head(features_flipped) + # ignore back-ground confidence maps + heatmaps_flipped = [ + hm[:, :-1] if self.test_cfg['with_bg'] else hm + for hm in outputs_flipped['heatmaps'][-1] + ] + pafs_flipped = outputs_flipped['pafs'][-1] + + heatmaps_flipped = flip_feature_maps( + heatmaps_flipped, flip_index=img_metas['flip_index']) + pafs_flipped = flip_part_affinity_fields( + pafs_flipped, + flip_index=img_metas['flip_index'], + skeleton=img_metas['skeleton']) + + else: + heatmaps_flipped = None + pafs_flipped = None + + aggregated_heatmaps = aggregate_stage_flip( + heatmaps, + heatmaps_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='average', + aggregate_flip='average') + + aggregated_pafs = aggregate_stage_flip( + pafs, + pafs_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='average', + aggregate_flip='average') + + if isinstance(aggregated_pafs, list): + scale_pafs_list.extend(aggregated_pafs) + else: + scale_pafs_list.append(aggregated_pafs) + + if isinstance(aggregated_heatmaps, list): + scale_heatmaps_list.extend(aggregated_heatmaps) + else: + scale_heatmaps_list.append(aggregated_heatmaps) + + # average heatmaps of different scales + aggregated_heatmaps = aggregate_scale( + scale_heatmaps_list, + aggregate_scale='average', + align_corners=self.test_cfg.get('align_corners', True)) + + aggregated_pafs = aggregate_scale( + scale_pafs_list, + aggregate_scale='average', + align_corners=self.test_cfg.get('align_corners', True)) + + # perform grouping + grouped, scores = self.parser.parse(aggregated_heatmaps, + aggregated_pafs, + img_metas['skeleton'], + self.test_cfg['adjust'], + self.test_cfg['refine'], + self.test_cfg.get('filter', False)) + + preds = get_group_preds( + grouped, + center, + scale, [aggregated_heatmaps.size(3), + aggregated_heatmaps.size(2)], + use_udp=self.use_udp) + + image_paths = [] + image_paths.append(img_metas['image_file']) + + if return_heatmap: + output_heatmap = 
aggregated_heatmaps.detach().cpu().numpy() + else: + output_heatmap = None + + result['preds'] = preds + result['scores'] = scores + result['image_paths'] = image_paths + result['output_heatmap'] = output_heatmap + + return result + + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color=None, + pose_kpt_color=None, + pose_limb_color=None, + radius=4, + thickness=1, + font_scale=0.5, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_limb_color (np.array[Mx3]): Color of M limbs. + If None, do not draw limbs. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized image only if not `show` or `out_file` + """ + + img = mmcv.imread(img) + img = img.copy() + img_h, img_w, _ = img.shape + + pose_result = [] + for res in result: + pose_result.append(res['keypoints']) + + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_limb_color, radius, thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py index 5f4da4484b..fa9727dae2 100644 --- a/mmpose/models/heads/__init__.py +++ b/mmpose/models/heads/__init__.py @@ -1,8 +1,10 @@ from .ae_higher_resolution_head import AEHigherResolutionHead from .ae_simple_head import AESimpleHead +from .deconv_head import DeconvHead from .deeppose_regression_head import DeepposeRegressionHead from .hmr_head import HMRMeshHead from .interhand_3d_head import Interhand3DHead +from .paf_head import PAFHead from .temporal_regression_head import TemporalRegressionHead from .topdown_heatmap_base_head import TopdownHeatmapBaseHead from .topdown_heatmap_multi_stage_head import (TopdownHeatmapMSMUHead, @@ -13,5 +15,6 @@ 'TopdownHeatmapSimpleHead', 'TopdownHeatmapMultiStageHead', 'TopdownHeatmapMSMUHead', 'TopdownHeatmapBaseHead', 'AEHigherResolutionHead', 'AESimpleHead', 'DeepposeRegressionHead', - 'TemporalRegressionHead', 'Interhand3DHead', 'HMRMeshHead' + 'TemporalRegressionHead', 'Interhand3DHead', 'HMRMeshHead', 'PAFHead', + 'DeconvHead' ] diff --git a/mmpose/models/heads/ae_higher_resolution_head.py b/mmpose/models/heads/ae_higher_resolution_head.py index a4c7a55036..fc510f9f86 100644 --- a/mmpose/models/heads/ae_higher_resolution_head.py +++ b/mmpose/models/heads/ae_higher_resolution_head.py @@ -171,7 +171,7 @@ def _get_deconv_cfg(deconv_kernel): return deconv_kernel, padding, output_padding - def get_loss(self, output, targets, masks, joints): + def get_loss(self, outputs, targets, masks, joints): """Calculate bottom-up keypoint loss. 
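# ---- Illustrative sketch, not part of this diff: building the newly exported heads ----
# With DeconvHead and PAFHead registered in mmpose/models/heads/__init__.py above,
# either head can be constructed from a config dict through the HEADS registry.
# The channel and spatial sizes are arbitrary example values; assumes this branch of
# mmpose (with DeconvHead and MaskedMSELoss) is installed.
import torch

from mmpose.models.builder import build_head

head = build_head(
    dict(
        type='DeconvHead',
        in_channels=32,
        out_channels=17,
        num_deconv_layers=0,  # no upsampling, only the final 1x1 conv
        loss_keypoint=dict(type='MaskedMSELoss', use_mask=True)))
out = head(torch.randn(1, 32, 16, 16))
assert out[0].shape == (1, 17, 16, 16)  # forward() returns a single-entry list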
Note: @@ -182,18 +182,18 @@ def get_loss(self, output, targets, masks, joints): heatmaps weight: W Args: - output (torch.Tensor[NxKxHxW]): Output heatmaps. - targets(List(torch.Tensor[NxKxHxW])): Multi-scale target heatmaps. - masks(List(torch.Tensor[NxHxW])): Masks of multi-scale target - heatmaps - joints(List(torch.Tensor[NxMxKx2])): Joints of multi-scale target - heatmaps for ae loss + outputs (List(torch.Tensor[NxKxHxW])): Multi-scale output heatmaps. + targets (List(torch.Tensor[NxKxHxW])): Multi-scale target heatmaps. + masks (List(torch.Tensor[NxHxW])): Masks of multi-scale target + heatmaps + joints (List(torch.Tensor[NxMxKx2])): Joints of multi-scale target + heatmaps for ae loss """ losses = dict() heatmaps_losses, push_losses, pull_losses = self.loss( - output, targets, masks, joints) + outputs, targets, masks, joints) for idx in range(len(targets)): if heatmaps_losses[idx] is not None: diff --git a/mmpose/models/heads/ae_simple_head.py b/mmpose/models/heads/ae_simple_head.py index 6b90eecd3e..058e2303e6 100644 --- a/mmpose/models/heads/ae_simple_head.py +++ b/mmpose/models/heads/ae_simple_head.py @@ -1,13 +1,9 @@ -import torch.nn as nn -from mmcv.cnn import (build_conv_layer, build_upsample_layer, constant_init, - normal_init) - -from mmpose.models.builder import build_loss from ..builder import HEADS +from .deconv_head import DeconvHead @HEADS.register_module() -class AESimpleHead(nn.Module): +class AESimpleHead(DeconvHead): """Associative embedding simple head. paper ref: Alejandro Newell et al. "Associative Embedding: End-to-end Learning for Joint Detection @@ -39,53 +35,23 @@ def __init__(self, with_ae_loss=None, extra=None, loss_keypoint=None): - super().__init__() - - self.loss = build_loss(loss_keypoint) - self.in_channels = in_channels dim_tag = num_joints if tag_per_joint else 1 if with_ae_loss[0]: out_channels = num_joints + dim_tag else: out_channels = num_joints - if extra is not None and not isinstance(extra, dict): - raise TypeError('extra should be dict or None.') - - if num_deconv_layers > 0: - self.deconv_layers = self._make_deconv_layer( - num_deconv_layers, - num_deconv_filters, - num_deconv_kernels, - ) - elif num_deconv_layers == 0: - self.deconv_layers = nn.Identity() - else: - raise ValueError( - f'num_deconv_layers ({num_deconv_layers}) should >= 0.') - - if extra is not None and 'final_conv_kernel' in extra: - assert extra['final_conv_kernel'] in [1, 3] - if extra['final_conv_kernel'] == 3: - padding = 1 - else: - padding = 0 - kernel_size = extra['final_conv_kernel'] - else: - kernel_size = 1 - padding = 0 - - self.final_layer = build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=num_deconv_filters[-1] - if num_deconv_layers > 0 else in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=1, - padding=padding) + super().__init__( + in_channels, + out_channels, + num_deconv_layers=num_deconv_layers, + num_deconv_filters=num_deconv_filters, + num_deconv_kernels=num_deconv_kernels, + extra=extra, + loss_keypoint=loss_keypoint) - def get_loss(self, output, targets, masks, joints): + def get_loss(self, outputs, targets, masks, joints): """Calculate bottom-up keypoint loss. Note: @@ -96,18 +62,18 @@ def get_loss(self, output, targets, masks, joints): heatmaps weight: W Args: - output (torch.Tensor[NxKxHxW]): Output heatmaps. - targets(List(torch.Tensor[NxKxHxW])): Multi-scale target heatmaps. 
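# Worked example (illustrative, not in the diff) of the out_channels computation in
# AESimpleHead.__init__ above: with num_joints=17, tag_per_joint=True and
# with_ae_loss=[True], dim_tag = 17 and out_channels = 17 + 17 = 34 (heatmap channels
# plus per-joint tag channels). With tag_per_joint=False the tag dimension collapses
# to 1 (out_channels = 18), and with with_ae_loss=[False] only the 17 heatmap channels
# remain. The resulting value is what gets passed on to DeconvHead.__init__.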
- masks(List(torch.Tensor[NxHxW])): Masks of multi-scale target - heatmaps + outputs (list(torch.Tensor[NxKxHxW])): Multi-scale output heatmaps. + targets (List(torch.Tensor[NxKxHxW])): Multi-scale target heatmaps. + masks (List(torch.Tensor[NxHxW])): Masks of multi-scale target + heatmaps joints(List(torch.Tensor[NxMxKx2])): Joints of multi-scale target - heatmaps for ae loss + heatmaps for ae loss """ losses = dict() heatmaps_losses, push_losses, pull_losses = self.loss( - output, targets, masks, joints) + outputs, targets, masks, joints) for idx in range(len(targets)): if heatmaps_losses[idx] is not None: @@ -130,74 +96,3 @@ def get_loss(self, output, targets, masks, joints): losses['pull_loss'] += pull_loss return losses - - def forward(self, x): - """Forward function.""" - if isinstance(x, list): - x = x[0] - final_outputs = [] - x = self.deconv_layers(x) - y = self.final_layer(x) - final_outputs.append(y) - return final_outputs - - def _make_deconv_layer(self, num_layers, num_filters, num_kernels): - """Make deconv layers.""" - if num_layers != len(num_filters): - error_msg = f'num_layers({num_layers}) ' \ - f'!= length of num_filters({len(num_filters)})' - raise ValueError(error_msg) - if num_layers != len(num_kernels): - error_msg = f'num_layers({num_layers}) ' \ - f'!= length of num_kernels({len(num_kernels)})' - raise ValueError(error_msg) - - layers = [] - for i in range(num_layers): - kernel, padding, output_padding = \ - self._get_deconv_cfg(num_kernels[i]) - - planes = num_filters[i] - layers.append( - build_upsample_layer( - dict(type='deconv'), - in_channels=self.in_channels, - out_channels=planes, - kernel_size=kernel, - stride=2, - padding=padding, - output_padding=output_padding, - bias=False)) - layers.append(nn.BatchNorm2d(planes)) - layers.append(nn.ReLU(inplace=True)) - self.in_channels = planes - - return nn.Sequential(*layers) - - @staticmethod - def _get_deconv_cfg(deconv_kernel): - """Get configurations for deconv layers.""" - if deconv_kernel == 4: - padding = 1 - output_padding = 0 - elif deconv_kernel == 3: - padding = 1 - output_padding = 1 - elif deconv_kernel == 2: - padding = 0 - output_padding = 0 - else: - raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') - - return deconv_kernel, padding, output_padding - - def init_weights(self): - """Initialize model weights.""" - for _, m in self.deconv_layers.named_modules(): - if isinstance(m, nn.ConvTranspose2d): - normal_init(m, std=0.001) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - for m in self.final_layer.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001, bias=0) diff --git a/mmpose/models/heads/deconv_head.py b/mmpose/models/heads/deconv_head.py new file mode 100644 index 0000000000..5f1fd79f9e --- /dev/null +++ b/mmpose/models/heads/deconv_head.py @@ -0,0 +1,292 @@ +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.models.builder import HEADS, build_loss +from mmpose.models.utils.ops import resize + + +@HEADS.register_module() +class DeconvHead(nn.Module): + """Simple deconv head. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. 
+ If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resized to the + same size as the first one and then concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + Default: None. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_keypoint (dict): Config for loss. Default: None. + """ + + def __init__(self, + in_channels=3, + out_channels=17, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + loss_keypoint=None): + super().__init__() + + self.in_channels = in_channels + self.loss = build_loss(loss_keypoint) + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform is not None, in_channels and in_index must be + list or tuple, with the same length. + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. 
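# ---- Illustrative sketch, not part of this diff: how `extra` shapes the final layer ----
# The constructor above interprets extra['final_conv_kernel'] (0 -> identity, 1 -> 1x1
# conv, 3 -> 3x3 conv) and extra['num_conv_layers'] (extra conv-BN-ReLU blocks before
# the final conv). Channel and spatial sizes below are arbitrary example values;
# assumes this branch of mmpose is installed.
import torch

from mmpose.models.heads import DeconvHead

head = DeconvHead(
    in_channels=32,
    out_channels=17,
    num_deconv_layers=2,
    num_deconv_filters=(64, 64),
    num_deconv_kernels=(4, 4),
    extra=dict(final_conv_kernel=3, num_conv_layers=1, num_conv_kernels=[3]),
    loss_keypoint=dict(type='MaskedMSELoss'))
out = head(torch.randn(1, 32, 16, 16))
assert out[0].shape == (1, 17, 64, 64)  # two stride-2 deconvs: 16 -> 32 -> 64

# final_conv_kernel=0 turns the final layer into nn.Identity(), so the head emits the
# last deconv features directly (64 channels here, not out_channels).
head_id = DeconvHead(
    in_channels=32,
    out_channels=17,
    num_deconv_layers=2,
    num_deconv_filters=(64, 64),
    num_deconv_kernels=(4, 4),
    extra=dict(final_conv_kernel=0),
    loss_keypoint=dict(type='MaskedMSELoss'))
assert head_id(torch.randn(1, 32, 16, 16))[0].shape == (1, 64, 64, 64)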
+ input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def get_loss(self, outputs, targets, masks): + """Calculate bottom-up masked mse loss. + + Note: + batch_size: N + num_channels: C + heatmaps height: H + heatmaps weight: W + + Args: + outputs (List(torch.Tensor[NxCxHxW])): Multi-scale outputs. + targets (List(torch.Tensor[NxCxHxW])): Multi-scale targets. + masks (List(torch.Tensor[NxHxW])): Masks of multi-scale targets. 
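# ---- Illustrative sketch, not part of this diff: the input transforms above ----
# 'resize_concat' selects the feature maps listed in in_index, resizes them to the
# first selected map and concatenates along channels (so in_channels must be the
# per-level channel counts). 'multiple_select' only gathers the selected maps.
# Example values are arbitrary; assumes this branch of mmpose is installed.
import torch

from mmpose.models.heads import DeconvHead

head = DeconvHead(
    in_channels=[16, 32],
    in_index=[0, 1],
    input_transform='resize_concat',
    out_channels=17,
    num_deconv_layers=0,
    loss_keypoint=dict(type='MaskedMSELoss'))
feats = [torch.randn(1, 16, 32, 32), torch.randn(1, 32, 16, 16)]
out = head(feats)  # the 16x16 map is upsampled to 32x32 and concatenated (48 channels)
assert out[0].shape == (1, 17, 32, 32)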
+ """ + + losses = dict() + + for idx in range(len(targets)): + if 'loss' not in losses: + losses['loss'] = self.loss(outputs[idx], targets[idx], + masks[idx]) + else: + losses['loss'] += self.loss(outputs[idx], targets[idx], + masks[idx]) + + return losses + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + final_outputs = [] + x = self.deconv_layers(x) + y = self.final_layer(x) + final_outputs.append(y) + return final_outputs + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) diff --git a/mmpose/models/heads/paf_head.py b/mmpose/models/heads/paf_head.py new file mode 100644 index 0000000000..f652df9729 --- /dev/null +++ b/mmpose/models/heads/paf_head.py @@ -0,0 +1,113 @@ +import torch.nn as nn + +from mmpose.models.builder import HEADS, build_head + + +@HEADS.register_module() +class PAFHead(nn.Module): + """Bottom-up PAF (Part Affinity Fields) head. + + Paper ref: Cao, Zhe, et al. "OpenPose: realtime multi-person + 2D pose estimation using Part Affinity Fields." (TPAMI'2019) + + Args: + heatmap_heads_cfg (list(dict)): Configs of heatmap heads. + paf_heads_cfg (list(dict)): Configs of paf heads. + heatmap_index (list(int)): The correspondence between heatmap heads + and input features. + paf_index (list(int)): The correspondence between paf heads + and input features. + """ + + def __init__(self, heatmap_heads_cfg, paf_heads_cfg, heatmap_index, + paf_index): + super().__init__() + + assert len(heatmap_heads_cfg) == len(heatmap_index) + assert len(paf_heads_cfg) == len(paf_index) + + # build heatmap heads + self.heatmap_heads_list = nn.ModuleList() + for head_cfg in heatmap_heads_cfg: + self.heatmap_heads_list.append(build_head(head_cfg)) + + # build paf heads + self.paf_heads_list = nn.ModuleList() + for head_cfg in paf_heads_cfg: + self.paf_heads_list.append(build_head(head_cfg)) + + self.heatmap_index = heatmap_index + self.paf_index = paf_index + + def get_loss(self, outputs, targets, masks): + """Calculate heatmap and paf loss. + + Note: + batch_size: N + num_channels: C + heatmaps height: H + heatmaps weight: W + + Args: + outputs (dict): Outputs of network, including heatmaps and pafs. + targets (list(list)): List of heatmaps + and pafs, each of which multi-scale targets. + masks (list(list(torch.Tensor[NxHxW]))): Masks of multi-scale + target heatmaps. 
+ """ + + losses = dict() + + heatmap_outputs = outputs['heatmaps'] + heatmap_targets = targets[:len(self.heatmap_heads_list)] + heatmap_masks = masks[:len(self.heatmap_heads_list)] + for idx, head in enumerate(self.heatmap_heads_list): + heatmap_losses = head.get_loss(heatmap_outputs[idx], + heatmap_targets[idx], + heatmap_masks[idx]) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = heatmap_losses['loss'] + else: + losses['heatmap_loss'] += heatmap_losses['loss'] + + paf_outputs = outputs['pafs'] + paf_targets = targets[len(self.heatmap_heads_list):] + paf_masks = masks[len(self.heatmap_heads_list):] + for idx, head in enumerate(self.paf_heads_list): + paf_losses = head.get_loss(paf_outputs[idx], paf_targets[idx], + paf_masks[idx]) + if 'paf_loss' not in losses: + losses['paf_loss'] = paf_losses['loss'] + else: + losses['paf_loss'] += paf_losses['loss'] + + return losses + + def forward(self, x): + """Forward function.""" + if not isinstance(x, list): + x = [x] + + assert max(self.heatmap_index) < len(x) + assert max(self.paf_index) < len(x) + + final_outputs = {'heatmaps': [], 'pafs': []} + + for idx, head in enumerate(self.heatmap_heads_list): + features = x[self.heatmap_index[idx]] + output = head(features) + final_outputs['heatmaps'].append(output) + + for idx, head in enumerate(self.paf_heads_list): + features = x[self.paf_index[idx]] + output = head(features) + final_outputs['pafs'].append(output) + + return final_outputs + + def init_weights(self): + for head in self.heatmap_heads_list: + head.init_weights() + + for head in self.paf_heads_list: + head.init_weights() diff --git a/mmpose/models/heads/topdown_heatmap_simple_head.py b/mmpose/models/heads/topdown_heatmap_simple_head.py index 6f253fede8..7cf02e1d97 100644 --- a/mmpose/models/heads/topdown_heatmap_simple_head.py +++ b/mmpose/models/heads/topdown_heatmap_simple_head.py @@ -28,11 +28,11 @@ class TopdownHeatmapSimpleHead(TopdownHeatmapBaseHead): num_deconv_filters (list|tuple): Number of filters. If num_deconv_layers > 0, the length of num_deconv_kernels (list|tuple): Kernel sizes. - in_index (int|Sequence[int]): Input feature index. Default: -1 + in_index (int|Sequence[int]): Input feature index. Default: 0 input_transform (str|None): Transformation type of input features. Options: 'resize_concat', 'multiple_select', None. - 'resize_concat': Multiple feature maps will be resize to the - same size as first one and than concat together. + 'resize_concat': Multiple feature maps will be resized to the + same size as the first one and then concat together. Usually used in FCN head of HRNet. 'multiple_select': Multiple feature maps will be bundle into a list and passed into decode head. diff --git a/mmpose/models/losses/mse_loss.py b/mmpose/models/losses/mse_loss.py index 4232d13590..910d97fcf5 100644 --- a/mmpose/models/losses/mse_loss.py +++ b/mmpose/models/losses/mse_loss.py @@ -150,3 +150,40 @@ def forward(self, output, target, target_weight): losses = torch.cat(losses, dim=1) return self._ohkm(losses) * self.loss_weight + + +@LOSSES.register_module() +class MaskedMSELoss(nn.Module): + """MSE loss for the bottom-up outputs with mask. + + Args: + use_mask (bool): Option to use mask of target. Default: True. + loss_weight (float): Weight of the loss. Default: 1.0. + supervise_empty (bool): Whether to supervise empty channels. 
+ """ + + def __init__(self, use_mask=True, loss_weight=1., supervise_empty=True): + super().__init__() + self.criterion = nn.MSELoss() + self.use_mask = use_mask + self.loss_weight = loss_weight + self.supervise_empty = supervise_empty + + def forward(self, output, target, mask): + """Forward function.""" + assert output.size() == target.size() + + if self.use_mask: + loss = self.criterion( + output, target) * mask[:, None, :, :].expand_as(output) + # if not self.supervise_empty: + # empty_mask = (target.sum(dim=[2, 3], keepdim=True) > 0).float() + # mask = empty_mask.expand_as( + # output) * mask[:, None, :, :].expand_as(output) + # else: + # mask = mask[:, None, :, :].expand_as(output) + # loss = self.criterion(output * mask, target * mask) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight diff --git a/tests/test_backbones/test_cpm.py b/tests/test_backbones/test_cpm.py index 3a9481539a..dc6dc45dea 100644 --- a/tests/test_backbones/test_cpm.py +++ b/tests/test_backbones/test_cpm.py @@ -2,6 +2,24 @@ import torch from mmpose.models import CPM +from mmpose.models.backbones.cpm import CpmBlock + + +def test_cpm_block(): + with pytest.raises(AssertionError): + # len(channels) == len(kernels) + CpmBlock( + 3, channels=[3, 3, 3], kernels=[ + 1, + ]) + + # Test CPM Block + model = CpmBlock(3, channels=[3, 3, 3], kernels=[1, 1, 1]) + model.train() + + imgs = torch.randn(1, 3, 10, 10) + feat = model(imgs) + assert feat.shape == torch.Size([1, 3, 10, 10]) def test_cpm_backbone(): diff --git a/tests/test_backbones/test_lightweight_openpose.py b/tests/test_backbones/test_lightweight_openpose.py new file mode 100644 index 0000000000..18ac28c300 --- /dev/null +++ b/tests/test_backbones/test_lightweight_openpose.py @@ -0,0 +1,25 @@ +import pytest +import torch + +from mmpose.models import LightweightOpenPoseNetwork + + +def test_lightweight_openpose_network_backbone(): + with pytest.raises(AssertionError): + # OpenPoseNetwork's num_stacks should larger than 0 + LightweightOpenPoseNetwork(in_channels=3, num_stages=-1) + + with pytest.raises(AssertionError): + # OpenPoseNetwork's in_channels should be 3 + LightweightOpenPoseNetwork(in_channels=2) + + # Test OpenPoseNetwork + model = LightweightOpenPoseNetwork(in_channels=3) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 368, 368) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size([1, 19, 46, 46]) + assert feat[-1].shape == torch.Size([1, 38, 46, 46]) diff --git a/tests/test_backbones/test_openpose_v1.py b/tests/test_backbones/test_openpose_v1.py new file mode 100644 index 0000000000..1599fd58d4 --- /dev/null +++ b/tests/test_backbones/test_openpose_v1.py @@ -0,0 +1,25 @@ +import pytest +import torch + +from mmpose.models import OpenPoseNetworkV1 + + +def test_openpose_network_v1_backbone(): + with pytest.raises(AssertionError): + # OpenPoseNetwork's num_stacks should larger than 0 + OpenPoseNetworkV1(in_channels=3, num_stages=-1) + + with pytest.raises(AssertionError): + # OpenPoseNetwork's in_channels should be 3 + OpenPoseNetworkV1(in_channels=2) + + # Test OpenPoseNetwork + model = OpenPoseNetworkV1(in_channels=3) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 368, 368) + feat = model(imgs) + assert len(feat) == 12 + assert feat[0].shape == torch.Size([1, 19, 46, 46]) + assert feat[-1].shape == torch.Size([1, 38, 46, 46]) diff --git a/tests/test_backbones/test_openpose_v2.py b/tests/test_backbones/test_openpose_v2.py new file mode 100644 index 
0000000000..0af920acb2 --- /dev/null +++ b/tests/test_backbones/test_openpose_v2.py @@ -0,0 +1,38 @@ +import pytest +import torch + +from mmpose.models import OpenPoseNetworkV2 + + +def test_openpose_network_v2_backbone(): + with pytest.raises(AssertionError): + # OpenPoseNetwork's num_stacks should larger than 0 + OpenPoseNetworkV2(in_channels=3, num_stages=-1) + + with pytest.raises(AssertionError): + # OpenPoseNetwork's in_channels should be 3 + OpenPoseNetworkV2(in_channels=2) + + with pytest.raises(AssertionError): + # len(stage_types) == num_stages + OpenPoseNetworkV2( + in_channels=3, num_stages=3, stage_types=('PAF', 'CM')) + + with pytest.raises(ValueError): + # stage_type should be either 'CM' or 'PAF'. + OpenPoseNetworkV2( + in_channels=3, num_stages=2, stage_types=('PAF', 'CC')) + + # Test OpenPoseNetwork + model = OpenPoseNetworkV2(in_channels=3) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 368, 368) + feat = model(imgs) + assert len(feat) == 6 + assert feat[0].shape == torch.Size([1, 38, 46, 46]) + assert feat[-1].shape == torch.Size([1, 19, 46, 46]) + + +test_openpose_network_v2_backbone() diff --git a/tests/test_evaluation/test_bottom_up_eval.py b/tests/test_evaluation/test_bottom_up_eval.py index bcb3788ade..9993219880 100644 --- a/tests/test_evaluation/test_bottom_up_eval.py +++ b/tests/test_evaluation/test_bottom_up_eval.py @@ -1,117 +1,96 @@ -import copy - import numpy as np +import pytest import torch -from mmpose.core import (aggregate_results, get_group_preds, - get_multi_stage_outputs) +from mmpose.core import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, flip_part_affinity_fields, + get_group_preds, split_ae_outputs) + + +def test_split_ae_outputs(): + fake_outputs = [torch.zeros((1, 4, 2, 2))] + heatmaps, tags = split_ae_outputs( + fake_outputs, num_joints=4, with_heatmaps=[False], with_ae=[True]) + + +def test_flip_feature_maps(): + fake_outputs = [torch.zeros((1, 4, 2, 2))] + _ = flip_feature_maps(fake_outputs, None) + _ = flip_feature_maps(fake_outputs, flip_index=[1, 0]) -def test_get_multi_stage_outputs(): +def test_flip_part_affinity_fields(): + fake_outputs = [torch.zeros((1, 4, 2, 2))] + + _ = flip_part_affinity_fields(fake_outputs, None, skeleton=[]) + _ = flip_part_affinity_fields( + fake_outputs, flip_index=[1, 0], skeleton=[[0, 1]]) + + with pytest.raises(ValueError): + _ = flip_part_affinity_fields( + fake_outputs, flip_index=[1, 0], skeleton=[[0, 0]]) + + +def test_aggregate_stage_flip(): fake_outputs = [torch.zeros((1, 4, 2, 2))] fake_flip_outputs = [torch.ones((1, 4, 2, 2))] - # outputs_flip - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - outputs_flip=None, - num_joints=4, with_heatmaps=[False], - with_ae=[True]) - assert heatmaps == [] - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - outputs_flip=None, - num_joints=2, with_heatmaps=[True], - with_ae=[True]) - assert len(heatmaps) == 1 - flip_index = [1, 0] - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - outputs_flip=fake_flip_outputs, - num_joints=2, with_heatmaps=[True], - with_ae=[True], flip_index=flip_index) - assert len(heatmaps) == 2 - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - tag_per_joint=False, - outputs_flip=fake_flip_outputs, - num_joints=2, with_heatmaps=[True], - with_ae=[True], flip_index=flip_index) - assert len(heatmaps) == 2 - # with heatmaps & with ae - 
fake_outputs = [torch.zeros((1, 4, 2, 2)), torch.ones((1, 2, 4, 4))] - fake_flip_outputs = [torch.ones((1, 4, 2, 2)), torch.ones((1, 2, 4, 4))] - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - outputs_flip=None, - num_joints=2, with_heatmaps=[True, False], - with_ae=[True, True]) - assert torch.allclose(heatmaps[0], torch.tensor(0.)) - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - outputs_flip=fake_flip_outputs, - num_joints=2, with_heatmaps=[True, True], - with_ae=[True, False]) - assert torch.allclose(heatmaps[0], torch.tensor(0.5)) - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - outputs_flip=fake_flip_outputs, - num_joints=2, with_heatmaps=[True, False], - with_ae=[True, False], flip_index=flip_index) - assert torch.allclose(heatmaps[0], torch.tensor(0.)) - # size_projected - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - outputs_flip=None, - num_joints=2, with_heatmaps=[True, True], - with_ae=[True, False], - size_projected=(8, 8)) - assert heatmaps[0].shape == torch.Size([1, 2, 8, 8]) - outputs, heatmaps, tags = \ - get_multi_stage_outputs(outputs=copy.deepcopy(fake_outputs), - outputs_flip=fake_flip_outputs, - num_joints=2, with_heatmaps=[True, True], - with_ae=[True, False], - align_corners=True) - assert torch.allclose(heatmaps[0], torch.tensor(0.5)) - - -def test_aggregate_results(): - fake_heatmaps = [torch.zeros((1, 2, 2, 2))] - fake_tags = [torch.zeros((1, 2, 2, 2))] - aggregated_heatmaps, tags_list = \ - aggregate_results(scale=1, aggregated_heatmaps=None, tags_list=[], - heatmaps=fake_heatmaps, tags=fake_tags, - test_scale_factor=[1], project2image=True, - flip_test=False) - assert torch.allclose(aggregated_heatmaps, torch.tensor(0.)) - fake_aggr_heatmaps = torch.ones(1, 2, 2, 2) - aggregated_heatmaps, tags_list = \ - aggregate_results(scale=1, aggregated_heatmaps=fake_aggr_heatmaps, - tags_list=[], heatmaps=fake_heatmaps, - tags=fake_tags, test_scale_factor=[1], - project2image=True, flip_test=False) - assert torch.allclose(aggregated_heatmaps, torch.tensor(1.)) - aggregated_heatmaps, tags_list = \ - aggregate_results(scale=1, aggregated_heatmaps=fake_aggr_heatmaps, - tags_list=[], heatmaps=fake_heatmaps, - tags=fake_tags, test_scale_factor=[1], - project2image=True, flip_test=False, - align_corners=True) - assert torch.allclose(aggregated_heatmaps, torch.tensor(1.)) - fake_heatmaps = [torch.zeros((1, 2, 2, 2)), torch.ones((1, 2, 2, 2))] - fake_aggr_heatmaps = torch.ones(1, 2, 4, 4) - aggregated_heatmaps, tags_list = \ - aggregate_results(scale=1, aggregated_heatmaps=fake_aggr_heatmaps, - tags_list=[], heatmaps=fake_heatmaps, - tags=fake_tags, test_scale_factor=[1], - project2image=False, flip_test=True) - assert aggregated_heatmaps.shape == torch.Size((1, 2, 4, 4)) - aggregated_heatmaps, tags_list = \ - aggregate_results(scale=2, aggregated_heatmaps=fake_aggr_heatmaps, - tags_list=[], heatmaps=fake_heatmaps, - tags=fake_tags, test_scale_factor=[1, 2], - project2image=False, flip_test=True) - assert aggregated_heatmaps.shape == torch.Size((1, 2, 4, 4)) + output = aggregate_stage_flip( + fake_outputs, + fake_flip_outputs, + index=-1, + project2image=True, + size_projected=(4, 4), + align_corners=False, + aggregate_stage='concat', + aggregate_flip='average') + assert isinstance(output, list) + + output = aggregate_stage_flip( + fake_outputs, + fake_flip_outputs, + index=-1, + 
project2image=True, + size_projected=(4, 4), + align_corners=False, + aggregate_stage='average', + aggregate_flip='average') + assert isinstance(output, list) + + output = aggregate_stage_flip( + fake_outputs, + fake_flip_outputs, + index=-1, + project2image=True, + size_projected=(4, 4), + align_corners=False, + aggregate_stage='average', + aggregate_flip='concat') + assert isinstance(output, list) + + output = aggregate_stage_flip( + fake_outputs, + fake_flip_outputs, + index=-1, + project2image=True, + size_projected=(4, 4), + align_corners=False, + aggregate_stage='concat', + aggregate_flip='concat') + assert isinstance(output, list) + + +def test_aggregate_scale(): + fake_outputs = [torch.zeros((1, 4, 2, 2)), torch.zeros((1, 4, 2, 2))] + output = aggregate_scale( + fake_outputs, align_corners=False, aggregate_scale='average') + assert isinstance(output, torch.Tensor) + assert output.shape == fake_outputs[0].shape + + output = aggregate_scale( + fake_outputs, align_corners=False, aggregate_scale='unsqueeze_concat') + + assert isinstance(output, torch.Tensor) + assert len(output.shape) == len(fake_outputs[0].shape) + 1 def test_get_group_preds(): diff --git a/tests/test_model/test_bottom_up_forward.py b/tests/test_model/test_bottom_up_forward.py index c72bf8601a..a8ffe4617c 100644 --- a/tests/test_model/test_bottom_up_forward.py +++ b/tests/test_model/test_bottom_up_forward.py @@ -4,7 +4,7 @@ from mmpose.models.detectors import AssociativeEmbedding -def test_bottomup_forward(): +def test_ae_forward(): model_cfg = dict( type='AssociativeEmbedding', pretrained=None, diff --git a/tests/test_model/test_bottom_up_head.py b/tests/test_model/test_bottom_up_head.py index 41b2f4ef1e..df9b0093ba 100644 --- a/tests/test_model/test_bottom_up_head.py +++ b/tests/test_model/test_bottom_up_head.py @@ -31,7 +31,7 @@ def test_ae_simple_head(): in_channels=512, num_joints=17, with_ae_loss=[True], - extra={'final_conv_kernel': 0}, + extra={'final_conv_kernel': -1}, loss_keypoint=dict( type='MultiLossFactory', num_joints=17, diff --git a/tests/test_pipelines/test_bottom_up_pipelines.py b/tests/test_pipelines/test_bottom_up_pipelines.py index ec855a4b79..b1b14eea36 100644 --- a/tests/test_pipelines/test_bottom_up_pipelines.py +++ b/tests/test_pipelines/test_bottom_up_pipelines.py @@ -276,15 +276,15 @@ def test_BottomUpGenerateHeatmapTarget(): generate_heatmap_target = BottomUpGenerateHeatmapTarget(2) results_generate_heatmap_target = generate_heatmap_target(results) - assert 'target' in results_generate_heatmap_target - assert len(results_generate_heatmap_target['target'] + assert 'targets' in results_generate_heatmap_target + assert len(results_generate_heatmap_target['targets'] ) == results['ann_info']['num_scales'] def test_BottomUpGeneratePAFTarget(): ann_info = {} - ann_info['skeleton'] = [[1, 2], [3, 4]] + ann_info['skeleton'] = [[0, 1], [2, 3]] ann_info['heatmap_size'] = np.array([5]) ann_info['num_joints'] = 4 ann_info['num_scales'] = 1 @@ -305,7 +305,7 @@ def test_BottomUpGeneratePAFTarget(): generate_paf_target = BottomUpGeneratePAFTarget(1) results_generate_paf_target = generate_paf_target(results) sqrt = np.sqrt(2) / 2 - assert (results_generate_paf_target['target'] == np.array( + assert (results_generate_paf_target['targets'] == np.array( [[[sqrt, sqrt, 0, sqrt, sqrt], [sqrt, sqrt, sqrt, sqrt, sqrt], [0, sqrt, sqrt, sqrt, 0], [sqrt, sqrt, sqrt, sqrt, sqrt], [sqrt, sqrt, 0, sqrt, sqrt]],