Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Add ViPNAS mbv3 #1025

Merged
merged 9 commits into from
Nov 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da

| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: |
| [S-ViPNAS-Res50](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_res50_coco_256x192.py) | 256x192 | 0.711 | 0.893 | 0.789 | 0.769 | 0.769 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192-cc43b466_20210624.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192_20210624.log.json) |
| [S-ViPNAS-MobileNetV3](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_mbv3_coco_256x192.py) | 256x192 | 0.700 | 0.887 | 0.778 | 0.757 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_256x192-7018731a_20211122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_256x192_20211122.log.json) |
| [S-ViPNAS-Res50](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_res50_coco_256x192.py) | 256x192 | 0.711 | 0.893 | 0.789 | 0.769 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192-cc43b466_20210624.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192_20210624.log.json) |
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,29 @@ Collections:
Title: 'ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search'
URL: https://arxiv.org/abs/2105.10154
Models:
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_res50_coco_256x192.py
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_mbv3_coco_256x192.py
In Collection: ViPNAS
Metadata:
Architecture:
Architecture: &id001
- ViPNAS
Training Data: COCO
Name: topdown_heatmap_vipnas_mbv3_coco_256x192
README: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_coco.md
Results:
- Dataset: COCO
Metrics:
AP: 0.7
AP@0.5: 0.887
AP@0.75: 0.778
AR: 0.757
AR@0.5: 0.929
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_256x192-7018731a_20211122.pth
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_res50_coco_256x192.py
In Collection: ViPNAS
Metadata:
Architecture: *id001
Training Data: COCO
Name: topdown_heatmap_vipnas_res50_coco_256x192
README: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_coco.md
Results:
Expand All @@ -19,6 +36,6 @@ Models:
AP@0.5: 0.893
AP@0.75: 0.789
AR: 0.769
AR@0.5: 0.769
AR@0.5: 0.934
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192-cc43b466_20210624.pth
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Runtime settings: log verbosity, checkpoint loading and the
# train / checkpoint / evaluation schedule.
log_level = 'INFO'

# Neither pretrained weights nor a previous run are loaded by default.
load_from = None
resume_from = None

dist_params = {'backend': 'nccl'}
workflow = [('train', 1)]

# Checkpointing and evaluation use the same interval (presumably counted
# in epochs -- confirm against the runner in use).
checkpoint_config = {'interval': 10}
evaluation = {
    'interval': 10,
    'metric': 'mAP',
    'key_indicator': 'AP',
}

# Optimizer: Adam with a base learning rate of 5e-4, no gradient clipping.
optimizer = {'type': 'Adam', 'lr': 5e-4}
optimizer_config = {'grad_clip': None}

# Learning policy: 'step' schedule with milestones 170 and 200, preceded by
# a linear warmup over the first 500 iterations starting at ratio 0.001.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.001,
    'step': [170, 200],
}
total_epochs = 210
# Logging: a text logger hook with an interval of 50.
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
        # Add the entry below to also log to TensorBoard:
        # {'type': 'TensorboardLoggerHook'},
    ],
}

# Keypoint channel layout: 17 output channels / 17 dataset joints, with
# channel indices 0..16 used both for the dataset and for inference.
channel_cfg = {
    'num_output_channels': 17,
    'dataset_joints': 17,
    # A single group containing every joint index.
    'dataset_channel': [list(range(17))],
    'inference_channel': list(range(17)),
}

# Model settings: a 'TopDown' model pairing the ViPNAS MobileNetV3 backbone
# with the ViPNAS heatmap head; no pretrained weights are specified.
model = {
    'type': 'TopDown',
    'pretrained': None,
    'backbone': {'type': 'ViPNAS_MobileNetV3'},
    'keypoint_head': {
        'type': 'ViPNASHeatmapSimpleHead',
        'in_channels': 160,
        # One output channel per keypoint, taken from channel_cfg.
        'out_channels': channel_cfg['num_output_channels'],
        # Three deconv stages; groups equals filters at every stage.
        'num_deconv_filters': (160, 160, 160),
        'num_deconv_groups': (160, 160, 160),
        'loss_keypoint': {
            'type': 'JointsMSELoss',
            'use_target_weight': True,
        },
    },
    'train_cfg': {},
    'test_cfg': {
        'flip_test': True,
        'post_process': 'default',
        'shift_heatmap': True,
        'modulate_kernel': 11,
    },
}

# Dataset configuration shared by the train / val / test splits.
data_cfg = {
    # Sizes are given as [192, 256] for the image and [48, 64] for the
    # heatmap (presumably [width, height] -- confirm against the dataset).
    'image_size': [192, 256],
    'heatmap_size': [48, 64],
    # Channel layout is taken from channel_cfg so the two stay in sync.
    'num_output_channels': channel_cfg['num_output_channels'],
    'num_joints': channel_cfg['dataset_joints'],
    'dataset_channel': channel_cfg['dataset_channel'],
    'inference_channel': channel_cfg['inference_channel'],
    # NMS / filtering thresholds.
    'soft_nms': False,
    'nms_thr': 1.0,
    'oks_thr': 0.9,
    'vis_thr': 0.2,
    # Ground-truth boxes are not used; boxes come from the detection
    # results file below.
    'use_gt_bbox': False,
    'det_bbox_thr': 0.0,
    'bbox_file': 'data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
}

# Training pipeline: load, augment (flip, half-body, random scale/rotation),
# affine-transform, convert to a normalised tensor, generate the heatmap
# target and collect the fields listed under 'keys' / 'meta_keys'.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'TopDownRandomFlip', 'flip_prob': 0.5},
    {
        'type': 'TopDownHalfBodyTransform',
        'num_joints_half_body': 8,
        'prob_half_body': 0.3,
    },
    {
        'type': 'TopDownGetRandomScaleRotation',
        'rot_factor': 30,
        'scale_factor': 0.25,
    },
    {'type': 'TopDownAffine'},
    {'type': 'ToTensor'},
    {
        'type': 'NormalizeTensor',
        'mean': [0.485, 0.456, 0.406],
        'std': [0.229, 0.224, 0.225],
    },
    {'type': 'TopDownGenerateTarget', 'sigma': 2},
    {
        'type': 'Collect',
        'keys': ['img', 'target', 'target_weight'],
        'meta_keys': [
            'image_file',
            'joints_3d',
            'joints_3d_visible',
            'center',
            'scale',
            'rotation',
            'bbox_score',
            'flip_pairs',
        ],
    },
]

# Validation pipeline: no augmentation and no target generation -- only
# load, affine-transform, tensor conversion, normalisation and collection.
val_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'TopDownAffine'},
    {'type': 'ToTensor'},
    {
        'type': 'NormalizeTensor',
        'mean': [0.485, 0.456, 0.406],
        'std': [0.229, 0.224, 0.225],
    },
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'image_file',
            'center',
            'scale',
            'rotation',
            'bbox_score',
            'flip_pairs',
        ],
    },
]

# Testing reuses the validation pipeline (same list object, not a copy).
test_pipeline = val_pipeline

# Root directory of the COCO data; annotation and image paths below are
# built relative to it.
data_root = 'data/coco'

# Dataloader and dataset settings for the three splits.
data = dict(
    # Training batch size per GPU; val/test dataloaders override it with 32.
    samples_per_gpu=64,
    workers_per_gpu=2,
    val_dataloader=dict(samples_per_gpu=32),
    test_dataloader=dict(samples_per_gpu=32),
    train=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/train2017/',
        data_cfg=data_cfg,
        pipeline=train_pipeline),
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline),
    # The test split reads the same val2017 annotations and images as `val`.
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        # Use the dedicated `test_pipeline` name (currently an alias of
        # `val_pipeline`) so that overriding `test_pipeline` takes effect
        # and the alias is not dead code.
        pipeline=test_pipeline),
)
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
Collections:
- Metadata:
Architecture:
- ViPNAS
Name: wholebody--2d_kpt_sview_rgb_img--topdown_heatmap--coco-wholebody--vipnas_coco-wholebody
[ViPNAS@COCO-WholeBody]
- Name: ViPNAS
Paper:
Title: 'ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search'
URL: https://arxiv.org/abs/2105.10154
README: configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.md
Models:
- Config: configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/vipnas_res50_coco_wholebody_256x192.py
In Collection: wholebody--2d_kpt_sview_rgb_img--topdown_heatmap--coco-wholebody--vipnas_coco-wholebody
[ViPNAS@COCO-WholeBody]
In Collection: ViPNAS
Metadata:
Architecture:
- ViPNAS
Training Data: COCO-WholeBody
Name: topdown_heatmap_vipnas_res50_coco_wholebody_256x192
README: configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.md
Results:
- Dataset: COCO-WholeBody
Metrics:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
Collections:
- Metadata:
Architecture:
- ViPNAS
- DarkPose
Name: wholebody--2d_kpt_sview_rgb_img--topdown_heatmap--coco-wholebody--vipnas_dark_coco-wholebody
[ViPNAS+DarkPose@COCO-WholeBody]
- Name: ViPNAS
Paper:
Title: 'ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search'
URL: https://arxiv.org/abs/2105.10154
README: configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.md
Models:
- Config: configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/vipnas_res50_coco_wholebody_256x192_dark.py
In Collection: wholebody--2d_kpt_sview_rgb_img--topdown_heatmap--coco-wholebody--vipnas_dark_coco-wholebody
[ViPNAS+DarkPose@COCO-WholeBody]
In Collection: ViPNAS
Metadata:
Architecture:
- ViPNAS
- DarkPose
Training Data: COCO-WholeBody
Name: topdown_heatmap_vipnas_res50_coco_wholebody_256x192_dark
README: configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.md
Results:
- Dataset: COCO-WholeBody
Metrics:
Expand Down
4 changes: 3 additions & 1 deletion mmpose/models/backbones/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@
from .shufflenet_v2 import ShuffleNetV2
from .tcn import TCN
from .vgg import VGG
from .vipnas_mbv3 import ViPNAS_MobileNetV3
from .vipnas_resnet import ViPNAS_ResNet

__all__ = [
'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2',
'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet',
'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN',
'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'LiteHRNet'
'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3',
'LiteHRNet'
]
10 changes: 8 additions & 2 deletions mmpose/models/backbones/utils/inverted_residual.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ class InvertedResidual(nn.Module):
mid_channels (int): The input channels of the depthwise convolution.
kernel_size (int): The kernel size of the depthwise convolution.
Default: 3.
groups (None or int): The group number of the depthwise convolution.
Default: None, which means group number = mid_channels.
stride (int): The stride of the depthwise convolution. Default: 1.
se_cfg (dict): Config dict for se layer. Default: None, which means no
se layer.
Expand All @@ -41,6 +43,7 @@ def __init__(self,
out_channels,
mid_channels,
kernel_size=3,
groups=None,
stride=1,
se_cfg=None,
with_expand_conv=True,
Expand All @@ -58,6 +61,9 @@ def __init__(self,
self.with_se = se_cfg is not None
self.with_expand_conv = with_expand_conv

if groups is None:
groups = mid_channels

if self.with_se:
assert isinstance(se_cfg, dict)
if not self.with_expand_conv:
Expand All @@ -79,7 +85,7 @@ def __init__(self,
kernel_size=kernel_size,
stride=stride,
padding=kernel_size // 2,
groups=mid_channels,
groups=groups,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
Expand All @@ -93,7 +99,7 @@ def __init__(self,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
act_cfg=None)

def forward(self, x):

Expand Down
Loading