From 4b4e63c5f5bb8fb9e9616efff7a09ff04d59e5c8 Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Wed, 7 Dec 2022 14:41:19 +0800
Subject: [PATCH 01/14] Add ViTPose implementation on MMPose 1.x

---
 .../coco/ViTPose_base_coco_256x192.py         | 143 +++++++++++++++++
 .../coco/ViTPose_base_simple_coco_256x192.py  | 146 +++++++++++++++++
 .../coco/ViTPose_huge_coco_256x192.py         | 148 +++++++++++++++++
 .../coco/ViTPose_huge_simple_coco_256x192.py  | 151 ++++++++++++++++++
 .../coco/ViTPose_large_coco_256x192.py        | 143 +++++++++++++++++
 .../coco/ViTPose_large_simple_coco_256x192.py | 146 +++++++++++++++++
 mmpose/engine/optim_wrapper/__init__.py       |   3 +
 .../layer_decay_optim_wrapper.py              |  99 ++++++++++++
 mmpose/models/heads/base_head.py              |   8 +
 .../heads/heatmap_heads/heatmap_head.py       |   4 +-
 10 files changed, 990 insertions(+), 1 deletion(-)
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
 create mode 100644 mmpose/engine/optim_wrapper/__init__.py
 create mode 100644 mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
new file mode 100644
index 0000000000..fbdbdedb39
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
@@ -0,0 +1,143 @@
+_base_ = [
+    '../../../_base_/default_runtime.py'
+]
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+
+optim_wrapper = dict(
+    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=12,  # read by the layer-decay constructor named below
+        layer_decay_rate=0.75,  # read by the same constructor
+        custom_keys={  # `decay_mult=0.0` on each entry: no weight decay
+            "bias": dict(decay_mult=0.0),  # key must be spelled `decay_mult`
+            "pos_embed": dict(decay_mult=0.0),
+            "relative_position_bias_table": dict(decay_mult=0.0),
+            "norm": dict(decay_mult=0.0),
+        },
+    ),
+    constructor="LayerDecayOptimWrapperConstructor",
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', begin=0, end=500, start_factor=0.001,
+        by_epoch=False),  # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=210,
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+    type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+    type='TopdownPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='mmcls.VisionTransformer',
+        arch='base',
+        img_size=(256, 192),
+        patch_size=16,
+        qkv_bias=True,
+        drop_path_rate=0.3,
+        output_cls_token=False
+    ),
+    head=dict(
+        type='HeatmapHead',
+        in_channels=768,
+        deconv_out_channels=(256, 256),
+        deconv_kernel_sizes=(4, 4),
+        out_channels=17,
+        loss=dict(type='KeypointMSELoss', use_target_weight=True),
+        decoder=codec
+    ),
+    test_cfg=dict(
+        flip_test=True,
+        flip_mode='heatmap',
+        shift_heatmap=False,
+    ))
+
+# base dataset settings
+data_root = 'data/coco'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='RandomHalfBody'),
+    dict(type='RandomBBoxTransform'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='PackPoseInputs')
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        bbox_file='data/coco/person_detection_results/'
+        'COCO_val2017_detections_AP_H_56_person.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
new file mode 100644
index 0000000000..18c8bdda2f
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
@@ -0,0 +1,146 @@
+_base_ = [
+    '../../../_base_/default_runtime.py'
+]
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+
+optim_wrapper = dict(
+    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=12,  # read by the layer-decay constructor named below
+        layer_decay_rate=0.75,  # read by the same constructor
+        custom_keys={  # `decay_mult=0.0` on each entry: no weight decay
+            "bias": dict(decay_mult=0.0),  # key must be spelled `decay_mult`
+            "pos_embed": dict(decay_mult=0.0),
+            "relative_position_bias_table": dict(decay_mult=0.0),
+            "norm": dict(decay_mult=0.0),
+        },
+    ),
+    constructor="LayerDecayOptimWrapperConstructor",
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', begin=0, end=500, start_factor=0.001,
+        by_epoch=False),  # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=210,
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+    type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+    type='TopdownPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='mmcls.VisionTransformer',
+        arch='base',
+        img_size=(256, 192),
+        patch_size=16,
+        qkv_bias=True,
+        drop_path_rate=0.3,
+        output_cls_token=False
+    ),
+    head=dict(
+        type='HeatmapHead',
+        in_channels=768,
+        deconv_out_channels=[],
+        deconv_kernel_sizes=[],
+        conv_out_channels=[17],
+        conv_kernel_sizes=[3],
+        out_channels=17,
+        loss=dict(type='KeypointMSELoss', use_target_weight=True),
+        decoder=codec,
+        upsample=4
+    ),
+    test_cfg=dict(
+        flip_test=True,
+        flip_mode='heatmap',
+        shift_heatmap=False,
+    ))
+
+# base dataset settings
+data_root = 'data/coco'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='RandomHalfBody'),
+    dict(type='RandomBBoxTransform'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='PackPoseInputs')
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        bbox_file='data/coco/person_detection_results/'
+        'COCO_val2017_detections_AP_H_56_person.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
new file mode 100644
index 0000000000..0d4827cd09
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
@@ -0,0 +1,148 @@
+_base_ = [
+    '../../../_base_/default_runtime.py'
+]
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+
+optim_wrapper = dict(
+    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=32,  # same value as the backbone's `arch.num_layers`
+        layer_decay_rate=0.85,  # read by the layer-decay constructor below
+        custom_keys={  # `decay_mult=0.0` on each entry: no weight decay
+            "bias": dict(decay_mult=0.0),  # key must be spelled `decay_mult`
+            "pos_embed": dict(decay_mult=0.0),
+            "relative_position_bias_table": dict(decay_mult=0.0),
+            "norm": dict(decay_mult=0.0),
+        },
+    ),
+    constructor="LayerDecayOptimWrapperConstructor",
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', begin=0, end=500, start_factor=0.001,
+        by_epoch=False),  # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=210,
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+    type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+    type='TopdownPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='mmcls.VisionTransformer',
+        arch=dict(
+            embed_dims=1280,
+            num_layers=32,
+            num_heads=16,
+            feedforward_channels=1280*4,
+        ),
+        img_size=(256, 192),
+        patch_size=16,
+        qkv_bias=True,
+        drop_path_rate=0.55,
+        output_cls_token=False
+    ),
+    head=dict(
+        type='HeatmapHead',
+        in_channels=1280,
+        deconv_out_channels=(256, 256),
+        deconv_kernel_sizes=(4, 4),
+        out_channels=17,
+        loss=dict(type='KeypointMSELoss', use_target_weight=True),
+        decoder=codec
+    ),
+    test_cfg=dict(
+        flip_test=True,
+        flip_mode='heatmap',
+        shift_heatmap=False,
+    ))
+
+# base dataset settings
+data_root = 'data/coco'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='RandomHalfBody'),
+    dict(type='RandomBBoxTransform'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='PackPoseInputs')
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        bbox_file='data/coco/person_detection_results/'
+        'COCO_val2017_detections_AP_H_56_person.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
new file mode 100644
index 0000000000..8eb58b3f80
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
@@ -0,0 +1,151 @@
+_base_ = [
+    '../../../_base_/default_runtime.py'
+]
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+
+optim_wrapper = dict(
+    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=32,  # same value as the backbone's `arch.num_layers`
+        layer_decay_rate=0.85,  # read by the layer-decay constructor below
+        custom_keys={  # `decay_mult=0.0` on each entry: no weight decay
+            "bias": dict(decay_mult=0.0),  # key must be spelled `decay_mult`
+            "pos_embed": dict(decay_mult=0.0),
+            "relative_position_bias_table": dict(decay_mult=0.0),
+            "norm": dict(decay_mult=0.0),
+        },
+    ),
+    constructor="LayerDecayOptimWrapperConstructor",
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', begin=0, end=500, start_factor=0.001,
+        by_epoch=False),  # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=210,
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+    type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+    type='TopdownPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='mmcls.VisionTransformer',
+        arch=dict(
+            embed_dims=1280,
+            num_layers=32,
+            num_heads=16,
+            feedforward_channels=1280*4,
+        ),
+        img_size=(256, 192),
+        patch_size=16,
+        qkv_bias=True,
+        drop_path_rate=0.55,
+        output_cls_token=False
+    ),
+    head=dict(
+        type='HeatmapHead',
+        in_channels=1280,
+        deconv_out_channels=[],
+        deconv_kernel_sizes=[],
+        conv_out_channels=[17],
+        conv_kernel_sizes=[3],
+        out_channels=17,
+        loss=dict(type='KeypointMSELoss', use_target_weight=True),
+        decoder=codec,
+        upsample=4
+    ),
+    test_cfg=dict(
+        flip_test=True,
+        flip_mode='heatmap',
+        shift_heatmap=False,
+    ))
+
+# base dataset settings
+data_root = 'data/coco'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='RandomHalfBody'),
+    dict(type='RandomBBoxTransform'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='PackPoseInputs')
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        bbox_file='data/coco/person_detection_results/'
+        'COCO_val2017_detections_AP_H_56_person.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
new file mode 100644
index 0000000000..3f3431c869
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
@@ -0,0 +1,143 @@
+_base_ = [
+    '../../../_base_/default_runtime.py'
+]
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+
+optim_wrapper = dict(
+    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=24,  # depth of the 'large' ViT backbone (mmcls arch table)
+        layer_decay_rate=0.8,  # read by the layer-decay constructor below
+        custom_keys={  # `decay_mult=0.0` on each entry: no weight decay
+            "bias": dict(decay_mult=0.0),  # key must be spelled `decay_mult`
+            "pos_embed": dict(decay_mult=0.0),
+            "relative_position_bias_table": dict(decay_mult=0.0),
+            "norm": dict(decay_mult=0.0),
+        },
+    ),
+    constructor="LayerDecayOptimWrapperConstructor",
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', begin=0, end=500, start_factor=0.001,
+        by_epoch=False),  # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=210,
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+    type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+    type='TopdownPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='mmcls.VisionTransformer',
+        arch='large',
+        img_size=(256, 192),
+        patch_size=16,
+        qkv_bias=True,
+        drop_path_rate=0.5,
+        output_cls_token=False
+    ),
+    head=dict(
+        type='HeatmapHead',
+        in_channels=1024,
+        deconv_out_channels=(256, 256),
+        deconv_kernel_sizes=(4, 4),
+        out_channels=17,
+        loss=dict(type='KeypointMSELoss', use_target_weight=True),
+        decoder=codec
+    ),
+    test_cfg=dict(
+        flip_test=True,
+        flip_mode='heatmap',
+        shift_heatmap=False,
+    ))
+
+# base dataset settings
+data_root = 'data/coco'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='RandomHalfBody'),
+    dict(type='RandomBBoxTransform'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='PackPoseInputs')
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        bbox_file='data/coco/person_detection_results/'
+        'COCO_val2017_detections_AP_H_56_person.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
new file mode 100644
index 0000000000..0f9fbf3aaa
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
@@ -0,0 +1,146 @@
+_base_ = [
+    '../../../_base_/default_runtime.py'
+]
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+
+optim_wrapper = dict(
+    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=24,  # read by the layer-decay constructor named below
+        layer_decay_rate=0.8,  # read by the same constructor
+        custom_keys={  # `decay_mult=0.0` on each entry: no weight decay
+            "bias": dict(decay_mult=0.0),  # key must be spelled `decay_mult`
+            "pos_embed": dict(decay_mult=0.0),
+            "relative_position_bias_table": dict(decay_mult=0.0),
+            "norm": dict(decay_mult=0.0),
+        },
+    ),
+    constructor="LayerDecayOptimWrapperConstructor",
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', begin=0, end=500, start_factor=0.001,
+        by_epoch=False),  # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=210,
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+    type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+    type='TopdownPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='mmcls.VisionTransformer',
+        arch='large',
+        img_size=(256, 192),
+        patch_size=16,
+        qkv_bias=True,
+        drop_path_rate=0.5,
+        output_cls_token=False
+    ),
+    head=dict(
+        type='HeatmapHead',
+        in_channels=1024,
+        deconv_out_channels=[],
+        deconv_kernel_sizes=[],
+        conv_out_channels=[17],
+        conv_kernel_sizes=[3],
+        out_channels=17,
+        loss=dict(type='KeypointMSELoss', use_target_weight=True),
+        decoder=codec,
+        upsample=4
+    ),
+    test_cfg=dict(
+        flip_test=True,
+        flip_mode='heatmap',
+        shift_heatmap=False,
+    ))
+
+# base dataset settings
+data_root = 'data/coco'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='RandomHalfBody'),
+    dict(type='RandomBBoxTransform'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='PackPoseInputs')
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        bbox_file='data/coco/person_detection_results/'
+        'COCO_val2017_detections_AP_H_56_person.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/mmpose/engine/optim_wrapper/__init__.py b/mmpose/engine/optim_wrapper/__init__.py
new file mode 100644
index 0000000000..484649baed
--- /dev/null
+++ b/mmpose/engine/optim_wrapper/__init__.py
@@ -0,0 +1,3 @@
+from .layer_decay_optim_wrapper import LayerDecayOptimWrapperConstructor
+
+__all__ = ['LayerDecayOptimWrapperConstructor']
\ No newline at end of file
diff --git a/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py b/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
new file mode 100644
index 0000000000..27b457081a
--- /dev/null
+++ b/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
@@ -0,0 +1,99 @@
+from mmengine.optim import DefaultOptimWrapperConstructor, build_optim_wrapper
+from mmengine.registry import OPTIM_WRAPPER_CONSTRUCTORS
+from mmengine.logging import print_log
+from mmengine.dist.utils import get_dist_info
+import json
+
+def get_num_layer_for_vit(var_name, num_max_layer):
+    if var_name in ("backbone.cls_token", "backbone.mask_token", "backbone.pos_embed"):
+        return 0
+    elif var_name.startswith("backbone.patch_embed"):
+        return 0
+    elif var_name.startswith("backbone.layers"):
+        layer_id = int(var_name.split('.')[2])
+        return layer_id + 1
+    else:
+        return num_max_layer - 1
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module(force=True)
+class LayerDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor):
+    def __init__(self, optim_wrapper_cfg, paramwise_cfg=None):
+        super().__init__(optim_wrapper_cfg, paramwise_cfg=None)
+        self.layer_decay_rate = paramwise_cfg.get("layer_decay_rate", 0.5)
+
+        super().__init__(optim_wrapper_cfg, paramwise_cfg)
+
+    def add_params(self, params, module, prefix="", lr=None):
+        parameter_groups = {}
+        print(self.paramwise_cfg)
+        num_layers = self.paramwise_cfg.get('num_layers') + 2
+        layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
+        print("Build LayerDecayOptimizerConstructor %f - %d" % (layer_decay_rate, num_layers))
+        weight_decay = self.base_wd
+
+        for name, param in module.named_parameters():
+            if not param.requires_grad:
+                continue  # frozen weights
+            if len(param.shape) == 1 or name.endswith(".bias") or 'pos_embed' in name:
+                group_name = "no_decay"
+                this_weight_decay = 0.
+            else:
+                group_name = "decay"
+                this_weight_decay = weight_decay
+            layer_id = get_num_layer_for_vit(name, num_layers)
+            #print(f"the layer id is {layer_id} from {name}")
+            group_name = "layer_%d_%s" % (layer_id, group_name)
+
+            if group_name not in parameter_groups:
+                scale = layer_decay_rate ** (num_layers - layer_id - 1)
+
+                parameter_groups[group_name] = {
+                    "weight_decay": this_weight_decay,
+                    "params": [],
+                    "param_names": [], 
+                    "lr_scale": scale, 
+                    "group_name": group_name, 
+                    "lr": scale * self.base_lr, 
+                }
+
+            parameter_groups[group_name]["params"].append(param)
+            parameter_groups[group_name]["param_names"].append(name)
+        rank, _ = get_dist_info()
+        if rank == 0:
+            to_display = {}
+            for key in parameter_groups:
+                to_display[key] = {
+                    "param_names": parameter_groups[key]["param_names"], 
+                    "lr_scale": parameter_groups[key]["lr_scale"], 
+                    "lr": parameter_groups[key]["lr"], 
+                    "weight_decay": parameter_groups[key]["weight_decay"], 
+                }
+        params.extend(parameter_groups.values())
+
+"""
+import torch.nn as nn
+class ToyModel(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.layer = nn.ModuleDict(dict(linear=nn.Linear(1, 1)))
+        self.linear = nn.Linear(1, 1)
+
+model = ToyModel()
+optim_wrapper = dict(
+    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=12,
+        layer_decay_rate=0.75,
+        custom_keys={
+            "bias": dict(decay_multi=0.0),
+            "pos_embed": dict(decay_mult=0.0),
+            "relative_position_bias_table": dict(decay_mult=0.0),
+            "norm": dict(decay_mult=0.0),
+        },
+    ),
+    constructor="LayerDecayOptimWrapperConstructor",
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+optimizer = build_optim_wrapper(model, optim_wrapper)
+print("\n\n",optimizer)"""
diff --git a/mmpose/models/heads/base_head.py b/mmpose/models/heads/base_head.py
index e34c9363a5..0acd697724 100644
--- a/mmpose/models/heads/base_head.py
+++ b/mmpose/models/heads/base_head.py
@@ -12,6 +12,7 @@
 from mmpose.utils.tensor_utils import to_numpy
 from mmpose.utils.typing import (Features, InstanceList, OptConfigType,
                                  OptSampleList, Predictions)
+from mmpose.models.utils.ops import resize
 
 
 class BaseHead(BaseModule, metaclass=ABCMeta):
@@ -88,6 +89,13 @@ def _transform_inputs(
         elif self.input_transform == 'select':
             if isinstance(self.input_index, int):
                 inputs = feats[self.input_index]
+                if self.upsample > 0:
+                    inputs = resize(
+                        input=F.relu(inputs),
+                        scale_factor=self.upsample,
+                        mode='bilinear',
+                        align_corners=self.align_corners
+                        )
             else:
                 inputs = tuple(feats[i] for i in self.input_index)
         else:
diff --git a/mmpose/models/heads/heatmap_heads/heatmap_head.py b/mmpose/models/heads/heatmap_heads/heatmap_head.py
index 9a06cef16d..4da49744fe 100644
--- a/mmpose/models/heads/heatmap_heads/heatmap_head.py
+++ b/mmpose/models/heads/heatmap_heads/heatmap_head.py
@@ -84,7 +84,8 @@ def __init__(self,
                  loss: ConfigType = dict(
                      type='KeypointMSELoss', use_target_weight=True),
                  decoder: OptConfigType = None,
-                 init_cfg: OptConfigType = None):
+                 init_cfg: OptConfigType = None,
+                 upsample=0, ):
 
         if init_cfg is None:
             init_cfg = self.default_init_cfg
@@ -101,6 +102,7 @@ def __init__(self,
             self.decoder = KEYPOINT_CODECS.build(decoder)
         else:
             self.decoder = None
+        self.upsample = upsample
 
         # Get model input channels according to feature
         in_channels = self._get_in_channels()

From 2601cd4173dff16f5a28d850838138f9fa92fd3a Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Sun, 11 Dec 2022 00:48:53 +0800
Subject: [PATCH 02/14] add pretrained config for backbone

---
 .../coco/ViTPose_base_coco_256x192.py         | 18 +++++++++++-----
 .../coco/ViTPose_base_simple_coco_256x192.py  | 18 +++++++++++-----
 .../coco/ViTPose_huge_coco_256x192.py         | 15 +++++++------
 .../coco/ViTPose_huge_simple_coco_256x192.py  | 15 +++++++------
 .../coco/ViTPose_large_coco_256x192.py        | 21 ++++++++++++-------
 .../coco/ViTPose_large_simple_coco_256x192.py | 21 ++++++++++++-------
 6 files changed, 72 insertions(+), 36 deletions(-)

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
index fbdbdedb39..36bdccb451 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
@@ -9,7 +9,11 @@
 custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
 
 optim_wrapper = dict(
-    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    optimizer=dict(
+        type="AdamW",
+        lr=5e-4,
+        betas=(0.9, 0.999),
+        weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=12,
         layer_decay_rate=0.75,
@@ -63,14 +67,18 @@
         patch_size=16,
         qkv_bias=True,
         drop_path_rate=0.3,
-        output_cls_token=False
+        with_cls_token=False,
+        output_cls_token=False,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrained/mae_pretrain_vit_base.pth'),
     ),
     head=dict(
         type='HeatmapHead',
         in_channels=768,
+        out_channels=17,
         deconv_out_channels=(256, 256),
         deconv_kernel_sizes=(4, 4),
-        out_channels=17,
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
         decoder=codec
     ),
@@ -81,7 +89,7 @@
     ))
 
 # base dataset settings
-data_root = 'data/coco'
+data_root = 'data/coco/'
 dataset_type = 'CocoDataset'
 data_mode = 'topdown'
 
@@ -139,5 +147,5 @@
 # evaluators
 val_evaluator = dict(
     type='CocoMetric',
-    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json')
 test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
index 18c8bdda2f..4eedc4945f 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
@@ -9,7 +9,11 @@
 custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
 
 optim_wrapper = dict(
-    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    optimizer=dict(
+        type="AdamW", 
+        lr=5e-4, 
+        betas=(0.9, 0.999), 
+        weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=12,
         layer_decay_rate=0.75,
@@ -63,16 +67,20 @@
         patch_size=16,
         qkv_bias=True,
         drop_path_rate=0.3,
-        output_cls_token=False
+        with_cls_token=False,
+        output_cls_token=False,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrained/mae_pretrain_vit_base.pth'),
     ),
     head=dict(
         type='HeatmapHead',
         in_channels=768,
+        out_channels=17,
         deconv_out_channels=[],
         deconv_kernel_sizes=[],
         conv_out_channels=[17],
         conv_kernel_sizes=[3],
-        out_channels=17,
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
         decoder=codec,
         upsample=4
@@ -84,7 +92,7 @@
     ))
 
 # base dataset settings
-data_root = 'data/coco'
+data_root = 'data/coco/'
 dataset_type = 'CocoDataset'
 data_mode = 'topdown'
 
@@ -142,5 +150,5 @@
 # evaluators
 val_evaluator = dict(
     type='CocoMetric',
-    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json')
 test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
index 0d4827cd09..f71babe7c3 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
@@ -68,17 +68,20 @@
         patch_size=16,
         qkv_bias=True,
         drop_path_rate=0.55,
-        output_cls_token=False
+        with_cls_token=False,
+        output_cls_token=False,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrained/mae_pretrain_vit_huge.pth'),
     ),
     head=dict(
         type='HeatmapHead',
         in_channels=1280,
+        out_channels=17,
         deconv_out_channels=(256, 256),
         deconv_kernel_sizes=(4, 4),
-        out_channels=17,
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
-        decoder=codec
-    ),
+        decoder=codec),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
@@ -86,7 +89,7 @@
     ))
 
 # base dataset settings
-data_root = 'data/coco'
+data_root = 'data/coco/'
 dataset_type = 'CocoDataset'
 data_mode = 'topdown'
 
@@ -144,5 +147,5 @@
 # evaluators
 val_evaluator = dict(
     type='CocoMetric',
-    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json')
 test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
index 8eb58b3f80..60298dc06b 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
@@ -68,20 +68,23 @@
         patch_size=16,
         qkv_bias=True,
         drop_path_rate=0.55,
-        output_cls_token=False
+        with_cls_token=False,
+        output_cls_token=False,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrained/mae_pretrain_vit_huge.pth'),
     ),
     head=dict(
         type='HeatmapHead',
         in_channels=1280,
+        out_channels=17,
         deconv_out_channels=[],
         deconv_kernel_sizes=[],
         conv_out_channels=[17],
         conv_kernel_sizes=[3],
-        out_channels=17,
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
         decoder=codec,
-        upsample=4
-    ),
+        upsample=4),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
@@ -89,7 +92,7 @@
     ))
 
 # base dataset settings
-data_root = 'data/coco'
+data_root = 'data/coco/'
 dataset_type = 'CocoDataset'
 data_mode = 'topdown'
 
@@ -147,5 +150,5 @@
 # evaluators
 val_evaluator = dict(
     type='CocoMetric',
-    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json')
 test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
index 3f3431c869..bba34bcb73 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
@@ -9,7 +9,11 @@
 custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
 
 optim_wrapper = dict(
-    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    optimizer=dict(
+        type="AdamW",
+        lr=5e-4,
+        betas=(0.9, 0.999),
+        weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=16,
         layer_decay_rate=0.8,
@@ -63,17 +67,20 @@
         patch_size=16,
         qkv_bias=True,
         drop_path_rate=0.5,
-        output_cls_token=False
+        with_cls_token=False,
+        output_cls_token=False,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrained/mae_pretrain_vit_large.pth'),
     ),
     head=dict(
         type='HeatmapHead',
         in_channels=1024,
+        out_channels=17,
         deconv_out_channels=(256, 256),
         deconv_kernel_sizes=(4, 4),
-        out_channels=17,
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
-        decoder=codec
-    ),
+        decoder=codec),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
@@ -81,7 +88,7 @@
     ))
 
 # base dataset settings
-data_root = 'data/coco'
+data_root = 'data/coco/'
 dataset_type = 'CocoDataset'
 data_mode = 'topdown'
 
@@ -139,5 +146,5 @@
 # evaluators
 val_evaluator = dict(
     type='CocoMetric',
-    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json')
 test_evaluator = val_evaluator
\ No newline at end of file
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
index 0f9fbf3aaa..f3147266c5 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
@@ -9,7 +9,11 @@
 custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
 
 optim_wrapper = dict(
-    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    optimizer=dict(
+        type="AdamW",
+        lr=5e-4,
+        betas=(0.9, 0.999),
+        weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=24,
         layer_decay_rate=0.8,
@@ -63,20 +67,23 @@
         patch_size=16,
         qkv_bias=True,
         drop_path_rate=0.5,
-        output_cls_token=False
+        with_cls_token=False,
+        output_cls_token=False,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrained/mae_pretrain_vit_large.pth'),
     ),
     head=dict(
         type='HeatmapHead',
         in_channels=1024,
+        out_channels=17,
         deconv_out_channels=[],
         deconv_kernel_sizes=[],
         conv_out_channels=[17],
         conv_kernel_sizes=[3],
-        out_channels=17,
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
         decoder=codec,
-        upsample=4
-    ),
+        upsample=4),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
@@ -84,7 +91,7 @@
     ))
 
 # base dataset settings
-data_root = 'data/coco'
+data_root = 'data/coco/'
 dataset_type = 'CocoDataset'
 data_mode = 'topdown'
 
@@ -142,5 +149,5 @@
 # evaluators
 val_evaluator = dict(
     type='CocoMetric',
-    ann_file=data_root + '/annotations/person_keypoints_val2017.json')
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json')
 test_evaluator = val_evaluator
\ No newline at end of file

From ce9b8115e3f01d126db1f893a20f0d0d405c6f8b Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Mon, 12 Dec 2022 17:50:33 +0800
Subject: [PATCH 03/14] check for existence of attribute before access

---
 mmpose/models/heads/base_head.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmpose/models/heads/base_head.py b/mmpose/models/heads/base_head.py
index 0acd697724..e9fd3da057 100644
--- a/mmpose/models/heads/base_head.py
+++ b/mmpose/models/heads/base_head.py
@@ -89,7 +89,7 @@ def _transform_inputs(
         elif self.input_transform == 'select':
             if isinstance(self.input_index, int):
                 inputs = feats[self.input_index]
-                if self.upsample > 0:
+                if hasattr(self, 'upsample') and self.upsample > 0:
                     inputs = resize(
                         input=F.relu(inputs),
                         scale_factor=self.upsample,

From b424af8aa3a173e0e421b5e68074103b329cdb2b Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Mon, 12 Dec 2022 18:06:22 +0800
Subject: [PATCH 04/14] fix formats

---
 .../coco/ViTPose_base_coco_256x192.py         | 24 +++---
 .../coco/ViTPose_base_simple_coco_256x192.py  | 24 +++---
 .../coco/ViTPose_huge_coco_256x192.py         | 24 +++---
 .../coco/ViTPose_huge_simple_coco_256x192.py  | 24 +++---
 .../coco/ViTPose_large_coco_256x192.py        | 24 +++---
 .../coco/ViTPose_large_simple_coco_256x192.py | 24 +++---
 mmpose/engine/optim_wrapper/__init__.py       |  3 +-
 .../layer_decay_optim_wrapper.py              | 79 ++++++-------------
 8 files changed, 118 insertions(+), 108 deletions(-)

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
index 36bdccb451..347ba5c0e9 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
@@ -6,11 +6,13 @@
 train_cfg = dict(max_epochs=210, val_interval=10)
 
 # optimizer
-custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+custom_imports = dict(
+    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    allow_failed_imports=False)
 
 optim_wrapper = dict(
     optimizer=dict(
-        type="AdamW",
+        type='AdamW',
         lr=5e-4,
         betas=(0.9, 0.999),
         weight_decay=0.1),
@@ -18,13 +20,13 @@
         num_layers=12,
         layer_decay_rate=0.75,
         custom_keys={
-            "bias": dict(decay_multi=0.0),
-            "pos_embed": dict(decay_mult=0.0),
-            "relative_position_bias_table": dict(decay_mult=0.0),
-            "norm": dict(decay_mult=0.0),
+            'bias': dict(decay_multi=0.0),
+            'pos_embed': dict(decay_mult=0.0),
+            'relative_position_bias_table': dict(decay_mult=0.0),
+            'norm': dict(decay_mult=0.0),
         },
     ),
-    constructor="LayerDecayOptimWrapperConstructor",
+    constructor='LayerDecayOptimWrapperConstructor',
     clip_grad=dict(max_norm=1., norm_type=2),
 )
 
@@ -46,7 +48,11 @@
 auto_scale_lr = dict(base_batch_size=512)
 
 # hooks
-default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+default_hooks = dict(
+    checkpoint=dict(
+        save_best='coco/AP',
+        rule='greater',
+        max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -148,4 +154,4 @@
 val_evaluator = dict(
     type='CocoMetric',
     ann_file=data_root + 'annotations/person_keypoints_val2017.json')
-test_evaluator = val_evaluator
\ No newline at end of file
+test_evaluator = val_evaluator
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
index 4eedc4945f..209838b2c2 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
@@ -6,11 +6,13 @@
 train_cfg = dict(max_epochs=210, val_interval=10)
 
 # optimizer
-custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+custom_imports = dict(
+    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    allow_failed_imports=False)
 
 optim_wrapper = dict(
     optimizer=dict(
-        type="AdamW", 
+        type='AdamW', 
         lr=5e-4, 
         betas=(0.9, 0.999), 
         weight_decay=0.1),
@@ -18,13 +20,13 @@
         num_layers=12,
         layer_decay_rate=0.75,
         custom_keys={
-            "bias": dict(decay_multi=0.0),
-            "pos_embed": dict(decay_mult=0.0),
-            "relative_position_bias_table": dict(decay_mult=0.0),
-            "norm": dict(decay_mult=0.0),
+            'bias': dict(decay_multi=0.0),
+            'pos_embed': dict(decay_mult=0.0),
+            'relative_position_bias_table': dict(decay_mult=0.0),
+            'norm': dict(decay_mult=0.0),
         },
     ),
-    constructor="LayerDecayOptimWrapperConstructor",
+    constructor='LayerDecayOptimWrapperConstructor',
     clip_grad=dict(max_norm=1., norm_type=2),
 )
 
@@ -46,7 +48,11 @@
 auto_scale_lr = dict(base_batch_size=512)
 
 # hooks
-default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+default_hooks = dict(
+    checkpoint=dict(
+        save_best='coco/AP',
+        rule='greater',
+        max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -151,4 +157,4 @@
 val_evaluator = dict(
     type='CocoMetric',
     ann_file=data_root + 'annotations/person_keypoints_val2017.json')
-test_evaluator = val_evaluator
\ No newline at end of file
+test_evaluator = val_evaluator
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
index f71babe7c3..9cc93eae31 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
@@ -6,21 +6,23 @@
 train_cfg = dict(max_epochs=210, val_interval=10)
 
 # optimizer
-custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+custom_imports = dict(
+    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    allow_failed_imports=False)
 
 optim_wrapper = dict(
-    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    optimizer=dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=32,
         layer_decay_rate=0.85,
         custom_keys={
-            "bias": dict(decay_multi=0.0),
-            "pos_embed": dict(decay_mult=0.0),
-            "relative_position_bias_table": dict(decay_mult=0.0),
-            "norm": dict(decay_mult=0.0),
+            'bias': dict(decay_multi=0.0),
+            'pos_embed': dict(decay_mult=0.0),
+            'relative_position_bias_table': dict(decay_mult=0.0),
+            'norm': dict(decay_mult=0.0),
         },
     ),
-    constructor="LayerDecayOptimWrapperConstructor",
+    constructor='LayerDecayOptimWrapperConstructor',
     clip_grad=dict(max_norm=1., norm_type=2),
 )
 
@@ -42,7 +44,11 @@
 auto_scale_lr = dict(base_batch_size=512)
 
 # hooks
-default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+default_hooks = dict(
+    checkpoint=dict(
+        save_best='coco/AP',
+        rule='greater',
+        max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -148,4 +154,4 @@
 val_evaluator = dict(
     type='CocoMetric',
     ann_file=data_root + 'annotations/person_keypoints_val2017.json')
-test_evaluator = val_evaluator
\ No newline at end of file
+test_evaluator = val_evaluator
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
index 60298dc06b..493b65b3f8 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
@@ -6,21 +6,23 @@
 train_cfg = dict(max_epochs=210, val_interval=10)
 
 # optimizer
-custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+custom_imports = dict(
+    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    allow_failed_imports=False)
 
 optim_wrapper = dict(
-    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    optimizer=dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=32,
         layer_decay_rate=0.85,
         custom_keys={
-            "bias": dict(decay_multi=0.0),
-            "pos_embed": dict(decay_mult=0.0),
-            "relative_position_bias_table": dict(decay_mult=0.0),
-            "norm": dict(decay_mult=0.0),
+            'bias': dict(decay_multi=0.0),
+            'pos_embed': dict(decay_mult=0.0),
+            'relative_position_bias_table': dict(decay_mult=0.0),
+            'norm': dict(decay_mult=0.0),
         },
     ),
-    constructor="LayerDecayOptimWrapperConstructor",
+    constructor='LayerDecayOptimWrapperConstructor',
     clip_grad=dict(max_norm=1., norm_type=2),
 )
 
@@ -42,7 +44,11 @@
 auto_scale_lr = dict(base_batch_size=512)
 
 # hooks
-default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+default_hooks = dict(
+    checkpoint=dict(
+        save_best='coco/AP',
+        rule='greater',
+        max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -151,4 +157,4 @@
 val_evaluator = dict(
     type='CocoMetric',
     ann_file=data_root + 'annotations/person_keypoints_val2017.json')
-test_evaluator = val_evaluator
\ No newline at end of file
+test_evaluator = val_evaluator
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
index bba34bcb73..4ef260b088 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
@@ -6,11 +6,13 @@
 train_cfg = dict(max_epochs=210, val_interval=10)
 
 # optimizer
-custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+custom_imports = dict(
+    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    allow_failed_imports=False)
 
 optim_wrapper = dict(
     optimizer=dict(
-        type="AdamW",
+        type='AdamW',
         lr=5e-4,
         betas=(0.9, 0.999),
         weight_decay=0.1),
@@ -18,13 +20,13 @@
         num_layers=16,
         layer_decay_rate=0.8,
         custom_keys={
-            "bias": dict(decay_multi=0.0),
-            "pos_embed": dict(decay_mult=0.0),
-            "relative_position_bias_table": dict(decay_mult=0.0),
-            "norm": dict(decay_mult=0.0),
+            'bias': dict(decay_multi=0.0),
+            'pos_embed': dict(decay_mult=0.0),
+            'relative_position_bias_table': dict(decay_mult=0.0),
+            'norm': dict(decay_mult=0.0),
         },
     ),
-    constructor="LayerDecayOptimWrapperConstructor",
+    constructor='LayerDecayOptimWrapperConstructor',
     clip_grad=dict(max_norm=1., norm_type=2),
 )
 
@@ -46,7 +48,11 @@
 auto_scale_lr = dict(base_batch_size=512)
 
 # hooks
-default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+default_hooks = dict(
+    checkpoint=dict(
+        save_best='coco/AP',
+        rule='greater',
+        max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -147,4 +153,4 @@
 val_evaluator = dict(
     type='CocoMetric',
     ann_file=data_root + 'annotations/person_keypoints_val2017.json')
-test_evaluator = val_evaluator
\ No newline at end of file
+test_evaluator = val_evaluator
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
index f3147266c5..c5e3cbb198 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
@@ -6,11 +6,13 @@
 train_cfg = dict(max_epochs=210, val_interval=10)
 
 # optimizer
-custom_imports = dict(imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'], allow_failed_imports=False)
+custom_imports = dict(
+    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    allow_failed_imports=False)
 
 optim_wrapper = dict(
     optimizer=dict(
-        type="AdamW",
+        type='AdamW',
         lr=5e-4,
         betas=(0.9, 0.999),
         weight_decay=0.1),
@@ -18,13 +20,13 @@
         num_layers=24,
         layer_decay_rate=0.8,
         custom_keys={
-            "bias": dict(decay_multi=0.0),
-            "pos_embed": dict(decay_mult=0.0),
-            "relative_position_bias_table": dict(decay_mult=0.0),
-            "norm": dict(decay_mult=0.0),
+            'bias': dict(decay_multi=0.0),
+            'pos_embed': dict(decay_mult=0.0),
+            'relative_position_bias_table': dict(decay_mult=0.0),
+            'norm': dict(decay_mult=0.0),
         },
     ),
-    constructor="LayerDecayOptimWrapperConstructor",
+    constructor='LayerDecayOptimWrapperConstructor',
     clip_grad=dict(max_norm=1., norm_type=2),
 )
 
@@ -46,7 +48,11 @@
 auto_scale_lr = dict(base_batch_size=512)
 
 # hooks
-default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+default_hooks = dict(
+    checkpoint=dict(
+        save_best='coco/AP',
+        rule='greater',
+        max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -150,4 +156,4 @@
 val_evaluator = dict(
     type='CocoMetric',
     ann_file=data_root + 'annotations/person_keypoints_val2017.json')
-test_evaluator = val_evaluator
\ No newline at end of file
+test_evaluator = val_evaluator
diff --git a/mmpose/engine/optim_wrapper/__init__.py b/mmpose/engine/optim_wrapper/__init__.py
index 484649baed..7c0b1f533a 100644
--- a/mmpose/engine/optim_wrapper/__init__.py
+++ b/mmpose/engine/optim_wrapper/__init__.py
@@ -1,3 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
 from .layer_decay_optim_wrapper import LayerDecayOptimWrapperConstructor
 
-__all__ = ['LayerDecayOptimWrapperConstructor']
\ No newline at end of file
+__all__ = ['LayerDecayOptimWrapperConstructor']
diff --git a/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py b/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
index 27b457081a..221966d64d 100644
--- a/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
+++ b/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
@@ -1,15 +1,15 @@
-from mmengine.optim import DefaultOptimWrapperConstructor, build_optim_wrapper
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim import DefaultOptimWrapperConstructor
 from mmengine.registry import OPTIM_WRAPPER_CONSTRUCTORS
-from mmengine.logging import print_log
 from mmengine.dist.utils import get_dist_info
-import json
+
 
 def get_num_layer_for_vit(var_name, num_max_layer):
-    if var_name in ("backbone.cls_token", "backbone.mask_token", "backbone.pos_embed"):
+    if var_name in ('backbone.cls_token', 'backbone.mask_token', 'backbone.pos_embed'):
         return 0
-    elif var_name.startswith("backbone.patch_embed"):
+    elif var_name.startswith('backbone.patch_embed'):
         return 0
-    elif var_name.startswith("backbone.layers"):
+    elif var_name.startswith('backbone.layers'):
         layer_id = int(var_name.split('.')[2])
         return layer_id + 1
     else:
@@ -19,81 +19,54 @@ def get_num_layer_for_vit(var_name, num_max_layer):
 class LayerDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor):
     def __init__(self, optim_wrapper_cfg, paramwise_cfg=None):
         super().__init__(optim_wrapper_cfg, paramwise_cfg=None)
-        self.layer_decay_rate = paramwise_cfg.get("layer_decay_rate", 0.5)
+        self.layer_decay_rate = paramwise_cfg.get('layer_decay_rate', 0.5)
 
         super().__init__(optim_wrapper_cfg, paramwise_cfg)
 
-    def add_params(self, params, module, prefix="", lr=None):
+    def add_params(self, params, module, prefix='', lr=None):
         parameter_groups = {}
         print(self.paramwise_cfg)
         num_layers = self.paramwise_cfg.get('num_layers') + 2
         layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
-        print("Build LayerDecayOptimizerConstructor %f - %d" % (layer_decay_rate, num_layers))
         weight_decay = self.base_wd
 
         for name, param in module.named_parameters():
             if not param.requires_grad:
                 continue  # frozen weights
-            if len(param.shape) == 1 or name.endswith(".bias") or 'pos_embed' in name:
-                group_name = "no_decay"
+            if (len(param.shape) == 1
+                or name.endswith('.bias')
+                or 'pos_embed' in name):
+                group_name = 'no_decay'
                 this_weight_decay = 0.
             else:
-                group_name = "decay"
+                group_name = 'decay'
                 this_weight_decay = weight_decay
             layer_id = get_num_layer_for_vit(name, num_layers)
-            #print(f"the layer id is {layer_id} from {name}")
-            group_name = "layer_%d_%s" % (layer_id, group_name)
+            group_name = 'layer_%d_%s' % (layer_id, group_name)
 
             if group_name not in parameter_groups:
                 scale = layer_decay_rate ** (num_layers - layer_id - 1)
 
                 parameter_groups[group_name] = {
-                    "weight_decay": this_weight_decay,
-                    "params": [],
-                    "param_names": [], 
-                    "lr_scale": scale, 
-                    "group_name": group_name, 
-                    "lr": scale * self.base_lr, 
+                    'weight_decay': this_weight_decay,
+                    'params': [],
+                    'param_names': [], 
+                    'lr_scale': scale, 
+                    'group_name': group_name, 
+                    'lr': scale * self.base_lr, 
                 }
 
-            parameter_groups[group_name]["params"].append(param)
-            parameter_groups[group_name]["param_names"].append(name)
+            parameter_groups[group_name]['params'].append(param)
+            parameter_groups[group_name]['param_names'].append(name)
         rank, _ = get_dist_info()
         if rank == 0:
             to_display = {}
             for key in parameter_groups:
                 to_display[key] = {
-                    "param_names": parameter_groups[key]["param_names"], 
-                    "lr_scale": parameter_groups[key]["lr_scale"], 
-                    "lr": parameter_groups[key]["lr"], 
-                    "weight_decay": parameter_groups[key]["weight_decay"], 
+                    'param_names': parameter_groups[key]['param_names'], 
+                    'lr_scale': parameter_groups[key]['lr_scale'], 
+                    'lr': parameter_groups[key]['lr'], 
+                    'weight_decay': parameter_groups[key]['weight_decay'], 
                 }
         params.extend(parameter_groups.values())
 
-"""
-import torch.nn as nn
-class ToyModel(nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-        self.layer = nn.ModuleDict(dict(linear=nn.Linear(1, 1)))
-        self.linear = nn.Linear(1, 1)
-
-model = ToyModel()
-optim_wrapper = dict(
-    optimizer=dict(type="AdamW", lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
-    paramwise_cfg=dict(
-        num_layers=12,
-        layer_decay_rate=0.75,
-        custom_keys={
-            "bias": dict(decay_multi=0.0),
-            "pos_embed": dict(decay_mult=0.0),
-            "relative_position_bias_table": dict(decay_mult=0.0),
-            "norm": dict(decay_mult=0.0),
-        },
-    ),
-    constructor="LayerDecayOptimWrapperConstructor",
-    clip_grad=dict(max_norm=1., norm_type=2),
-)
-
-optimizer = build_optim_wrapper(model, optim_wrapper)
-print("\n\n",optimizer)"""

From d6807fdae342c4000c869b792dbfa288a8efa6d3 Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Mon, 12 Dec 2022 18:29:42 +0800
Subject: [PATCH 05/14] fix formats

---
 .../coco/ViTPose_base_coco_256x192.py         | 17 +++-------
 .../coco/ViTPose_base_simple_coco_256x192.py  | 17 +++-------
 .../coco/ViTPose_huge_coco_256x192.py         | 14 +++------
 .../coco/ViTPose_huge_simple_coco_256x192.py  | 14 +++------
 .../coco/ViTPose_large_coco_256x192.py        | 14 ++-------
 .../coco/ViTPose_large_simple_coco_256x192.py | 14 ++-------
 .../layer_decay_optim_wrapper.py              | 31 ++++++++++---------
 mmpose/models/heads/base_head.py              |  3 +-
 .../heads/heatmap_heads/heatmap_head.py       |  3 +-
 9 files changed, 43 insertions(+), 84 deletions(-)

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
index 347ba5c0e9..a7577c3ef8 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
@@ -1,6 +1,4 @@
-_base_ = [
-    '../../../_base_/default_runtime.py'
-]
+_base_ = ['../../../_base_/default_runtime.py']
 
 # runtime
 train_cfg = dict(max_epochs=210, val_interval=10)
@@ -12,10 +10,7 @@
 
 optim_wrapper = dict(
     optimizer=dict(
-        type='AdamW',
-        lr=5e-4,
-        betas=(0.9, 0.999),
-        weight_decay=0.1),
+        type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=12,
         layer_decay_rate=0.75,
@@ -49,10 +44,7 @@
 
 # hooks
 default_hooks = dict(
-    checkpoint=dict(
-        save_best='coco/AP',
-        rule='greater',
-        max_keep_ckpts=1))
+    checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -86,8 +78,7 @@
         deconv_out_channels=(256, 256),
         deconv_kernel_sizes=(4, 4),
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
-        decoder=codec
-    ),
+        decoder=codec),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
index 209838b2c2..9aee1b9e29 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
@@ -1,6 +1,4 @@
-_base_ = [
-    '../../../_base_/default_runtime.py'
-]
+_base_ = ['../../../_base_/default_runtime.py']
 
 # runtime
 train_cfg = dict(max_epochs=210, val_interval=10)
@@ -12,10 +10,7 @@
 
 optim_wrapper = dict(
     optimizer=dict(
-        type='AdamW', 
-        lr=5e-4, 
-        betas=(0.9, 0.999), 
-        weight_decay=0.1),
+        type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=12,
         layer_decay_rate=0.75,
@@ -49,10 +44,7 @@
 
 # hooks
 default_hooks = dict(
-    checkpoint=dict(
-        save_best='coco/AP',
-        rule='greater',
-        max_keep_ckpts=1))
+    checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -89,8 +81,7 @@
         conv_kernel_sizes=[3],
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
         decoder=codec,
-        upsample=4
-    ),
+        upsample=4),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
index 9cc93eae31..1ee050f1e3 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
@@ -1,6 +1,4 @@
-_base_ = [
-    '../../../_base_/default_runtime.py'
-]
+_base_ = ['../../../_base_/default_runtime.py']
 
 # runtime
 train_cfg = dict(max_epochs=210, val_interval=10)
@@ -11,7 +9,8 @@
     allow_failed_imports=False)
 
 optim_wrapper = dict(
-    optimizer=dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    optimizer=dict(
+        type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=32,
         layer_decay_rate=0.85,
@@ -45,10 +44,7 @@
 
 # hooks
 default_hooks = dict(
-    checkpoint=dict(
-        save_best='coco/AP',
-        rule='greater',
-        max_keep_ckpts=1))
+    checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -68,7 +64,7 @@
             embed_dims=1280,
             num_layers=32,
             num_heads=16,
-            feedforward_channels=1280*4,
+            feedforward_channels=1280 * 4,
         ),
         img_size=(256, 192),
         patch_size=16,
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
index 493b65b3f8..2543bb8868 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
@@ -1,6 +1,4 @@
-_base_ = [
-    '../../../_base_/default_runtime.py'
-]
+_base_ = ['../../../_base_/default_runtime.py']
 
 # runtime
 train_cfg = dict(max_epochs=210, val_interval=10)
@@ -11,7 +9,8 @@
     allow_failed_imports=False)
 
 optim_wrapper = dict(
-    optimizer=dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    optimizer=dict(
+        type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=32,
         layer_decay_rate=0.85,
@@ -45,10 +44,7 @@
 
 # hooks
 default_hooks = dict(
-    checkpoint=dict(
-        save_best='coco/AP',
-        rule='greater',
-        max_keep_ckpts=1))
+    checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
@@ -68,7 +64,7 @@
             embed_dims=1280,
             num_layers=32,
             num_heads=16,
-            feedforward_channels=1280*4,
+            feedforward_channels=1280 * 4,
         ),
         img_size=(256, 192),
         patch_size=16,
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
index 4ef260b088..407f0f94bd 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
@@ -1,6 +1,4 @@
-_base_ = [
-    '../../../_base_/default_runtime.py'
-]
+_base_ = ['../../../_base_/default_runtime.py']
 
 # runtime
 train_cfg = dict(max_epochs=210, val_interval=10)
@@ -12,10 +10,7 @@
 
 optim_wrapper = dict(
     optimizer=dict(
-        type='AdamW',
-        lr=5e-4,
-        betas=(0.9, 0.999),
-        weight_decay=0.1),
+        type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=16,
         layer_decay_rate=0.8,
@@ -49,10 +44,7 @@
 
 # hooks
 default_hooks = dict(
-    checkpoint=dict(
-        save_best='coco/AP',
-        rule='greater',
-        max_keep_ckpts=1))
+    checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
index c5e3cbb198..af0f7aaa71 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
@@ -1,6 +1,4 @@
-_base_ = [
-    '../../../_base_/default_runtime.py'
-]
+_base_ = ['../../../_base_/default_runtime.py']
 
 # runtime
 train_cfg = dict(max_epochs=210, val_interval=10)
@@ -12,10 +10,7 @@
 
 optim_wrapper = dict(
     optimizer=dict(
-        type='AdamW',
-        lr=5e-4,
-        betas=(0.9, 0.999),
-        weight_decay=0.1),
+        type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=24,
         layer_decay_rate=0.8,
@@ -49,10 +44,7 @@
 
 # hooks
 default_hooks = dict(
-    checkpoint=dict(
-        save_best='coco/AP',
-        rule='greater',
-        max_keep_ckpts=1))
+    checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
 
 # codec settings
 codec = dict(
diff --git a/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py b/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
index 221966d64d..6513e5593d 100644
--- a/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
+++ b/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
@@ -1,11 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dist.utils import get_dist_info
 from mmengine.optim import DefaultOptimWrapperConstructor
 from mmengine.registry import OPTIM_WRAPPER_CONSTRUCTORS
-from mmengine.dist.utils import get_dist_info
 
 
 def get_num_layer_for_vit(var_name, num_max_layer):
-    if var_name in ('backbone.cls_token', 'backbone.mask_token', 'backbone.pos_embed'):
+    if var_name in ('backbone.cls_token', 'backbone.mask_token',
+                    'backbone.pos_embed'):
         return 0
     elif var_name.startswith('backbone.patch_embed'):
         return 0
@@ -15,8 +16,10 @@ def get_num_layer_for_vit(var_name, num_max_layer):
     else:
         return num_max_layer - 1
 
+
 @OPTIM_WRAPPER_CONSTRUCTORS.register_module(force=True)
 class LayerDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor):
+
     def __init__(self, optim_wrapper_cfg, paramwise_cfg=None):
         super().__init__(optim_wrapper_cfg, paramwise_cfg=None)
         self.layer_decay_rate = paramwise_cfg.get('layer_decay_rate', 0.5)
@@ -33,9 +36,8 @@ def add_params(self, params, module, prefix='', lr=None):
         for name, param in module.named_parameters():
             if not param.requires_grad:
                 continue  # frozen weights
-            if (len(param.shape) == 1
-                or name.endswith('.bias')
-                or 'pos_embed' in name):
+            if (len(param.shape) == 1 or name.endswith('.bias')
+                    or 'pos_embed' in name):
                 group_name = 'no_decay'
                 this_weight_decay = 0.
             else:
@@ -45,15 +47,15 @@ def add_params(self, params, module, prefix='', lr=None):
             group_name = 'layer_%d_%s' % (layer_id, group_name)
 
             if group_name not in parameter_groups:
-                scale = layer_decay_rate ** (num_layers - layer_id - 1)
+                scale = layer_decay_rate**(num_layers - layer_id - 1)
 
                 parameter_groups[group_name] = {
                     'weight_decay': this_weight_decay,
                     'params': [],
-                    'param_names': [], 
-                    'lr_scale': scale, 
-                    'group_name': group_name, 
-                    'lr': scale * self.base_lr, 
+                    'param_names': [],
+                    'lr_scale': scale,
+                    'group_name': group_name,
+                    'lr': scale * self.base_lr,
                 }
 
             parameter_groups[group_name]['params'].append(param)
@@ -63,10 +65,9 @@ def add_params(self, params, module, prefix='', lr=None):
             to_display = {}
             for key in parameter_groups:
                 to_display[key] = {
-                    'param_names': parameter_groups[key]['param_names'], 
-                    'lr_scale': parameter_groups[key]['lr_scale'], 
-                    'lr': parameter_groups[key]['lr'], 
-                    'weight_decay': parameter_groups[key]['weight_decay'], 
+                    'param_names': parameter_groups[key]['param_names'],
+                    'lr_scale': parameter_groups[key]['lr_scale'],
+                    'lr': parameter_groups[key]['lr'],
+                    'weight_decay': parameter_groups[key]['weight_decay'],
                 }
         params.extend(parameter_groups.values())
-
diff --git a/mmpose/models/heads/base_head.py b/mmpose/models/heads/base_head.py
index e9fd3da057..b9b4825d21 100644
--- a/mmpose/models/heads/base_head.py
+++ b/mmpose/models/heads/base_head.py
@@ -94,8 +94,7 @@ def _transform_inputs(
                         input=F.relu(inputs),
                         scale_factor=self.upsample,
                         mode='bilinear',
-                        align_corners=self.align_corners
-                        )
+                        align_corners=self.align_corners)
             else:
                 inputs = tuple(feats[i] for i in self.input_index)
         else:
diff --git a/mmpose/models/heads/heatmap_heads/heatmap_head.py b/mmpose/models/heads/heatmap_heads/heatmap_head.py
index 4da49744fe..b391bc67e3 100644
--- a/mmpose/models/heads/heatmap_heads/heatmap_head.py
+++ b/mmpose/models/heads/heatmap_heads/heatmap_head.py
@@ -85,7 +85,8 @@ def __init__(self,
                      type='KeypointMSELoss', use_target_weight=True),
                  decoder: OptConfigType = None,
                  init_cfg: OptConfigType = None,
-                 upsample=0, ):
+                 upsample=0,
+    ):
 
         if init_cfg is None:
             init_cfg = self.default_init_cfg

From 4befb89804d264224e145c3485996ea59026653a Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Mon, 12 Dec 2022 18:37:39 +0800
Subject: [PATCH 06/14] fix indentation and import order

---
 mmpose/models/heads/base_head.py              |  2 +-
 .../heads/heatmap_heads/heatmap_head.py       | 33 ++++++++++---------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/mmpose/models/heads/base_head.py b/mmpose/models/heads/base_head.py
index b9b4825d21..40da595051 100644
--- a/mmpose/models/heads/base_head.py
+++ b/mmpose/models/heads/base_head.py
@@ -9,10 +9,10 @@
 from mmengine.structures import InstanceData
 from torch import Tensor
 
+from mmpose.models.utils.ops import resize
 from mmpose.utils.tensor_utils import to_numpy
 from mmpose.utils.typing import (Features, InstanceList, OptConfigType,
                                  OptSampleList, Predictions)
-from mmpose.models.utils.ops import resize
 
 
 class BaseHead(BaseModule, metaclass=ABCMeta):
diff --git a/mmpose/models/heads/heatmap_heads/heatmap_head.py b/mmpose/models/heads/heatmap_heads/heatmap_head.py
index b391bc67e3..937b15da59 100644
--- a/mmpose/models/heads/heatmap_heads/heatmap_head.py
+++ b/mmpose/models/heads/heatmap_heads/heatmap_head.py
@@ -70,22 +70,23 @@ class HeatmapHead(BaseHead):
 
     _version = 2
 
-    def __init__(self,
-                 in_channels: Union[int, Sequence[int]],
-                 out_channels: int,
-                 deconv_out_channels: OptIntSeq = (256, 256, 256),
-                 deconv_kernel_sizes: OptIntSeq = (4, 4, 4),
-                 conv_out_channels: OptIntSeq = None,
-                 conv_kernel_sizes: OptIntSeq = None,
-                 has_final_layer: bool = True,
-                 input_transform: str = 'select',
-                 input_index: Union[int, Sequence[int]] = -1,
-                 align_corners: bool = False,
-                 loss: ConfigType = dict(
-                     type='KeypointMSELoss', use_target_weight=True),
-                 decoder: OptConfigType = None,
-                 init_cfg: OptConfigType = None,
-                 upsample=0,
+    def __init__(
+        self,
+        in_channels: Union[int, Sequence[int]],
+        out_channels: int,
+        deconv_out_channels: OptIntSeq = (256, 256, 256),
+        deconv_kernel_sizes: OptIntSeq = (4, 4, 4),
+        conv_out_channels: OptIntSeq = None,
+        conv_kernel_sizes: OptIntSeq = None,
+        has_final_layer: bool = True,
+        input_transform: str = 'select',
+        input_index: Union[int, Sequence[int]] = -1,
+        align_corners: bool = False,
+        loss: ConfigType = dict(
+            type='KeypointMSELoss', use_target_weight=True),
+        decoder: OptConfigType = None,
+        init_cfg: OptConfigType = None,
+        upsample=0,
     ):
 
         if init_cfg is None:

From 73bc29b35d23a7cdcaca0ef6db448eeeb3c99470 Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Fri, 23 Dec 2022 15:56:46 +0800
Subject: [PATCH 07/14] rename and add algorithm description

---
 ...se-base-simple_8xb64-210e_coco-256x192.py} |  0
 ...m_ViTPose-base_8xb64-210e_coco-256x192.py} |  0
 ...se-huge-simple_8xb64-210e_coco-256x192.py} |  0
 ...m_ViTPose-huge_8xb64-210e_coco-256x192.py} |  0
 ...e-large-simple_8xb64-210e_coco-256x192.py} |  0
 ..._ViTPose-large_8xb64-210e_coco-256x192.py} |  0
 docs/src/papers/algorithms/vitpose.md         | 45 +++++++++++++++++++
 7 files changed, 45 insertions(+)
 rename configs/body_2d_keypoint/topdown_heatmap/coco/{ViTPose_base_simple_coco_256x192.py => td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py} (100%)
 rename configs/body_2d_keypoint/topdown_heatmap/coco/{ViTPose_base_coco_256x192.py => td-hm_ViTPose-base_8xb64-210e_coco-256x192.py} (100%)
 rename configs/body_2d_keypoint/topdown_heatmap/coco/{ViTPose_huge_simple_coco_256x192.py => td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py} (100%)
 rename configs/body_2d_keypoint/topdown_heatmap/coco/{ViTPose_huge_coco_256x192.py => td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py} (100%)
 rename configs/body_2d_keypoint/topdown_heatmap/coco/{ViTPose_large_simple_coco_256x192.py => td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py} (100%)
 rename configs/body_2d_keypoint/topdown_heatmap/coco/{ViTPose_large_coco_256x192.py => td-hm_ViTPose-large_8xb64-210e_coco-256x192.py} (100%)
 create mode 100644 docs/src/papers/algorithms/vitpose.md

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
similarity index 100%
rename from configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_simple_coco_256x192.py
rename to configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py
similarity index 100%
rename from configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_base_coco_256x192.py
rename to configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
similarity index 100%
rename from configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_simple_coco_256x192.py
rename to configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
similarity index 100%
rename from configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py
rename to configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
similarity index 100%
rename from configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_simple_coco_256x192.py
rename to configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
similarity index 100%
rename from configs/body_2d_keypoint/topdown_heatmap/coco/ViTPose_large_coco_256x192.py
rename to configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
diff --git a/docs/src/papers/algorithms/vitpose.md b/docs/src/papers/algorithms/vitpose.md
new file mode 100644
index 0000000000..d180afe353
--- /dev/null
+++ b/docs/src/papers/algorithms/vitpose.md
@@ -0,0 +1,45 @@
+# ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation
+
+  
+
+<!-- [ALGORITHM] -->
+
+  
+
+<details>
+
+<summary  align="right"><a  href="https://arxiv.org/abs/2204.12484">ViTPose</a></summary>
+
+  
+
+```bibtex
+
+@misc{https://doi.org/10.48550/arxiv.2204.12484,
+  doi = {10.48550/ARXIV.2204.12484},
+  url = {https://arxiv.org/abs/2204.12484},
+  author = {Xu, Yufei and Zhang, Jing and Zhang, Qiming and Tao, Dacheng},
+  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title = {ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation},
+  publisher = {arXiv},
+  year = {2022},
+  copyright = {arXiv.org perpetual, non-exclusive license}
+}
+
+```
+
+  
+
+</details>
+
+  
+
+## Abstract
+
+  
+
+<!-- [ABSTRACT] -->
+
+  
+
+
+Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art, i.e., 80.9 AP on the MS COCO test-dev set.
\ No newline at end of file

From 8f0e873339ac8990c9de17aca0df274252f592cd Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Tue, 27 Dec 2022 23:24:53 +0800
Subject: [PATCH 08/14] follow changes in original repo

---
 .../td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py   | 7 +------
 .../coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py     | 7 +------
 .../coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py    | 2 +-
 3 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
index 2543bb8868..d49e625cbd 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
@@ -60,12 +60,7 @@
         bgr_to_rgb=True),
     backbone=dict(
         type='mmcls.VisionTransformer',
-        arch=dict(
-            embed_dims=1280,
-            num_layers=32,
-            num_heads=16,
-            feedforward_channels=1280 * 4,
-        ),
+        arch='huge',
         img_size=(256, 192),
         patch_size=16,
         qkv_bias=True,
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
index 1ee050f1e3..0fc03d1f42 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
@@ -60,12 +60,7 @@
         bgr_to_rgb=True),
     backbone=dict(
         type='mmcls.VisionTransformer',
-        arch=dict(
-            embed_dims=1280,
-            num_layers=32,
-            num_heads=16,
-            feedforward_channels=1280 * 4,
-        ),
+        arch='huge',
         img_size=(256, 192),
         patch_size=16,
         qkv_bias=True,
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
index 407f0f94bd..0a334c81db 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
@@ -12,7 +12,7 @@
     optimizer=dict(
         type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
-        num_layers=16,
+        num_layers=24,
         layer_decay_rate=0.8,
         custom_keys={
             'bias': dict(decay_multi=0.0),

From 4d6102cfd4c0774716504b50c3809797fe437adc Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Wed, 1 Feb 2023 11:27:14 +0800
Subject: [PATCH 09/14] Correct structure for simple decoders

---
 ...ose-base-simple_8xb64-210e_coco-256x192.py |   5 +-
 ...ose-huge-simple_8xb64-210e_coco-256x192.py |   5 +-
 ...se-large-simple_8xb64-210e_coco-256x192.py |   5 +-
 ...m_ViTPose-small_8xb64-210e_coco-256x192.py | 153 ++++++++++++++++++
 .../heads/heatmap_heads/heatmap_head.py       |  23 ++-
 5 files changed, 179 insertions(+), 12 deletions(-)
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
index 9aee1b9e29..677bd2d33d 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
@@ -77,11 +77,10 @@
         out_channels=17,
         deconv_out_channels=[],
         deconv_kernel_sizes=[],
-        conv_out_channels=[17],
-        conv_kernel_sizes=[3],
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
         decoder=codec,
-        upsample=4),
+        extra=dict(upsample=4, final_conv_kernel=3),
+    ),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
index d49e625cbd..9c967fa4a0 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
@@ -77,11 +77,10 @@
         out_channels=17,
         deconv_out_channels=[],
         deconv_kernel_sizes=[],
-        conv_out_channels=[17],
-        conv_kernel_sizes=[3],
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
         decoder=codec,
-        upsample=4),
+        extra=dict(upsample=4, final_conv_kernel=3),
+    ),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
index af0f7aaa71..1bbd956f55 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
@@ -77,11 +77,10 @@
         out_channels=17,
         deconv_out_channels=[],
         deconv_kernel_sizes=[],
-        conv_out_channels=[17],
-        conv_kernel_sizes=[3],
         loss=dict(type='KeypointMSELoss', use_target_weight=True),
         decoder=codec,
-        upsample=4),
+        extra=dict(upsample=4, final_conv_kernel=3),
+    ),
     test_cfg=dict(
         flip_test=True,
         flip_mode='heatmap',
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000000..4996a72112
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py
@@ -0,0 +1,153 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    allow_failed_imports=False)
+
+optim_wrapper = dict(
+    optimizer=dict(
+        type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=12,
+        layer_decay_rate=0.9,
+        custom_keys={
+            'bias': dict(decay_multi=0.0),
+            'pos_embed': dict(decay_mult=0.0),
+            'relative_position_bias_table': dict(decay_mult=0.0),
+            'norm': dict(decay_mult=0.0),
+        },
+    ),
+    constructor='LayerDecayOptimWrapperConstructor',
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', begin=0, end=500, start_factor=0.001,
+        by_epoch=False),  # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=210,
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+    checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+    type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+    type='TopdownPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='mmcls.VisionTransformer',
+        arch={
+            'embed_dims': 384,
+            'num_layers': 12,
+            'num_heads': 12,
+            'feedforward_channels': 384 * 4
+        },
+        img_size=(256, 192),
+        patch_size=16,
+        qkv_bias=True,
+        drop_path_rate=0.3,
+        with_cls_token=False,
+        output_cls_token=False,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrained/mae_pretrain_vit_small.pth'),
+    ),
+    head=dict(
+        type='HeatmapHead',
+        in_channels=384,
+        out_channels=17,
+        deconv_out_channels=(256, 256),
+        deconv_kernel_sizes=(4, 4),
+        loss=dict(type='KeypointMSELoss', use_target_weight=True),
+        decoder=codec),
+    test_cfg=dict(
+        flip_test=True,
+        flip_mode='heatmap',
+        shift_heatmap=False,
+    ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='RandomHalfBody'),
+    dict(type='RandomBBoxTransform'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='PackPoseInputs')
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        bbox_file='data/coco/person_detection_results/'
+        'COCO_val2017_detections_AP_H_56_person.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/mmpose/models/heads/heatmap_heads/heatmap_head.py b/mmpose/models/heads/heatmap_heads/heatmap_head.py
index 937b15da59..b5cf58523a 100644
--- a/mmpose/models/heads/heatmap_heads/heatmap_head.py
+++ b/mmpose/models/heads/heatmap_heads/heatmap_head.py
@@ -64,6 +64,8 @@ class HeatmapHead(BaseHead):
             keypoint coordinates from the network output. Defaults to ``None``
         init_cfg (Config, optional): Config to control the initialization. See
             :attr:`default_init_cfg` for default settings
+        extra (dict, optional): Extra configurations.
+            Defaults to ``None``
 
     .. _`Simple Baselines`: https://arxiv.org/abs/1804.06208
     """
@@ -86,7 +88,7 @@ def __init__(
             type='KeypointMSELoss', use_target_weight=True),
         decoder: OptConfigType = None,
         init_cfg: OptConfigType = None,
-        upsample=0,
+        extra=None
     ):
 
         if init_cfg is None:
@@ -104,7 +106,21 @@ def __init__(
             self.decoder = KEYPOINT_CODECS.build(decoder)
         else:
             self.decoder = None
-        self.upsample = upsample
+        self.upsample = 0
+
+        if extra is not None and not isinstance(extra, dict):
+            raise TypeError('extra should be dict or None.')
+        
+        kernel_size = 1
+        padding = 0
+        if extra is not None:
+            if 'upsample' in extra:
+                self.upsample = extra['upsample']
+            if 'final_conv_kernel' in extra:
+                assert extra['final_conv_kernel'] in [1, 3]
+                if extra['final_conv_kernel'] == 3:
+                    padding = 1
+                kernel_size = extra['final_conv_kernel']
 
         # Get model input channels according to feature
         in_channels = self._get_in_channels()
@@ -153,7 +169,8 @@ def __init__(
                 type='Conv2d',
                 in_channels=in_channels,
                 out_channels=out_channels,
-                kernel_size=1)
+                padding=padding,
+                kernel_size=kernel_size)
             self.final_layer = build_conv_layer(cfg)
         else:
             self.final_layer = nn.Identity()

From c46ffc6c7d9b50b107cf57af086aed51c25c49f3 Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Wed, 15 Feb 2023 21:16:29 +0800
Subject: [PATCH 10/14] Change configs, add val results, rename folder

---
 ...ose-base-simple_8xb64-210e_coco-256x192.py |   3 +-
 ...hm_ViTPose-base_8xb64-210e_coco-256x192.py |   3 +-
 ...ose-huge-simple_8xb64-210e_coco-256x192.py |   3 +-
 ...hm_ViTPose-huge_8xb64-210e_coco-256x192.py |   3 +-
 ...se-large-simple_8xb64-210e_coco-256x192.py |   3 +-
 ...m_ViTPose-large_8xb64-210e_coco-256x192.py |   3 +-
 ...se-small-simple_8xb64-210e_coco-256x192.py | 156 ++++++++++++++++++
 ...m_ViTPose-small_8xb64-210e_coco-256x192.py |   7 +-
 .../topdown_heatmap/coco/vitpose_coco.md      |  59 +++++++
 .../__init__.py                               |   0
 .../layer_decay_optim_wrapper.py              |   0
 11 files changed, 231 insertions(+), 9 deletions(-)
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py
 create mode 100644 configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
 rename mmpose/engine/{optim_wrapper => optim_wrappers}/__init__.py (100%)
 rename mmpose/engine/{optim_wrapper => optim_wrappers}/layer_decay_optim_wrapper.py (100%)

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
index 677bd2d33d..b3edbfa4b2 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
@@ -5,7 +5,7 @@
 
 # optimizer
 custom_imports = dict(
-    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
     allow_failed_imports=False)
 
 optim_wrapper = dict(
@@ -67,6 +67,7 @@
         drop_path_rate=0.3,
         with_cls_token=False,
         output_cls_token=False,
+        patch_cfg=dict(padding=2),
         init_cfg=dict(
             type='Pretrained',
             checkpoint='pretrained/mae_pretrain_vit_base.pth'),
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py
index a7577c3ef8..f1fbd2d857 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py
@@ -5,7 +5,7 @@
 
 # optimizer
 custom_imports = dict(
-    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
     allow_failed_imports=False)
 
 optim_wrapper = dict(
@@ -67,6 +67,7 @@
         drop_path_rate=0.3,
         with_cls_token=False,
         output_cls_token=False,
+        patch_cfg=dict(padding=2),
         init_cfg=dict(
             type='Pretrained',
             checkpoint='pretrained/mae_pretrain_vit_base.pth'),
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
index 9c967fa4a0..797192cb25 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
@@ -5,7 +5,7 @@
 
 # optimizer
 custom_imports = dict(
-    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
     allow_failed_imports=False)
 
 optim_wrapper = dict(
@@ -67,6 +67,7 @@
         drop_path_rate=0.55,
         with_cls_token=False,
         output_cls_token=False,
+        patch_cfg=dict(padding=2),
         init_cfg=dict(
             type='Pretrained',
             checkpoint='pretrained/mae_pretrain_vit_huge.pth'),
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
index 0fc03d1f42..43df966568 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
@@ -5,7 +5,7 @@
 
 # optimizer
 custom_imports = dict(
-    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
     allow_failed_imports=False)
 
 optim_wrapper = dict(
@@ -67,6 +67,7 @@
         drop_path_rate=0.55,
         with_cls_token=False,
         output_cls_token=False,
+        patch_cfg=dict(padding=2),
         init_cfg=dict(
             type='Pretrained',
             checkpoint='pretrained/mae_pretrain_vit_huge.pth'),
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
index 1bbd956f55..9413665e6a 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
@@ -5,7 +5,7 @@
 
 # optimizer
 custom_imports = dict(
-    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
     allow_failed_imports=False)
 
 optim_wrapper = dict(
@@ -67,6 +67,7 @@
         drop_path_rate=0.5,
         with_cls_token=False,
         output_cls_token=False,
+        patch_cfg=dict(padding=2),
         init_cfg=dict(
             type='Pretrained',
             checkpoint='pretrained/mae_pretrain_vit_large.pth'),
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
index 0a334c81db..3f67f9999f 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
@@ -5,7 +5,7 @@
 
 # optimizer
 custom_imports = dict(
-    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
     allow_failed_imports=False)
 
 optim_wrapper = dict(
@@ -67,6 +67,7 @@
         drop_path_rate=0.5,
         with_cls_token=False,
         output_cls_token=False,
+        patch_cfg=dict(padding=2),
         init_cfg=dict(
             type='Pretrained',
             checkpoint='pretrained/mae_pretrain_vit_large.pth'),
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000000..fdd8428891
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py
@@ -0,0 +1,156 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+    imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+    allow_failed_imports=False)
+
+optim_wrapper = dict(
+    optimizer=dict(
+        type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+    paramwise_cfg=dict(
+        num_layers=12,
+        layer_decay_rate=0.8,
+        custom_keys={
+            'bias': dict(decay_multi=0.0),
+            'pos_embed': dict(decay_mult=0.0),
+            'relative_position_bias_table': dict(decay_mult=0.0),
+            'norm': dict(decay_mult=0.0),
+        },
+    ),
+    constructor='LayerDecayOptimWrapperConstructor',
+    clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', begin=0, end=500, start_factor=0.001,
+        by_epoch=False),  # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=210,
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+    checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+    type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+    type='TopdownPoseEstimator',
+    data_preprocessor=dict(
+        type='PoseDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='mmcls.VisionTransformer',
+        arch={
+            'embed_dims': 384,
+            'num_layers': 12,
+            'num_heads': 12,
+            'feedforward_channels': 384 * 4
+        },
+        img_size=(256, 192),
+        patch_size=16,
+        qkv_bias=True,
+        drop_path_rate=0.1,
+        with_cls_token=False,
+        output_cls_token=False,
+        patch_cfg=dict(padding=2),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrained/mae_pretrain_vit_small.pth'),
+    ),
+    head=dict(
+        type='HeatmapHead',
+        in_channels=384,
+        out_channels=17,
+        deconv_out_channels=[],
+        deconv_kernel_sizes=[],
+        loss=dict(type='KeypointMSELoss', use_target_weight=True),
+        decoder=codec,
+        extra=dict(upsample=4, final_conv_kernel=3),
+    ),
+    test_cfg=dict(
+        flip_test=True,
+        flip_mode='heatmap',
+        shift_heatmap=False,
+    ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='RandomFlip', direction='horizontal'),
+    dict(type='RandomHalfBody'),
+    dict(type='RandomBBoxTransform'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='GenerateTarget', encoder=codec),
+    dict(type='PackPoseInputs')
+]
+val_pipeline = [
+    dict(type='LoadImage', file_client_args={{_base_.file_client_args}}),
+    dict(type='GetBBoxCenterScale'),
+    dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+    dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline,
+    ))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_mode=data_mode,
+        ann_file='annotations/person_keypoints_val2017.json',
+        bbox_file='data/coco/person_detection_results/'
+        'COCO_val2017_detections_AP_H_56_person.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=val_pipeline,
+    ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py
index 4996a72112..f50ce7a9c7 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py
@@ -5,7 +5,7 @@
 
 # optimizer
 custom_imports = dict(
-    imports=['mmpose.engine.optim_wrapper.layer_decay_optim_wrapper'],
+    imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
     allow_failed_imports=False)
 
 optim_wrapper = dict(
@@ -13,7 +13,7 @@
         type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
     paramwise_cfg=dict(
         num_layers=12,
-        layer_decay_rate=0.9,
+        layer_decay_rate=0.8,
         custom_keys={
             'bias': dict(decay_multi=0.0),
             'pos_embed': dict(decay_mult=0.0),
@@ -69,9 +69,10 @@
         img_size=(256, 192),
         patch_size=16,
         qkv_bias=True,
-        drop_path_rate=0.3,
+        drop_path_rate=0.1,
         with_cls_token=False,
         output_cls_token=False,
+        patch_cfg=dict(padding=2),
         init_cfg=dict(
             type='Pretrained',
             checkpoint='pretrained/mae_pretrain_vit_small.pth'),
diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
new file mode 100644
index 0000000000..34612df7e7
--- /dev/null
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
@@ -0,0 +1,59 @@
+<!-- [ALGORITHM] -->
+
+<details>
+<summary align="right"><a href="https://arxiv.org/abs/2204.12484">ViTPose</a></summary>
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2204.12484,
+  doi = {10.48550/ARXIV.2204.12484},
+  url = {https://arxiv.org/abs/2204.12484},
+  author = {Xu, Yufei and Zhang, Jing and Zhang, Qiming and Tao, Dacheng},
+  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title = {ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation},
+  publisher = {arXiv},
+  year = {2022},
+  copyright = {arXiv.org perpetual, non-exclusive license}
+}
+```
+
+</details>
+
+<!-- [DATASET] -->
+
+<details>
+<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
+
+```bibtex
+@inproceedings{lin2014microsoft,
+  title={Microsoft COCO: Common Objects in Context},
+  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+  booktitle={European Conference on Computer Vision (ECCV)},
+  year={2014}
+}
+```
+
+</details>
+
+Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset
+
+> With classic decoder
+
+| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.739 | 0.903 | 0.816 | 0.792 | 0.942 | [ckpt](<>) |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.905 | 0.829 | 0.810 | 0.946 | [ckpt](<>) |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.782 | 0.914 | 0.850 | 0.834 |  0.952 | [ckpt](<>) |
+| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.790 | 0.916 | 0.857 | 0.840 | 0.953 | [ckpt](<>) |
+
+*Models with \* are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose). The config files of these models are only for validation.*
+
+> With simple decoder
+
+| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.736 | 0.900 | 0.811 | 0.790 | 0.940 | [ckpt](<>) |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.754 | 0.905 | 0.825 | 0.807 | 0.945 | [ckpt](<>) |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.781 | 0.914 | 0.853 | 0.833 | 0.952 | [ckpt](<>) |
+| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.789 | 0.916 | 0.856 | 0.839 | 0.953 | [ckpt](<>) |
+
+*Models with \* are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose). The config files of these models are only for validation.*
diff --git a/mmpose/engine/optim_wrapper/__init__.py b/mmpose/engine/optim_wrappers/__init__.py
similarity index 100%
rename from mmpose/engine/optim_wrapper/__init__.py
rename to mmpose/engine/optim_wrappers/__init__.py
diff --git a/mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py b/mmpose/engine/optim_wrappers/layer_decay_optim_wrapper.py
similarity index 100%
rename from mmpose/engine/optim_wrapper/layer_decay_optim_wrapper.py
rename to mmpose/engine/optim_wrappers/layer_decay_optim_wrapper.py

From 9f43c20dac357c713224116004b463eb5759cd48 Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Wed, 15 Feb 2023 21:28:02 +0800
Subject: [PATCH 11/14] Fix formats

---
 docs/src/papers/algorithms/vitpose.md         | 17 +--------
 .../heads/heatmap_heads/heatmap_head.py       | 36 +++++++++----------
 2 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/docs/src/papers/algorithms/vitpose.md b/docs/src/papers/algorithms/vitpose.md
index d180afe353..99fc2650f0 100644
--- a/docs/src/papers/algorithms/vitpose.md
+++ b/docs/src/papers/algorithms/vitpose.md
@@ -1,17 +1,11 @@
 # Deep high-resolution representation learning for human pose estimation
 
-  
-
 <!-- [ALGORITHM] -->
 
-  
-
 <details>
 
 <summary  align="right"><a  href="https://arxiv.org/abs/2204.12484">ViTPose</a></summary>
 
-  
-
 ```bibtex
 
 @misc{https://doi.org/10.48550/arxiv.2204.12484,
@@ -27,19 +21,10 @@
 
 ```
 
-  
-
 </details>
 
-  
-
 ## Abstract
 
-  
-
 <!-- [ABSTRACT] -->
 
-  
-
-
-Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art, i.e., 80.9 AP on the MS COCO test-dev set.
\ No newline at end of file
+Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art, i.e., 80.9 AP on the MS COCO test-dev set.
diff --git a/mmpose/models/heads/heatmap_heads/heatmap_head.py b/mmpose/models/heads/heatmap_heads/heatmap_head.py
index b5cf58523a..02ca7a893a 100644
--- a/mmpose/models/heads/heatmap_heads/heatmap_head.py
+++ b/mmpose/models/heads/heatmap_heads/heatmap_head.py
@@ -72,24 +72,22 @@ class HeatmapHead(BaseHead):
 
     _version = 2
 
-    def __init__(
-        self,
-        in_channels: Union[int, Sequence[int]],
-        out_channels: int,
-        deconv_out_channels: OptIntSeq = (256, 256, 256),
-        deconv_kernel_sizes: OptIntSeq = (4, 4, 4),
-        conv_out_channels: OptIntSeq = None,
-        conv_kernel_sizes: OptIntSeq = None,
-        has_final_layer: bool = True,
-        input_transform: str = 'select',
-        input_index: Union[int, Sequence[int]] = -1,
-        align_corners: bool = False,
-        loss: ConfigType = dict(
-            type='KeypointMSELoss', use_target_weight=True),
-        decoder: OptConfigType = None,
-        init_cfg: OptConfigType = None,
-        extra=None
-    ):
+    def __init__(self,
+                 in_channels: Union[int, Sequence[int]],
+                 out_channels: int,
+                 deconv_out_channels: OptIntSeq = (256, 256, 256),
+                 deconv_kernel_sizes: OptIntSeq = (4, 4, 4),
+                 conv_out_channels: OptIntSeq = None,
+                 conv_kernel_sizes: OptIntSeq = None,
+                 has_final_layer: bool = True,
+                 input_transform: str = 'select',
+                 input_index: Union[int, Sequence[int]] = -1,
+                 align_corners: bool = False,
+                 loss: ConfigType = dict(
+                     type='KeypointMSELoss', use_target_weight=True),
+                 decoder: OptConfigType = None,
+                 init_cfg: OptConfigType = None,
+                 extra=None):
 
         if init_cfg is None:
             init_cfg = self.default_init_cfg
@@ -110,7 +108,7 @@ def __init__(
 
         if extra is not None and not isinstance(extra, dict):
             raise TypeError('extra should be dict or None.')
-        
+
         kernel_size = 1
         padding = 0
         if extra is not None:

From 085477eab3e7a3475d9ee98425332b29953f2c9a Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Fri, 17 Feb 2023 13:51:25 +0800
Subject: [PATCH 12/14] Update markdown file

---
 .../topdown_heatmap/coco/vitpose_coco.md      | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
index 34612df7e7..04d4063128 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
@@ -1,4 +1,4 @@
-<!-- [ALGORITHM] -->
+<!-- [BACKBONE] -->
 
 <details>
 <summary align="right"><a href="https://arxiv.org/abs/2204.12484">ViTPose</a></summary>
@@ -38,22 +38,22 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da
 
 > With classic decoder
 
-| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt |
-| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: |
-| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.739 | 0.903 | 0.816 | 0.792 | 0.942 | [ckpt](<>) |
-| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.905 | 0.829 | 0.810 | 0.946 | [ckpt](<>) |
-| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.782 | 0.914 | 0.850 | 0.834 |  0.952 | [ckpt](<>) |
-| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.790 | 0.916 | 0.857 | 0.840 | 0.953 | [ckpt](<>) |
+| Arch                                                                                                        | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |   ckpt   |
+| :---------------------------------------------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py)  |  256x192   | 0.739 |      0.903      |      0.816      | 0.792 |      0.942      | \[ckpt\] |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py)   |  256x192   | 0.757 |      0.905      |      0.829      | 0.810 |      0.946      | \[ckpt\] |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py)  |  256x192   | 0.782 |      0.914      |      0.850      | 0.834 |      0.952      | \[ckpt\] |
+| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) |  256x192   | 0.790 |      0.916      |      0.857      | 0.840 |      0.953      | \[ckpt\] |
 
 *Models with * are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose).  The config files of these models are only for validation.*
 
 > With simple decoder
 
-| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt |
-| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: |
-| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.736 | 0.900 | 0.811 | 0.790 | 0.940 | [ckpt](<>) |
-| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.754 | 0.905 | 0.825 | 0.807 | 0.945 | [ckpt](<>) |
-| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.781 | 0.914 | 0.853 | 0.833 | 0.952 | [ckpt](<>) |
-| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.789 | 0.916 | 0.856 | 0.839 | 0.953 | [ckpt](<>) |
+| Arch                                                                                                               | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |    ckpt    |
+| :----------------------------------------------------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py)  |  256x192   | 0.736 |      0.900      |      0.811      | 0.790 |      0.940      |  \[ckpt\]  |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py)   |  256x192   | 0.756 |      0.906      |      0.826      | 0.809 |      0.946      |  \[ckpt\]  |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py)  |  256x192   | 0.781 |      0.914      |      0.853      | 0.833 |      0.952      | [ckpt](<>) |
+| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py) |  256x192   | 0.789 |      0.916      |      0.856      | 0.839 |      0.953      |  \[ckpt\]  |
 
 *Models with * are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose).  The config files of these models are only for validation.*

From c611933d06db45b310f8cd4d2fc0b98c0ccd9689 Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Thu, 23 Feb 2023 16:14:11 +0800
Subject: [PATCH 13/14] Update training results

---
 .../topdown_heatmap/coco/vitpose_coco.md      | 28 ++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
index 04d4063128..5cad4cf29e 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
@@ -38,22 +38,18 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da
 
 > With classic decoder
 
-| Arch                                                                                                        | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |   ckpt   |
-| :---------------------------------------------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :------: |
-| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py)  |  256x192   | 0.739 |      0.903      |      0.816      | 0.792 |      0.942      | \[ckpt\] |
-| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py)   |  256x192   | 0.757 |      0.905      |      0.829      | 0.810 |      0.946      | \[ckpt\] |
-| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py)  |  256x192   | 0.782 |      0.914      |      0.850      | 0.834 |      0.952      | \[ckpt\] |
-| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) |  256x192   | 0.790 |      0.916      |      0.857      | 0.840 |      0.953      | \[ckpt\] |
-
-*Models with * are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose).  The config files of these models are only for validation.*
+| Arch                                                                                                       | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |   ckpt   |
+| :--------------------------------------------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) |  256x192   | 0.739 |      0.903      |      0.816      | 0.792 |      0.942      | \[ckpt\] |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py)  |  256x192   | 0.757 |      0.905      |      0.829      | 0.810 |      0.946      | \[ckpt\] |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) |  256x192   | 0.782 |      0.914      |      0.850      | 0.834 |      0.952      | \[ckpt\] |
+| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py)  |  256x192   | 0.788 |      0.915      |      0.855      | 0.840 |      0.954      | \[ckpt\] |
 
 > With simple decoder
 
-| Arch                                                                                                               | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |    ckpt    |
-| :----------------------------------------------------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------: |
-| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py)  |  256x192   | 0.736 |      0.900      |      0.811      | 0.790 |      0.940      |  \[ckpt\]  |
-| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py)   |  256x192   | 0.756 |      0.906      |      0.826      | 0.809 |      0.946      |  \[ckpt\]  |
-| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py)  |  256x192   | 0.781 |      0.914      |      0.853      | 0.833 |      0.952      | [ckpt](<>) |
-| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py) |  256x192   | 0.789 |      0.916      |      0.856      | 0.839 |      0.953      |  \[ckpt\]  |
-
-*Models with * are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose).  The config files of these models are only for validation.*
+| Arch                                                                                                              | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |    ckpt    |
+| :---------------------------------------------------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py) |  256x192   | 0.736 |      0.900      |      0.811      | 0.790 |      0.940      |  \[ckpt\]  |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py)  |  256x192   | 0.756 |      0.906      |      0.826      | 0.809 |      0.946      |  \[ckpt\]  |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py) |  256x192   | 0.781 |      0.914      |      0.853      | 0.833 |      0.952      | [ckpt](<>) |
+| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py)  |  256x192   | 0.789 |      0.916      |      0.856      | 0.839 |      0.953      |  \[ckpt\]  |

From 23b7838f56f29d691d80e46a20d134c382f58305 Mon Sep 17 00:00:00 2001
From: LareinaM <mhsj16lareina@gmail.com>
Date: Fri, 10 Mar 2023 14:03:42 +0800
Subject: [PATCH 14/14] Update markdown file - include testing results from
 original repo - update training results

---
 .../topdown_heatmap/coco/vitpose_coco.md          | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
index 5cad4cf29e..77c1b03124 100644
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
+++ b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
@@ -38,12 +38,15 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da
 
 > With classic decoder
 
-| Arch                                                                                                       | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |   ckpt   |
-| :--------------------------------------------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :------: |
-| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) |  256x192   | 0.739 |      0.903      |      0.816      | 0.792 |      0.942      | \[ckpt\] |
-| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py)  |  256x192   | 0.757 |      0.905      |      0.829      | 0.810 |      0.946      | \[ckpt\] |
-| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) |  256x192   | 0.782 |      0.914      |      0.850      | 0.834 |      0.952      | \[ckpt\] |
-| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py)  |  256x192   | 0.788 |      0.915      |      0.855      | 0.840 |      0.954      | \[ckpt\] |
+| Arch                                                                                                        | Input Size |  AP   | AP<sup>50</sup> | AP<sup>75</sup> |  AR   | AR<sup>50</sup> |   ckpt   |
+| :---------------------------------------------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py)  |  256x192   | 0.739 |      0.903      |      0.816      | 0.792 |      0.942      | \[ckpt\] |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py)   |  256x192   | 0.757 |      0.905      |      0.829      | 0.810 |      0.946      | \[ckpt\] |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py)  |  256x192   | 0.782 |      0.914      |      0.850      | 0.834 |      0.952      | \[ckpt\] |
+| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py)   |  256x192   | 0.788 |      0.917      |      0.855      | 0.839 |      0.954      | \[ckpt\] |
+| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) |  256x192   | 0.790 |      0.916      |      0.857      | 0.840 |      0.953      | \[ckpt\] |
+
+*Models with * are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose).  The config files of these models are only for validation.*
 
 > With simple decoder