From 9ab7286837c22b65ddff2fda5158ae9dc266cbaa Mon Sep 17 00:00:00 2001 From: luochunhua Date: Wed, 23 Feb 2022 07:47:59 +0000 Subject: [PATCH 1/8] update docs for maskformer --- mmdet/models/dense_heads/maskformer_head.py | 84 +++++++++++---------- mmdet/models/detectors/maskformer.py | 2 +- mmdet/models/plugins/pixel_decoder.py | 8 +- 3 files changed, 48 insertions(+), 46 deletions(-) diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 3cd060e53b6..2234b25a1a0 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -28,24 +28,24 @@ class MaskFormerHead(AnchorFreeHead): num_things_classes (int): Number of things. num_stuff_classes (int): Number of stuff. num_queries (int): Number of query in Transformer. - pixel_decoder (obj:`mmcv.ConfigDict`|dict): Config for pixel decoder. + pixel_decoder (obj:`mmcv.ConfigDict` | dict): Config for pixel decoder. Defaults to None. enforce_decoder_input_project (bool, optional): Whether to add a layer to change the embed_dim of tranformer encoder in pixel decoder to the embed_dim of transformer decoder. Defaults to False. - transformer_decoder (obj:`mmcv.ConfigDict`|dict): Config for + transformer_decoder (obj:`mmcv.ConfigDict` | dict): Config for transformer decoder. Defaults to None. - positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + positional_encoding (obj:`mmcv.ConfigDict` | dict): Config for transformer decoder position encoding. Defaults to None. - loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification + loss_cls (obj:`mmcv.ConfigDict` | dict): Config of the classification loss. Defaults to `CrossEntropyLoss`. - loss_mask (obj:`mmcv.ConfigDict`|dict): Config of the mask loss. + loss_mask (obj:`mmcv.ConfigDict` | dict): Config of the mask loss. Defaults to `FocalLoss`. - loss_dice (obj:`mmcv.ConfigDict`|dict): Config of the dice loss. + loss_dice (obj:`mmcv.ConfigDict` | dict): Config of the dice loss. Defaults to `DiceLoss`. - train_cfg (obj:`mmcv.ConfigDict`|dict): Training config of Maskformer + train_cfg (obj:`mmcv.ConfigDict` | dict): Training config of Maskformer head. - test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of Maskformer + test_cfg (obj:`mmcv.ConfigDict` | dict): Testing config of Maskformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to None. @@ -178,11 +178,11 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): Returns: tuple: a tuple containing the following targets. - - labels (list[Tensor]): Ground truth class indices for all\ - images. Each with shape (n, ), n is the sum of number\ - of stuff type and number of instance in a image. - - masks (list[Tensor]): Ground truth mask for each image, each\ - with shape (n, h, w). + - | ``labels`` (list[Tensor]): Ground truth class indices\ + for all images. Each with shape (n, ), n is the sum of\ + number of stuff type and number of instance in a image. + - | ``masks`` (list[Tensor]): Ground truth mask for each\ + image, each with shape (n, h, w). """ num_things_list = [self.num_things_classes] * len(gt_labels_list) num_stuff_list = [self.num_stuff_classes] * len(gt_labels_list) @@ -214,18 +214,18 @@ def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list, Returns: tuple[list[Tensor]]: a tuple containing the following targets. - - labels_list (list[Tensor]): Labels of all images.\ - Each with shape (num_queries, ). 
- - label_weights_list (list[Tensor]): Label weights of all\ - images. Each with shape (num_queries, ). - - mask_targets_list (list[Tensor]): Mask targets of all\ - images. Each with shape (num_queries, h, w). - - mask_weights_list (list[Tensor]): Mask weights of all\ - images. Each with shape (num_queries, ). - - num_total_pos (int): Number of positive samples in all\ - images. - - num_total_neg (int): Number of negative samples in all\ - images. + - | ``labels_list`` (list[Tensor]): Labels of all images.\ + Each with shape (num_queries, ). + - | ``label_weights_list`` (list[Tensor]): Label weights\ + of all images. Each with shape (num_queries, ). + - | ``mask_targets_list`` (list[Tensor]): Mask targets of\ + all images. Each with shape (num_queries, h, w). + - | ``mask_weights_list`` (list[Tensor]): Mask weights of\ + all images. Each with shape (num_queries, ). + - | ``num_total_pos`` (int): Number of positive samples in\ + all images. + - | ``num_total_neg`` (int): Number of negative samples in\ + all images. """ (labels_list, label_weights_list, mask_targets_list, mask_weights_list, pos_inds_list, @@ -257,16 +257,16 @@ def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, Returns: tuple[Tensor]: a tuple containing the following for one image. - - labels (Tensor): Labels of each image. - shape (num_queries, ). - - label_weights (Tensor): Label weights of each image. - shape (num_queries, ). - - mask_targets (Tensor): Mask targets of each image. - shape (num_queries, h, w). - - mask_weights (Tensor): Mask weights of each image. - shape (num_queries, ). - - pos_inds (Tensor): Sampled positive indices for each image. - - neg_inds (Tensor): Sampled negative indices for each image. + - | ``labels`` (Tensor): Labels of each image. + shape (num_queries, ). + - | ``label_weights`` (Tensor): Label weights of each image. + shape (num_queries, ). + - | ``mask_targets`` (Tensor): Mask targets of each image. + shape (num_queries, h, w). + - | ``mask_weights`` (Tensor): Mask weights of each image. + shape (num_queries, ). + - | ``pos_inds`` (Tensor): Sampled positive indices for each image. + - | ``neg_inds`` (Tensor): Sampled negative indices for each image. """ target_shape = mask_pred.shape[-2:] if gt_masks.shape[0] > 0: @@ -444,11 +444,13 @@ def forward(self, feats, img_metas): img_metas (list[dict]): List of image information. Returns: - all_cls_scores (Tensor): Classification scores for each\ + tuple: a tuple contains two elements. + + - | ``all_cls_scores`` (Tensor): Classification scores for each\ scale level. Each is a 4D-tensor with shape\ (num_decoder, batch_size, num_queries, cls_out_channels).\ Note `cls_out_channels` should includes background. - all_mask_preds (Tensor): Mask scores for each decoder\ + - | ``all_mask_preds`` (Tensor): Mask scores for each decoder\ layer. Each with shape (num_decoder, batch_size,\ num_queries, h, w). """ @@ -528,7 +530,7 @@ def forward_train(self, ignored. Defaults to None. Returns: - losses (dict[str, Tensor]): a dictionary of loss components + dict[str, Tensor]: a dictionary of loss components """ # not consider ignoring bboxes assert gt_bboxes_ignore is None @@ -607,8 +609,8 @@ def simple_test(self, feats, img_metas, rescale=False): def post_process(self, mask_cls, mask_pred): """Panoptic segmengation inference. - This implementation is modified from\ - https://github.com/facebookresearch/MaskFormer + This implementation is modified from `MaskFormer + `_. Args: mask_cls (Tensor): Classfication outputs for a image. 
@@ -617,7 +619,7 @@ def post_process(self, mask_cls, mask_pred): shape = (num_queries, h, w). Returns: - panoptic_seg (Tensor): panoptic segment result of shape (h, w),\ + Tensor: panoptic segment result of shape (h, w),\ each element in Tensor means: segment_id = _cls + instance_id * INSTANCE_OFFSET. """ diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index 17c5d6c895c..73676bcdf50 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -7,7 +7,7 @@ class MaskFormer(SingleStageDetector): r"""Implementation of `Per-Pixel Classification is NOT All You Need for Semantic Segmentation - `_""" + `_.""" def __init__(self, backbone, diff --git a/mmdet/models/plugins/pixel_decoder.py b/mmdet/models/plugins/pixel_decoder.py index f69daf46f9a..76a08db0da7 100644 --- a/mmdet/models/plugins/pixel_decoder.py +++ b/mmdet/models/plugins/pixel_decoder.py @@ -96,8 +96,8 @@ def forward(self, feats, img_metas): Returns: tuple: a tuple containing the following: - - mask_feature (Tensor): Shape (batch_size, c, h, w). - - memory (Tensor): Output of last stage of backbone.\ + - | ``mask_feature`` (Tensor): Shape (batch_size, c, h, w). + - | ``memory`` (Tensor): Output of last stage of backbone.\ Shape (batch_size, c, h, w). """ y = self.last_feat_conv(feats[-1]) @@ -201,8 +201,8 @@ def forward(self, feats, img_metas): Returns: tuple: a tuple containing the following: - - mask_feature (Tensor): shape (batch_size, c, h, w). - - memory (Tensor): shape (batch_size, c, h, w). + - | ``mask_feature`` (Tensor): shape (batch_size, c, h, w). + - | ``memory`` (Tensor): shape (batch_size, c, h, w). """ feat_last = feats[-1] bs, c, h, w = feat_last.shape From fd5ff48c899a631892c9384388606aaca0df0efd Mon Sep 17 00:00:00 2001 From: luochunhua Date: Thu, 24 Feb 2022 04:20:35 +0000 Subject: [PATCH 2/8] update readme --- configs/maskformer/README.md | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md index ce1384ae77e..2cd09f9585a 100644 --- a/configs/maskformer/README.md +++ b/configs/maskformer/README.md @@ -2,19 +2,7 @@ ## Abstract -Modern approaches typically formulate semantic segmentation as a per-pixel classification -task, while instance-level segmentation is handled with an alternative mask -classification. Our key insight: mask classification is sufficiently general to solve -both semantic- and instance-level segmentation tasks in a unified manner using -the exact same model, loss, and training procedure. Following this observation, -we propose MaskFormer, a simple mask classification model which predicts a -set of binary masks, each associated with a single global class label prediction. -Overall, the proposed mask classification-based method simplifies the landscape -of effective approaches to semantic and panoptic segmentation tasks and shows -excellent empirical results. In particular, we observe that MaskFormer outperforms -per-pixel classification baselines when the number of classes is large. Our mask -classification-based method outperforms both current state-of-the-art semantic -(55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models. +Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. 
Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models.
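The "set of binary masks, each associated with a single global class label prediction" described in this abstract reduces, for semantic segmentation, to a single tensor contraction at inference time. Below is a minimal sketch of that readout; the tensor names and shapes are illustrative assumptions, not variables from this repository:

```python
import torch
import torch.nn.functional as F

# Hypothetical outputs of a mask-classification head:
# 100 queries, 133 classes (things + stuff) plus one "no object" channel.
num_queries, num_classes, h, w = 100, 133, 64, 64
cls_logits = torch.randn(num_queries, num_classes + 1)
mask_logits = torch.randn(num_queries, h, w)

# Drop the "no object" channel, then combine per-query class probabilities
# with per-query binary-mask probabilities into per-pixel class scores.
cls_probs = F.softmax(cls_logits, dim=-1)[:, :-1]   # (num_queries, num_classes)
mask_probs = mask_logits.sigmoid()                  # (num_queries, h, w)
semseg = torch.einsum('qc,qhw->chw', cls_probs, mask_probs)
pred = semseg.argmax(dim=0)                         # (h, w) per-pixel label map
```

Panoptic inference (the `post_process` method whose docstring is updated in PATCH 1/8) instead assigns each pixel to its highest-scoring query and encodes the result as `segment_id = _cls + instance_id * INSTANCE_OFFSET`.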
@@ -55,6 +43,6 @@ mmdetection ## Results and Models -| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | -| :------: | :-----: | :-----: | :------: | :------------: | :-: | :-: | :-: | :---: | :---: | :---: | :---: | :---: | :---: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------: | :---: | -| R-50 | pytorch | 75e | | | | | | | | | | | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py) | | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | +| :------: | :-----: | :-----: | :------: | :------------: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | From 1d48e8e716442d27606695cf8d8bd5a15f8077d7 Mon Sep 17 00:00:00 2001 From: luochunhua Date: Thu, 24 Feb 2022 06:22:25 +0000 Subject: [PATCH 3/8] update readme format --- configs/maskformer/README.md | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md index 2cd09f9585a..46bb484ed9d 100644 --- a/configs/maskformer/README.md +++ b/configs/maskformer/README.md @@ -1,4 +1,8 @@ -# Per-Pixel Classification is Not All You Need for Semantic Segmentation +# MaskFormer + +> [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) + + ## Abstract @@ -8,18 +12,7 @@ Modern approaches typically formulate semantic segmentation as a per-pixel class
-## Citation - -``` -@inproceedings{cheng2021maskformer, - title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, - author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, - journal={NeurIPS}, - year={2021} -} -``` - -## Dataset +## Introduction MaskFormer requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. The directory should be like this. @@ -43,6 +36,17 @@ mmdetection ## Results and Models -| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | -| :------: | :-----: | :-----: | :------: | :------------: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------: | -| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | +|:--------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:--------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------:| +| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | 
[log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | + +## Citation + +```latex +@inproceedings{cheng2021maskformer, + title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, + author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, + journal={NeurIPS}, + year={2021} +} +``` From 907ef22ecdf3e149b8c5102391453c7ee64a1014 Mon Sep 17 00:00:00 2001 From: luochunhua Date: Thu, 24 Feb 2022 06:48:26 +0000 Subject: [PATCH 4/8] update link --- configs/maskformer/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md index 46bb484ed9d..9ba8dca5648 100644 --- a/configs/maskformer/README.md +++ b/configs/maskformer/README.md @@ -38,7 +38,7 @@ mmdetection | Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | |:--------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:--------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------:| -| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | +| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | ## Citation From 02a1033b409261d3b47cd9d8d986c62382cb31be Mon Sep 17 00:00:00 2001 From: luochunhua Date: Thu, 24 Feb 2022 
07:00:31 +0000 Subject: [PATCH 5/8] update json link --- configs/maskformer/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md index 9ba8dca5648..54110004c94 100644 --- a/configs/maskformer/README.md +++ b/configs/maskformer/README.md @@ -38,7 +38,7 @@ mmdetection | Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | |:--------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:--------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------:| -| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | +| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | ## Citation From ba1edb4d3e1803c4b267edd9561dc2466eab1480 Mon Sep 17 00:00:00 2001 From: luochunhua Date: Thu, 24 Feb 2022 07:18:42 +0000 Subject: [PATCH 6/8] update format of ConfigDict --- .../bbox/assigners/mask_hungarian_assigner.py | 6 +++--- mmdet/models/dense_heads/maskformer_head.py | 20 +++++++++---------- mmdet/models/plugins/pixel_decoder.py | 20 +++++++++---------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/mmdet/core/bbox/assigners/mask_hungarian_assigner.py b/mmdet/core/bbox/assigners/mask_hungarian_assigner.py index ef0f35831d6..d10e62edb84 100644 --- a/mmdet/core/bbox/assigners/mask_hungarian_assigner.py +++ b/mmdet/core/bbox/assigners/mask_hungarian_assigner.py @@ -29,9 +29,9 @@ class MaskHungarianAssigner(BaseAssigner): - positive 
integer: positive sample, index (1-based) of assigned gt Args: - cls_cost (obj:`mmcv.ConfigDict` | dict): Classification cost config. - mask_cost (obj:`mmcv.ConfigDict` | dict): Mask cost config. - dice_cost (obj:`mmcv.ConfigDict` | dict): Dice cost config. + cls_cost (:obj:`mmcv.ConfigDict` | dict): Classification cost config. + mask_cost (:obj:`mmcv.ConfigDict` | dict): Mask cost config. + dice_cost (:obj:`mmcv.ConfigDict` | dict): Dice cost config. """ def __init__(self, diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 2234b25a1a0..347a555774c 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -28,24 +28,24 @@ class MaskFormerHead(AnchorFreeHead): num_things_classes (int): Number of things. num_stuff_classes (int): Number of stuff. num_queries (int): Number of query in Transformer. - pixel_decoder (obj:`mmcv.ConfigDict` | dict): Config for pixel decoder. - Defaults to None. + pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel + decoder. Defaults to None. enforce_decoder_input_project (bool, optional): Whether to add a layer to change the embed_dim of tranformer encoder in pixel decoder to the embed_dim of transformer decoder. Defaults to False. - transformer_decoder (obj:`mmcv.ConfigDict` | dict): Config for + transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer decoder. Defaults to None. - positional_encoding (obj:`mmcv.ConfigDict` | dict): Config for + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for transformer decoder position encoding. Defaults to None. - loss_cls (obj:`mmcv.ConfigDict` | dict): Config of the classification + loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification loss. Defaults to `CrossEntropyLoss`. - loss_mask (obj:`mmcv.ConfigDict` | dict): Config of the mask loss. + loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss. Defaults to `FocalLoss`. - loss_dice (obj:`mmcv.ConfigDict` | dict): Config of the dice loss. + loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss. Defaults to `DiceLoss`. - train_cfg (obj:`mmcv.ConfigDict` | dict): Training config of Maskformer - head. - test_cfg (obj:`mmcv.ConfigDict` | dict): Testing config of Maskformer + train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of + Maskformer head. + test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of Maskformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to None. diff --git a/mmdet/models/plugins/pixel_decoder.py b/mmdet/models/plugins/pixel_decoder.py index 76a08db0da7..4b1ff29fcc1 100644 --- a/mmdet/models/plugins/pixel_decoder.py +++ b/mmdet/models/plugins/pixel_decoder.py @@ -17,17 +17,17 @@ class PixelDecoder(BaseModule): input feature maps. feat_channels (int): Number channels for feature. out_channels (int): Number channels for output. - norm_cfg (obj:`mmcv.ConfigDict`|dict): Config for normalization. + norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization. Defaults to dict(type='GN', num_groups=32). - act_cfg (obj:`mmcv.ConfigDict`|dict): Config for activation. + act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation. Defaults to dict(type='ReLU'). - encoder (obj:`mmcv.ConfigDict`|dict): Config for transorformer + encoder (:obj:`mmcv.ConfigDict` | dict): Config for transorformer encoder.Defaults to None. 
- positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for transformer encoder position encoding. Defaults to dict(type='SinePositionalEncoding', num_feats=128, normalize=True). - init_cfg (obj:`mmcv.ConfigDict`|dict): Initialization config dict. + init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict. Default: None """ @@ -122,17 +122,17 @@ class TransformerEncoderPixelDecoder(PixelDecoder): input feature maps. feat_channels (int): Number channels for feature. out_channels (int): Number channels for output. - norm_cfg (obj:`mmcv.ConfigDict`|dict): Config for normalization. + norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization. Defaults to dict(type='GN', num_groups=32). - act_cfg (obj:`mmcv.ConfigDict`|dict): Config for activation. + act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation. Defaults to dict(type='ReLU'). - encoder (obj:`mmcv.ConfigDict`|dict): Config for transorformer + encoder (:obj:`mmcv.ConfigDict` | dict): Config for transorformer encoder.Defaults to None. - positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for transformer encoder position encoding. Defaults to dict(type='SinePositionalEncoding', num_feats=128, normalize=True). - init_cfg (obj:`mmcv.ConfigDict`|dict): Initialization config dict. + init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict. Default: None """ From 28cbca4e38a261b5194ed5b2f294d633871a91bb Mon Sep 17 00:00:00 2001 From: luochunhua Date: Thu, 24 Feb 2022 07:34:06 +0000 Subject: [PATCH 7/8] update format of function returns --- mmdet/models/dense_heads/maskformer_head.py | 72 ++++++++++----------- mmdet/models/plugins/pixel_decoder.py | 12 ++-- 2 files changed, 39 insertions(+), 45 deletions(-) diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 347a555774c..7d7a644c7e1 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -177,12 +177,11 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): Returns: tuple: a tuple containing the following targets. - - - | ``labels`` (list[Tensor]): Ground truth class indices\ - for all images. Each with shape (n, ), n is the sum of\ - number of stuff type and number of instance in a image. - - | ``masks`` (list[Tensor]): Ground truth mask for each\ - image, each with shape (n, h, w). + - labels (list[Tensor]): Ground truth class indices\ + for all images. Each with shape (n, ), n is the sum of\ + number of stuff type and number of instance in a image. + - masks (list[Tensor]): Ground truth mask for each\ + image, each with shape (n, h, w). """ num_things_list = [self.num_things_classes] * len(gt_labels_list) num_stuff_list = [self.num_stuff_classes] * len(gt_labels_list) @@ -213,19 +212,18 @@ def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list, Returns: tuple[list[Tensor]]: a tuple containing the following targets. - - - | ``labels_list`` (list[Tensor]): Labels of all images.\ - Each with shape (num_queries, ). - - | ``label_weights_list`` (list[Tensor]): Label weights\ - of all images. Each with shape (num_queries, ). - - | ``mask_targets_list`` (list[Tensor]): Mask targets of\ - all images. Each with shape (num_queries, h, w). - - | ``mask_weights_list`` (list[Tensor]): Mask weights of\ - all images. Each with shape (num_queries, ). 
- - | ``num_total_pos`` (int): Number of positive samples in\ - all images. - - | ``num_total_neg`` (int): Number of negative samples in\ - all images. + - labels_list (list[Tensor]): Labels of all images.\ + Each with shape (num_queries, ). + - label_weights_list (list[Tensor]): Label weights\ + of all images. Each with shape (num_queries, ). + - mask_targets_list (list[Tensor]): Mask targets of\ + all images. Each with shape (num_queries, h, w). + - mask_weights_list (list[Tensor]): Mask weights of\ + all images. Each with shape (num_queries, ). + - num_total_pos (int): Number of positive samples in\ + all images. + - num_total_neg (int): Number of negative samples in\ + all images. """ (labels_list, label_weights_list, mask_targets_list, mask_weights_list, pos_inds_list, @@ -256,17 +254,16 @@ def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, Returns: tuple[Tensor]: a tuple containing the following for one image. - - - | ``labels`` (Tensor): Labels of each image. - shape (num_queries, ). - - | ``label_weights`` (Tensor): Label weights of each image. - shape (num_queries, ). - - | ``mask_targets`` (Tensor): Mask targets of each image. - shape (num_queries, h, w). - - | ``mask_weights`` (Tensor): Mask weights of each image. - shape (num_queries, ). - - | ``pos_inds`` (Tensor): Sampled positive indices for each image. - - | ``neg_inds`` (Tensor): Sampled negative indices for each image. + - labels (Tensor): Labels of each image. + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. """ target_shape = mask_pred.shape[-2:] if gt_masks.shape[0] > 0: @@ -445,14 +442,13 @@ def forward(self, feats, img_metas): Returns: tuple: a tuple contains two elements. - - - | ``all_cls_scores`` (Tensor): Classification scores for each\ - scale level. Each is a 4D-tensor with shape\ - (num_decoder, batch_size, num_queries, cls_out_channels).\ - Note `cls_out_channels` should includes background. - - | ``all_mask_preds`` (Tensor): Mask scores for each decoder\ - layer. Each with shape (num_decoder, batch_size,\ - num_queries, h, w). + - all_cls_scores (Tensor): Classification scores for each\ + scale level. Each is a 4D-tensor with shape\ + (num_decoder, batch_size, num_queries, cls_out_channels).\ + Note `cls_out_channels` should includes background. + - all_mask_preds (Tensor): Mask scores for each decoder\ + layer. Each with shape (num_decoder, batch_size,\ + num_queries, h, w). """ batch_size = len(img_metas) input_img_h, input_img_w = img_metas[0]['batch_input_shape'] diff --git a/mmdet/models/plugins/pixel_decoder.py b/mmdet/models/plugins/pixel_decoder.py index 4b1ff29fcc1..d1193551ddd 100644 --- a/mmdet/models/plugins/pixel_decoder.py +++ b/mmdet/models/plugins/pixel_decoder.py @@ -95,10 +95,9 @@ def forward(self, feats, img_metas): Returns: tuple: a tuple containing the following: - - - | ``mask_feature`` (Tensor): Shape (batch_size, c, h, w). - - | ``memory`` (Tensor): Output of last stage of backbone.\ - Shape (batch_size, c, h, w). + - mask_feature (Tensor): Shape (batch_size, c, h, w). + - memory (Tensor): Output of last stage of backbone.\ + Shape (batch_size, c, h, w). 
""" y = self.last_feat_conv(feats[-1]) for i in range(self.num_inputs - 2, -1, -1): @@ -200,9 +199,8 @@ def forward(self, feats, img_metas): Returns: tuple: a tuple containing the following: - - - | ``mask_feature`` (Tensor): shape (batch_size, c, h, w). - - | ``memory`` (Tensor): shape (batch_size, c, h, w). + - mask_feature (Tensor): shape (batch_size, c, h, w). + - memory (Tensor): shape (batch_size, c, h, w). """ feat_last = feats[-1] bs, c, h, w = feat_last.shape From f0a6eaad4ced75afd609f12b120eacbff7adf0e3 Mon Sep 17 00:00:00 2001 From: luochunhua Date: Thu, 24 Feb 2022 07:40:29 +0000 Subject: [PATCH 8/8] uncomment main in deployment/test.py --- tools/deployment/onnx2tensorrt.py | 12 ++++++++++++ tools/deployment/pytorch2onnx.py | 12 ++++++++++++ tools/deployment/test.py | 13 +++++++++++++ 3 files changed, 37 insertions(+) diff --git a/tools/deployment/onnx2tensorrt.py b/tools/deployment/onnx2tensorrt.py index e3e9b57d2b4..b59e52ae199 100644 --- a/tools/deployment/onnx2tensorrt.py +++ b/tools/deployment/onnx2tensorrt.py @@ -252,3 +252,15 @@ def parse_shape(shape): show=args.show, workspace_size=args.workspace_size, verbose=args.verbose) + + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This tool will be deprecated in future. ' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) diff --git a/tools/deployment/pytorch2onnx.py b/tools/deployment/pytorch2onnx.py index c1789b442a7..5c786f8540e 100644 --- a/tools/deployment/pytorch2onnx.py +++ b/tools/deployment/pytorch2onnx.py @@ -343,3 +343,15 @@ def parse_args(): do_simplify=args.simplify, dynamic_export=args.dynamic_export, skip_postprocess=args.skip_postprocess) + + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This tool will be deprecated in future. ' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) diff --git a/tools/deployment/test.py b/tools/deployment/test.py index b32b77332e5..2daf8866e58 100644 --- a/tools/deployment/test.py +++ b/tools/deployment/test.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse +import warnings import mmcv from mmcv import Config, DictAction @@ -141,3 +142,15 @@ def main(): if __name__ == '__main__': main() + + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This tool will be deprecated in future. ' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg)