From 988574fe6c9bf3483b0fecbc16bbe747ae0f88b6 Mon Sep 17 00:00:00 2001
From: wangguanzhong <jerrywgz@126.com>
Date: Tue, 30 Mar 2021 15:27:28 +0800
Subject: [PATCH] add comments for rcnn (#2461)

---
 .../_base_/cascade_mask_rcnn_r50_fpn.yml      |   4 +-
 .../_base_/cascade_rcnn_r50_fpn.yml           |   2 +-
 .../_base_/faster_rcnn_r50_fpn.yml            |   2 +-
 .../cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml  |   4 +-
 .../gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml    |   2 +-
 configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml |   2 +-
 configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml   |   4 +-
 .../hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml |   2 +-
 configs/mask_rcnn/_base_/mask_rcnn_r50.yml    |   2 +-
 .../mask_rcnn/_base_/mask_rcnn_r50_fpn.yml    |   4 +-
 ppdet/modeling/heads/bbox_head.py             |  84 +++++++------
 ppdet/modeling/heads/cascade_head.py          |  75 ++++++++----
 ppdet/modeling/heads/mask_head.py             |  56 ++++++---
 ppdet/modeling/heads/roi_extractor.py         |  25 ++++
 ppdet/modeling/heads/rpn_head.py              | 115 ------------------
 ppdet/modeling/necks/fpn.py                   |  34 +++++-
 .../proposal_generator/anchor_generator.py    |  18 +++
 .../proposal_generator/proposal_generator.py  |  22 ++++
 ppdet/modeling/proposal_generator/rpn_head.py |  28 ++++-
 ppdet/modeling/proposal_generator/target.py   |   7 ++
 .../proposal_generator/target_layer.py        |  67 +++++++++-
 21 files changed, 345 insertions(+), 214 deletions(-)
 delete mode 100644 ppdet/modeling/heads/rpn_head.py

diff --git a/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml b/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml
index 8fef452e859eb..ea2937babd488 100644
--- a/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml
+++ b/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml
@@ -64,7 +64,7 @@ BBoxAssigner:
   use_random: True
 
 CascadeTwoFCHead:
-  mlp_dim: 1024
+  out_channel: 1024
 
 BBoxPostProcess:
   decode:
@@ -88,7 +88,7 @@ MaskHead:
 
 MaskFeat:
   num_convs: 4
-  out_channels: 256
+  out_channel: 256
 
 MaskAssigner:
   mask_resolution: 28
diff --git a/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml b/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml
index 51905687ebbd1..c5afe77434720 100644
--- a/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml
+++ b/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml
@@ -62,7 +62,7 @@ BBoxAssigner:
   use_random: True
 
 CascadeTwoFCHead:
-  mlp_dim: 1024
+  out_channel: 1024
 
 BBoxPostProcess:
   decode:
diff --git a/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml b/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml
index aa5f5b28811e2..38ee81def0cb5 100644
--- a/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml
+++ b/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml
@@ -61,7 +61,7 @@ BBoxAssigner:
   use_random: True
 
 TwoFCHead:
-  mlp_dim: 1024
+  out_channel: 1024
 
 
 BBoxPostProcess:
diff --git a/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml
index 1281148da634b..e2c750dfbe481 100644
--- a/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml
+++ b/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml
@@ -31,7 +31,7 @@ CascadeHead:
 
 CascadeXConvNormHead:
   num_convs: 4
-  mlp_dim: 1024
+  out_channel: 1024
   norm_type: gn
 
 MaskHead:
@@ -45,7 +45,7 @@ MaskHead:
 
 MaskFeat:
   num_convs: 4
-  out_channels: 256
+  out_channel: 256
   norm_type: gn
 
 
diff --git a/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml
index 701b8306ec2e4..2706790ed7730 100644
--- a/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml
+++ b/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml
@@ -21,7 +21,7 @@ CascadeHead:
 
 CascadeXConvNormHead:
   num_convs: 4
-  mlp_dim: 1024
+  out_channel: 1024
   norm_type: gn
 
 
diff --git a/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml
index e8eb567934790..200a98b4b9fb6 100644
--- a/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml
+++ b/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml
@@ -29,7 +29,7 @@ BBoxHead:
 
 XConvNormHead:
   num_convs: 4
-  mlp_dim: 1024
+  out_channel: 1024
   norm_type: gn
 
 
diff --git a/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml
index 2104fa901e194..70beaf5851df9 100644
--- a/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml
+++ b/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml
@@ -31,7 +31,7 @@ BBoxHead:
 
 XConvNormHead:
   num_convs: 4
-  mlp_dim: 1024
+  out_channel: 1024
   norm_type: gn
 
 MaskHead:
@@ -45,7 +45,7 @@ MaskHead:
 
 MaskFeat:
   num_convs: 4
-  out_channels: 256
+  out_channel: 256
   norm_type: gn
 
 
diff --git a/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml b/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml
index cf6645d1e338c..6c556f306fdc2 100644
--- a/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml
+++ b/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml
@@ -57,7 +57,7 @@ BBoxAssigner:
   use_random: True
 
 TwoFCHead:
-  mlp_dim: 1024
+  out_channel: 1024
 
 BBoxPostProcess:
   decode: RCNNBox
diff --git a/configs/mask_rcnn/_base_/mask_rcnn_r50.yml b/configs/mask_rcnn/_base_/mask_rcnn_r50.yml
index aa6e0db56920c..04dab63701171 100644
--- a/configs/mask_rcnn/_base_/mask_rcnn_r50.yml
+++ b/configs/mask_rcnn/_base_/mask_rcnn_r50.yml
@@ -78,7 +78,7 @@ MaskHead:
 
 MaskFeat:
   num_convs: 0
-  out_channels: 256
+  out_channel: 256
 
 MaskAssigner:
   mask_resolution: 14
diff --git a/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml b/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml
index 74004b2810963..dd7587669661a 100644
--- a/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml
+++ b/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml
@@ -61,7 +61,7 @@ BBoxAssigner:
   use_random: True
 
 TwoFCHead:
-  mlp_dim: 1024
+  out_channel: 1024
 
 BBoxPostProcess:
   decode: RCNNBox
@@ -82,7 +82,7 @@ MaskHead:
 
 MaskFeat:
   num_convs: 4
-  out_channels: 256
+  out_channel: 256
 
 MaskAssigner:
   mask_resolution: 28
diff --git a/ppdet/modeling/heads/bbox_head.py b/ppdet/modeling/heads/bbox_head.py
index a6480961cd1b6..0c75f8f1d9deb 100644
--- a/ppdet/modeling/heads/bbox_head.py
+++ b/ppdet/modeling/heads/bbox_head.py
@@ -31,31 +31,40 @@
 
 @register
 class TwoFCHead(nn.Layer):
-    def __init__(self, in_dim=256, mlp_dim=1024, resolution=7):
+    """
+    RCNN bbox head with Two fc layers to extract feature
+
+    Args:
+        in_channel (int): Input channel which can be derived by from_config
+        out_channel (int): Output channel
+        resolution (int): Resolution of input feature map, default 7
+    """
+
+    def __init__(self, in_channel=256, out_channel=1024, resolution=7):
         super(TwoFCHead, self).__init__()
-        self.in_dim = in_dim
-        self.mlp_dim = mlp_dim
-        fan = in_dim * resolution * resolution
+        self.in_channel = in_channel
+        self.out_channel = out_channel
+        fan = in_channel * resolution * resolution
         self.fc6 = nn.Linear(
-            in_dim * resolution * resolution,
-            mlp_dim,
+            in_channel * resolution * resolution,
+            out_channel,
             weight_attr=paddle.ParamAttr(
                 initializer=XavierUniform(fan_out=fan)))
 
         self.fc7 = nn.Linear(
-            mlp_dim,
-            mlp_dim,
+            out_channel,
+            out_channel,
             weight_attr=paddle.ParamAttr(initializer=XavierUniform()))
 
     @classmethod
     def from_config(cls, cfg, input_shape):
         s = input_shape
         s = s[0] if isinstance(s, (list, tuple)) else s
-        return {'in_dim': s.channels}
+        return {'in_channel': s.channels}
 
     @property
     def out_shape(self):
-        return [ShapeSpec(channels=self.mlp_dim, )]
+        return [ShapeSpec(channels=self.out_channel, )]
 
     def forward(self, rois_feat):
         rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1)
@@ -68,34 +77,36 @@ def forward(self, rois_feat):
 
 @register
 class XConvNormHead(nn.Layer):
+    __shared__ = ['norm_type', 'freeze_norm']
     """
     RCNN bbox head with serveral convolution layers
+
     Args:
-        in_dim(int): num of channels for the input rois_feat
-        num_convs(int): num of convolution layers for the rcnn bbox head
-        conv_dim(int): num of channels for the conv layers
-        mlp_dim(int): num of channels for the fc layers
-        resolution(int): resolution of the rois_feat
-        norm_type(str): norm type, 'gn' by defalut
-        freeze_norm(bool): whether to freeze the norm
-        stage_name(str): used in CascadeXConvNormHead, '' by default
+        in_channel (int): Input channels which can be derived by from_config
+        num_convs (int): The number of conv layers
+        conv_dim (int): The number of channels for the conv layers
+        out_channel (int): Output channels
+        resolution (int): Resolution of input feature map
+        norm_type (string): Norm type, bn, gn, sync_bn are available, 
+            default `gn`
+        freeze_norm (bool): Whether to freeze the norm
+        stage_name (string): Prefix name for conv layer,  '' by default
     """
-    __shared__ = ['norm_type', 'freeze_norm']
 
     def __init__(self,
-                 in_dim=256,
+                 in_channel=256,
                  num_convs=4,
                  conv_dim=256,
-                 mlp_dim=1024,
+                 out_channel=1024,
                  resolution=7,
                  norm_type='gn',
                  freeze_norm=False,
                  stage_name=''):
         super(XConvNormHead, self).__init__()
-        self.in_dim = in_dim
+        self.in_channel = in_channel
         self.num_convs = num_convs
         self.conv_dim = conv_dim
-        self.mlp_dim = mlp_dim
+        self.out_channel = out_channel
         self.norm_type = norm_type
         self.freeze_norm = freeze_norm
 
@@ -103,7 +114,7 @@ def __init__(self,
         fan = conv_dim * 3 * 3
         initializer = KaimingNormal(fan_in=fan)
         for i in range(self.num_convs):
-            in_c = in_dim if i == 0 else conv_dim
+            in_c = in_channel if i == 0 else conv_dim
             head_conv_name = stage_name + 'bbox_head_conv{}'.format(i)
             head_conv = self.add_sublayer(
                 head_conv_name,
@@ -122,7 +133,7 @@ def __init__(self,
         fan = conv_dim * resolution * resolution
         self.fc6 = nn.Linear(
             conv_dim * resolution * resolution,
-            mlp_dim,
+            out_channel,
             weight_attr=paddle.ParamAttr(
                 initializer=XavierUniform(fan_out=fan)),
             bias_attr=paddle.ParamAttr(
@@ -132,11 +143,11 @@ def __init__(self,
     def from_config(cls, cfg, input_shape):
         s = input_shape
         s = s[0] if isinstance(s, (list, tuple)) else s
-        return {'in_dim': s.channels}
+        return {'in_channel': s.channels}
 
     @property
     def out_shape(self):
-        return [ShapeSpec(channels=self.mlp_dim, )]
+        return [ShapeSpec(channels=self.out_channel, )]
 
     def forward(self, rois_feat):
         for i in range(self.num_convs):
@@ -151,14 +162,17 @@ class BBoxHead(nn.Layer):
     __shared__ = ['num_classes']
     __inject__ = ['bbox_assigner']
     """
-    head (nn.Layer): Extract feature in bbox head
-    in_channel (int): Input channel after RoI extractor
-    roi_extractor (object): The module of RoI Extractor
-    bbox_assigner (object): The module of Box Assigner, label and sample the 
-                            box.
-    with_pool (bool): Whether to use pooling for the RoI feature.
-    num_classes (int): The number of classes
-    bbox_weight (List[float]): The weight to get the decode box 
+    RCNN bbox head
+
+    Args:
+        head (nn.Layer): Extract feature in bbox head
+        in_channel (int): Input channel after RoI extractor
+        roi_extractor (object): The module of RoI Extractor
+        bbox_assigner (object): The module of Box Assigner, label and sample the 
+            box.
+        with_pool (bool): Whether to use pooling for the RoI feature.
+        num_classes (int): The number of classes
+        bbox_weight (List[float]): The weight to get the decode box 
     """
 
     def __init__(self,
diff --git a/ppdet/modeling/heads/cascade_head.py b/ppdet/modeling/heads/cascade_head.py
index 99c43c83e9023..0ee23c040ff51 100644
--- a/ppdet/modeling/heads/cascade_head.py
+++ b/ppdet/modeling/heads/cascade_head.py
@@ -32,32 +32,41 @@
 @register
 class CascadeTwoFCHead(nn.Layer):
     __shared__ = ['num_cascade_stage']
+    """
+    Cascade RCNN bbox head with Two fc layers to extract feature
+
+    Args:
+        in_channel (int): Input channel which can be derived by from_config
+        out_channel (int): Output channel
+        resolution (int): Resolution of input feature map, default 7
+        num_cascade_stage (int): The number of cascade stage, default 3
+    """
 
     def __init__(self,
-                 in_dim=256,
-                 mlp_dim=1024,
+                 in_channel=256,
+                 out_channel=1024,
                  resolution=7,
                  num_cascade_stage=3):
         super(CascadeTwoFCHead, self).__init__()
 
-        self.in_dim = in_dim
-        self.mlp_dim = mlp_dim
+        self.in_channel = in_channel
+        self.out_channel = out_channel
 
         self.head_list = []
         for stage in range(num_cascade_stage):
             head_per_stage = self.add_sublayer(
-                str(stage), TwoFCHead(in_dim, mlp_dim, resolution))
+                str(stage), TwoFCHead(in_channel, out_channel, resolution))
             self.head_list.append(head_per_stage)
 
     @classmethod
     def from_config(cls, cfg, input_shape):
         s = input_shape
         s = s[0] if isinstance(s, (list, tuple)) else s
-        return {'in_dim': s.channels}
+        return {'in_channel': s.channels}
 
     @property
     def out_shape(self):
-        return [ShapeSpec(channels=self.mlp_dim, )]
+        return [ShapeSpec(channels=self.out_channel, )]
 
     def forward(self, rois_feat, stage=0):
         out = self.head_list[stage](rois_feat)
@@ -67,29 +76,43 @@ def forward(self, rois_feat, stage=0):
 @register
 class CascadeXConvNormHead(nn.Layer):
     __shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage']
+    """
+    Cascade RCNN bbox head with several convolution layers
+
+    Args:
+        in_channel (int): Input channels which can be derived by from_config
+        num_convs (int): The number of conv layers
+        conv_dim (int): The number of channels for the conv layers
+        out_channel (int): Output channels
+        resolution (int): Resolution of input feature map
+        norm_type (string): Norm type, bn, gn, sync_bn are available, 
+            default `gn`
+        freeze_norm (bool): Whether to freeze the norm
+        num_cascade_stage (int): The number of cascade stage, default 3
+    """
 
     def __init__(self,
-                 in_dim=256,
+                 in_channel=256,
                  num_convs=4,
                  conv_dim=256,
-                 mlp_dim=1024,
+                 out_channel=1024,
                  resolution=7,
                  norm_type='gn',
                  freeze_norm=False,
                  num_cascade_stage=3):
         super(CascadeXConvNormHead, self).__init__()
-        self.in_dim = in_dim
-        self.mlp_dim = mlp_dim
+        self.in_channel = in_channel
+        self.out_channel = out_channel
 
         self.head_list = []
         for stage in range(num_cascade_stage):
             head_per_stage = self.add_sublayer(
                 str(stage),
                 XConvNormHead(
-                    in_dim,
+                    in_channel,
                     num_convs,
                     conv_dim,
-                    mlp_dim,
+                    out_channel,
                     resolution,
                     norm_type,
                     freeze_norm,
@@ -100,11 +123,11 @@ def __init__(self,
     def from_config(cls, cfg, input_shape):
         s = input_shape
         s = s[0] if isinstance(s, (list, tuple)) else s
-        return {'in_dim': s.channels}
+        return {'in_channel': s.channels}
 
     @property
     def out_shape(self):
-        return [ShapeSpec(channels=self.mlp_dim, )]
+        return [ShapeSpec(channels=self.out_channel, )]
 
     def forward(self, rois_feat, stage=0):
         out = self.head_list[stage](rois_feat)
@@ -116,16 +139,18 @@ class CascadeHead(BBoxHead):
     __shared__ = ['num_classes', 'num_cascade_stages']
     __inject__ = ['bbox_assigner']
     """
-    head (nn.Layer): Extract feature in bbox head
-    in_channel (int): Input channel after RoI extractor
-    roi_extractor (object): The module of RoI Extractor
-    bbox_assigner (object): The module of Box Assigner, label and sample the 
-                            box.
-    num_classes (int): The number of classes
-    bbox_weight (List[List[float]]): The weight to get the decode box and the 
-                                     length of weight is the number of cascade 
-                                     stage
-    num_cascade_stages (int): THe number of stage to refine the box
+    Cascade RCNN bbox head
+
+    Args:
+        head (nn.Layer): Extract feature in bbox head
+        in_channel (int): Input channel after RoI extractor
+        roi_extractor (object): The module of RoI Extractor
+        bbox_assigner (object): The module of Box Assigner, label and sample the 
+            box.
+        num_classes (int): The number of classes
+        bbox_weight (List[List[float]]): The weight to get the decode box and the 
+            length of weight is the number of cascade stage
+        num_cascade_stages (int): The number of stages to refine the box
     """
 
     def __init__(self,
diff --git a/ppdet/modeling/heads/mask_head.py b/ppdet/modeling/heads/mask_head.py
index dc624ff838e8b..eea70922a483e 100644
--- a/ppdet/modeling/heads/mask_head.py
+++ b/ppdet/modeling/heads/mask_head.py
@@ -27,18 +27,29 @@
 
 @register
 class MaskFeat(nn.Layer):
+    """
+    Feature extraction in Mask head
+
+    Args:
+        in_channel (int): Input channels
+        out_channel (int): Output channels
+        num_convs (int): The number of conv layers, default 4
+        norm_type (string | None): Norm type, bn, gn, sync_bn are available,
+            default None
+    """
+
     def __init__(self,
+                 in_channel=256,
+                 out_channel=256,
                  num_convs=4,
-                 in_channels=256,
-                 out_channels=256,
                  norm_type=None):
         super(MaskFeat, self).__init__()
         self.num_convs = num_convs
-        self.in_channels = in_channels
-        self.out_channels = out_channels
+        self.in_channel = in_channel
+        self.out_channel = out_channel
         self.norm_type = norm_type
-        fan_conv = out_channels * 3 * 3
-        fan_deconv = out_channels * 2 * 2
+        fan_conv = out_channel * 3 * 3
+        fan_deconv = out_channel * 2 * 2
 
         mask_conv = nn.Sequential()
         if norm_type == 'gn':
@@ -47,8 +58,8 @@ def __init__(self,
                 mask_conv.add_sublayer(
                     conv_name,
                     ConvNormLayer(
-                        ch_in=in_channels if i == 0 else out_channels,
-                        ch_out=out_channels,
+                        ch_in=in_channel if i == 0 else out_channel,
+                        ch_out=out_channel,
                         filter_size=3,
                         stride=1,
                         norm_type=self.norm_type,
@@ -62,8 +73,8 @@ def __init__(self,
                 mask_conv.add_sublayer(
                     conv_name,
                     nn.Conv2D(
-                        in_channels=in_channels if i == 0 else out_channels,
-                        out_channels=out_channels,
+                        in_channels=in_channel if i == 0 else out_channel,
+                        out_channels=out_channel,
                         kernel_size=3,
                         padding=1,
                         weight_attr=paddle.ParamAttr(
@@ -72,8 +83,8 @@ def __init__(self,
         mask_conv.add_sublayer(
             'conv5_mask',
             nn.Conv2DTranspose(
-                in_channels=self.in_channels,
-                out_channels=self.out_channels,
+                in_channels=self.in_channel,
+                out_channels=self.out_channel,
                 kernel_size=2,
                 stride=2,
                 weight_attr=paddle.ParamAttr(
@@ -85,10 +96,10 @@ def __init__(self,
     def from_config(cls, cfg, input_shape):
         if isinstance(input_shape, (list, tuple)):
             input_shape = input_shape[0]
-        return {'in_channels': input_shape.channels, }
+        return {'in_channel': input_shape.channels, }
 
-    def out_channel(self):
-        return self.out_channels
+    def out_channels(self):
+        return self.out_channel
 
     def forward(self, feats):
         return self.upsample(feats)
@@ -98,6 +109,18 @@ def forward(self, feats):
 class MaskHead(nn.Layer):
     __shared__ = ['num_classes']
     __inject__ = ['mask_assigner']
+    """
+    RCNN mask head
+
+    Args:
+        head (nn.Layer): Extract feature in mask head
+        roi_extractor (object): The module of RoI Extractor
+        mask_assigner (object): The module of Mask Assigner, 
+            label and sample the mask
+        num_classes (int): The number of classes
+        share_bbox_feat (bool): Whether to share the feature from bbox head,
+            default false
+    """
 
     def __init__(self,
                  head,
@@ -112,7 +135,7 @@ def __init__(self,
         if isinstance(roi_extractor, dict):
             self.roi_extractor = RoIAlign(**roi_extractor)
         self.head = head
-        self.in_channels = head.out_channel()
+        self.in_channels = head.out_channels()
         self.mask_assigner = mask_assigner
         self.share_bbox_feat = share_bbox_feat
         self.bbox_head = None
@@ -159,7 +182,6 @@ def forward_train(self, body_feats, rois, rois_num, inputs, targets,
         rois_num (Tensor): The number of proposals for each batch
         inputs (dict): ground truth info
         """
-        #assert self.bbox_head
         tgt_labels, _, tgt_gt_inds = targets
         rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner(
             rois, tgt_labels, tgt_gt_inds, inputs)
diff --git a/ppdet/modeling/heads/roi_extractor.py b/ppdet/modeling/heads/roi_extractor.py
index 1e2f658a7b1a6..35c3924e36c60 100644
--- a/ppdet/modeling/heads/roi_extractor.py
+++ b/ppdet/modeling/heads/roi_extractor.py
@@ -25,6 +25,31 @@ def _to_list(v):
 
 @register
 class RoIAlign(object):
+    """
+    RoI Align module
+
+    For more details, please refer to the documentation of roi_align
+    in ppdet/modeling/ops.py
+
+    Args:
+        resolution (int): The output size, default 14
+        spatial_scale (float): Multiplicative spatial scale factor to translate
+            ROI coords from their input scale to the scale used when pooling.
+            default 0.0625
+        sampling_ratio (int): The number of sampling points in the interpolation
+            grid, default 0
+        canconical_level (int): The referring level of FPN layer with 
+            specified level. default 4
+        canonical_size (int): The referring scale of FPN layer with 
+            specified scale. default 224
+        start_level (int): The start level of FPN layer to extract RoI feature,
+            default 0
+        end_level (int): The end level of FPN layer to extract RoI feature,
+            default 3
+        aligned (bool): Whether to add offset to rois' coord in roi_align.
+            default false
+    """
+
     def __init__(self,
                  resolution=14,
                  spatial_scale=0.0625,
diff --git a/ppdet/modeling/heads/rpn_head.py b/ppdet/modeling/heads/rpn_head.py
deleted file mode 100644
index 64f7acc495326..0000000000000
--- a/ppdet/modeling/heads/rpn_head.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-#   
-# Licensed under the Apache License, Version 2.0 (the "License");   
-# you may not use this file except in compliance with the License.  
-# You may obtain a copy of the License at   
-#   
-#     http://www.apache.org/licenses/LICENSE-2.0    
-#   
-# Unless required by applicable law or agreed to in writing, software   
-# distributed under the License is distributed on an "AS IS" BASIS, 
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
-# See the License for the specific language governing permissions and   
-# limitations under the License.
-
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from paddle import ParamAttr
-from paddle.nn.initializer import Normal
-from paddle.regularizer import L2Decay
-from paddle.nn import Conv2D
-
-from ppdet.core.workspace import register
-from ppdet.modeling import ops
-
-
-@register
-class RPNFeat(nn.Layer):
-    def __init__(self, feat_in=1024, feat_out=1024):
-        super(RPNFeat, self).__init__()
-        # rpn feat is shared with each level
-        self.rpn_conv = Conv2D(
-            in_channels=feat_in,
-            out_channels=feat_out,
-            kernel_size=3,
-            padding=1,
-            weight_attr=ParamAttr(initializer=Normal(
-                mean=0., std=0.01)),
-            bias_attr=ParamAttr(
-                learning_rate=2., regularizer=L2Decay(0.)))
-
-    def forward(self, inputs, feats):
-        rpn_feats = []
-        for feat in feats:
-            rpn_feats.append(F.relu(self.rpn_conv(feat)))
-        return rpn_feats
-
-
-@register
-class RPNHead(nn.Layer):
-    __inject__ = ['rpn_feat']
-
-    def __init__(self, rpn_feat, anchor_per_position=15, rpn_channel=1024):
-        super(RPNHead, self).__init__()
-        self.rpn_feat = rpn_feat
-        if isinstance(rpn_feat, dict):
-            self.rpn_feat = RPNFeat(**rpn_feat)
-        # rpn head is shared with each level
-        # rpn roi classification scores
-        self.rpn_rois_score = Conv2D(
-            in_channels=rpn_channel,
-            out_channels=anchor_per_position,
-            kernel_size=1,
-            padding=0,
-            weight_attr=ParamAttr(initializer=Normal(
-                mean=0., std=0.01)),
-            bias_attr=ParamAttr(
-                learning_rate=2., regularizer=L2Decay(0.)))
-
-        # rpn roi bbox regression deltas
-        self.rpn_rois_delta = Conv2D(
-            in_channels=rpn_channel,
-            out_channels=4 * anchor_per_position,
-            kernel_size=1,
-            padding=0,
-            weight_attr=ParamAttr(initializer=Normal(
-                mean=0., std=0.01)),
-            bias_attr=ParamAttr(
-                learning_rate=2., regularizer=L2Decay(0.)))
-
-    def forward(self, inputs, feats):
-        rpn_feats = self.rpn_feat(inputs, feats)
-        rpn_head_out = []
-        for rpn_feat in rpn_feats:
-            rrs = self.rpn_rois_score(rpn_feat)
-            rrd = self.rpn_rois_delta(rpn_feat)
-            rpn_head_out.append((rrs, rrd))
-        return rpn_feats, rpn_head_out
-
-    def get_loss(self, loss_inputs):
-        # cls loss
-        score_tgt = paddle.cast(
-            x=loss_inputs['rpn_score_target'], dtype='float32')
-        score_tgt.stop_gradient = True
-        loss_rpn_cls = ops.sigmoid_cross_entropy_with_logits(
-            input=loss_inputs['rpn_score_pred'], label=score_tgt)
-        loss_rpn_cls = paddle.mean(loss_rpn_cls, name='loss_rpn_cls')
-
-        # reg loss
-        loc_tgt = paddle.cast(x=loss_inputs['rpn_rois_target'], dtype='float32')
-        loc_tgt.stop_gradient = True
-        loss_rpn_reg = ops.smooth_l1(
-            input=loss_inputs['rpn_rois_pred'],
-            label=loc_tgt,
-            inside_weight=loss_inputs['rpn_rois_weight'],
-            outside_weight=loss_inputs['rpn_rois_weight'],
-            sigma=3.0, )
-        loss_rpn_reg = paddle.sum(loss_rpn_reg)
-        score_shape = paddle.shape(score_tgt)
-        score_shape = paddle.cast(score_shape, dtype='float32')
-        norm = paddle.prod(score_shape)
-        norm.stop_gradient = True
-        loss_rpn_reg = loss_rpn_reg / norm
-
-        return {'loss_rpn_cls': loss_rpn_cls, 'loss_rpn_reg': loss_rpn_reg}
diff --git a/ppdet/modeling/necks/fpn.py b/ppdet/modeling/necks/fpn.py
index 85767bb105dd4..0b9f6a798bdc0 100644
--- a/ppdet/modeling/necks/fpn.py
+++ b/ppdet/modeling/necks/fpn.py
@@ -29,6 +29,34 @@
 @register
 @serializable
 class FPN(nn.Layer):
+    """
+    Feature Pyramid Network, see https://arxiv.org/abs/1612.03144
+
+    Args:
+        in_channels (list[int]): input channels of each level which can be 
+            derived from the output shape of backbone by from_config
+        out_channel (int): output channel of each level
+        spatial_scales (list[float]): the spatial scales between input feature
+            maps and original input image which can be derived from the output 
+            shape of backbone by from_config
+        has_extra_convs (bool): whether to add extra conv to the last level.
+            default False
+        extra_stage (int): the number of extra stages added to the last level.
+            default 1
+        use_c5 (bool): Whether to use c5 as the input of extra stage, 
+            otherwise p5 is used. default True
+        norm_type (string|None): The normalization type in FPN module. If 
+            norm_type is None, norm will not be used after conv and if 
+            norm_type is string, bn, gn, sync_bn are available. default None
+        norm_decay (float): weight decay for normalization layer weights.
+            default 0.
+        freeze_norm (bool): whether to freeze normalization layer.  
+            default False
+        relu_before_extra_convs (bool): whether to add relu before extra convs.
+            default False
+        
+    """
+
     def __init__(self,
                  in_channels,
                  out_channel,
@@ -67,7 +95,7 @@ def __init__(self,
             else:
                 lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2)
             in_c = in_channels[i - st_stage]
-            if self.norm_type == 'gn':
+            if self.norm_type is not None:
                 lateral = self.add_sublayer(
                     lateral_name,
                     ConvNormLayer(
@@ -93,7 +121,7 @@ def __init__(self,
             self.lateral_convs.append(lateral)
 
             fpn_name = 'fpn_res{}_sum'.format(i + 2)
-            if self.norm_type == 'gn':
+            if self.norm_type is not None:
                 fpn_conv = self.add_sublayer(
                     fpn_name,
                     ConvNormLayer(
@@ -128,7 +156,7 @@ def __init__(self,
                 else:
                     in_c = out_channel
                 extra_fpn_name = 'fpn_{}'.format(lvl + 2)
-                if self.norm_type == 'gn':
+                if self.norm_type is not None:
                     extra_fpn_conv = self.add_sublayer(
                         extra_fpn_name,
                         ConvNormLayer(
diff --git a/ppdet/modeling/proposal_generator/anchor_generator.py b/ppdet/modeling/proposal_generator/anchor_generator.py
index 1ca0319d3ad13..8088ffa04affa 100644
--- a/ppdet/modeling/proposal_generator/anchor_generator.py
+++ b/ppdet/modeling/proposal_generator/anchor_generator.py
@@ -25,6 +25,24 @@
 
 @register
 class AnchorGenerator(nn.Layer):
+    """
+    Generate anchors according to the feature maps
+
+    Args:
+        anchor_sizes (list[float] | list[list[float]]): The anchor sizes at 
+            each feature point. list[float] means all feature levels share the 
+            same sizes. list[list[float]] means the anchor sizes for 
+            each level. The sizes stand for the scale of input size.
+        aspect_ratios (list[float] | list[list[float]]): The aspect ratios at
+            each feature point. list[float] means all feature levels share the
+            same ratios. list[list[float]] means the aspect ratios for
+            each level.
+        strides (list[float]): The strides of feature maps which generate 
+            anchors
+        offset (float): The offset of the coordinate of anchors, default 0.
+        
+    """
+
     def __init__(self,
                  anchor_sizes=[32, 64, 128, 256, 512],
                  aspect_ratios=[0.5, 1.0, 2.0],
diff --git a/ppdet/modeling/proposal_generator/proposal_generator.py b/ppdet/modeling/proposal_generator/proposal_generator.py
index 8a5df53255d08..12518e4881723 100644
--- a/ppdet/modeling/proposal_generator/proposal_generator.py
+++ b/ppdet/modeling/proposal_generator/proposal_generator.py
@@ -25,6 +25,28 @@
 @register
 @serializable
 class ProposalGenerator(object):
+    """
+    Proposal generation module
+
+    For more details, please refer to the document of generate_proposals 
+    in ppdet/modeling/ops.py
+
+    Args:
+        pre_nms_top_n (int): Number of total bboxes to be kept per
+            image before NMS. default 12000
+        post_nms_top_n (int): Number of total bboxes to be kept per
+            image after NMS. default 2000
+        nms_thresh (float): Threshold in NMS. default 0.5
+        min_size (float): Remove predicted boxes with either height or
+             width < min_size. default 0.1
+        eta (float): Applied in adaptive NMS: if `adaptive_threshold > 0.5`,
+             `adaptive_threshold = adaptive_threshold * eta` in each iteration.
+             default 1.
+        topk_after_collect (bool): whether to adopt topk after batch 
+             collection. If topk_after_collect is true, box filter will not be 
+             used after NMS at each image in proposal generation. default false
+    """
+
     def __init__(self,
                  pre_nms_top_n=12000,
                  post_nms_top_n=2000,
diff --git a/ppdet/modeling/proposal_generator/rpn_head.py b/ppdet/modeling/proposal_generator/rpn_head.py
index 6a1c980a452f0..2b1e6c77b7cb3 100644
--- a/ppdet/modeling/proposal_generator/rpn_head.py
+++ b/ppdet/modeling/proposal_generator/rpn_head.py
@@ -27,12 +27,20 @@
 
 
 class RPNFeat(nn.Layer):
-    def __init__(self, feat_in=1024, feat_out=1024):
+    """
+    Feature extraction in RPN head
+
+    Args:
+        in_channel (int): Input channel
+        out_channel (int): Output channel
+    """
+
+    def __init__(self, in_channel=1024, out_channel=1024):
         super(RPNFeat, self).__init__()
         # rpn feat is shared with each level
         self.rpn_conv = nn.Conv2D(
-            in_channels=feat_in,
-            out_channels=feat_out,
+            in_channels=in_channel,
+            out_channels=out_channel,
             kernel_size=3,
             padding=1,
             weight_attr=paddle.ParamAttr(initializer=Normal(
@@ -47,6 +55,20 @@ def forward(self, feats):
 
 @register
 class RPNHead(nn.Layer):
+    """
+    Region Proposal Network
+
+    Args:
+        anchor_generator (dict): configuration of anchor generation
+        rpn_target_assign (dict): configuration of rpn targets assignment
+        train_proposal (dict): configuration of proposals generation
+            at the stage of training
+        test_proposal (dict): configuration of proposals generation
+            at the stage of prediction
+        in_channel (int): channel of input feature maps which can be 
+            derived by from_config
+    """
+
     def __init__(self,
                  anchor_generator=AnchorGenerator().__dict__,
                  rpn_target_assign=RPNTargetAssign().__dict__,
diff --git a/ppdet/modeling/proposal_generator/target.py b/ppdet/modeling/proposal_generator/target.py
index b4d490a52c4b5..a783bbdb58586 100644
--- a/ppdet/modeling/proposal_generator/target.py
+++ b/ppdet/modeling/proposal_generator/target.py
@@ -135,12 +135,15 @@ def generate_proposal_target(rpn_rois,
     tgt_gt_inds = []
     new_rois_num = []
 
+    # In cascade rcnn, both the foreground and background thresholds
+    # are taken from cascade_iou
     fg_thresh = cascade_iou if is_cascade else fg_thresh
     bg_thresh = cascade_iou if is_cascade else bg_thresh
     for i, rpn_roi in enumerate(rpn_rois):
         gt_bbox = gt_boxes[i]
         gt_class = paddle.squeeze(gt_classes[i], axis=-1)
 
+        # Concat RoIs and gt boxes, except in cascade rcnn
         if not is_cascade:
             bbox = paddle.concat([rpn_roi, gt_bbox])
         else:
@@ -247,10 +250,12 @@ def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds,
     tgt_weights = []
     for k in range(len(rois)):
         labels_per_im = labels_int32[k]
+        # select rois labeled with foreground
         fg_inds = paddle.nonzero(
             paddle.logical_and(labels_per_im != -1, labels_per_im !=
                                num_classes))
         has_fg = True
+        # generate fake roi if foreground is empty
         if fg_inds.numel() == 0:
             has_fg = False
             fg_inds = paddle.ones([1], dtype='int32')
@@ -259,6 +264,8 @@ def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds,
 
         rois_per_im = rois[k]
         fg_rois = paddle.gather(rois_per_im, fg_inds)
+        # Copy the foreground roi to cpu
+        # to generate mask target with ground-truth
         boxes = fg_rois.numpy()
         gt_segms_per_im = gt_segms[k]
         new_segm = []
diff --git a/ppdet/modeling/proposal_generator/target_layer.py b/ppdet/modeling/proposal_generator/target_layer.py
index 1087638b9300b..6ad82dad156a7 100644
--- a/ppdet/modeling/proposal_generator/target_layer.py
+++ b/ppdet/modeling/proposal_generator/target_layer.py
@@ -22,6 +22,32 @@
 @register
 @serializable
 class RPNTargetAssign(object):
+    """
+    RPN targets assignment module
+
+    The assignment consists of three steps:
+        1. Match anchor and ground-truth box, label the anchor with foreground
+           or background sample
+        2. Sample anchors to keep the proper ratio between foreground and
+           background
+        3. Generate the targets for classification and regression branch
+
+
+    Args:
+        batch_size_per_im (int): Total number of RPN samples per image. 
+            default 256
+        fg_fraction (float): Fraction of anchors that is labeled
+            foreground, default 0.5
+        positive_overlap (float): Minimum overlap required between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be 
+            a foreground sample. default 0.7
+        negative_overlap (float): Maximum overlap allowed between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be 
+            a background sample. default 0.3
+        use_random (bool): Use random sampling to choose foreground and 
+            background boxes, default true.
+    """
+
     def __init__(self,
                  batch_size_per_im=256,
                  fg_fraction=0.5,
@@ -54,6 +80,33 @@ def __call__(self, inputs, anchors):
 @register
 class BBoxAssigner(object):
     __shared__ = ['num_classes']
+    """
+    RCNN targets assignment module
+
+    The assignment consists of three steps:
+        1. Match RoIs and ground-truth box, label the RoIs with foreground
+           or background sample
+        2. Sample RoIs to keep the proper ratio between foreground and
+           background
+        3. Generate the targets for classification and regression branch
+
+    Args:
+        batch_size_per_im (int): Total number of RoIs per image. 
+            default 512 
+        fg_fraction (float): Fraction of RoIs that is labeled
+            foreground, default 0.25
+        fg_thresh (float): Minimum overlap required between a RoI
+            and ground-truth box for the (roi, gt box) pair to be
+            a foreground sample. default 0.5
+        bg_thresh (float): Maximum overlap allowed between a RoI
+            and ground-truth box for the (roi, gt box) pair to be
+            a background sample. default 0.5
+        use_random (bool): Use random sampling to choose foreground and 
+            background boxes, default true
+        cascade_iou (list[float]): The list of overlap to select foreground and
+            background of each stage, which is only used in Cascade RCNN.
+        num_classes (int): The number of classes.
+    """
 
     def __init__(self,
                  batch_size_per_im=512,
@@ -61,7 +114,6 @@ def __init__(self,
                  fg_thresh=.5,
                  bg_thresh=.5,
                  use_random=True,
-                 is_cls_agnostic=False,
                  cascade_iou=[0.5, 0.6, 0.7],
                  num_classes=80):
         super(BBoxAssigner, self).__init__()
@@ -70,7 +122,6 @@ def __init__(self,
         self.fg_thresh = fg_thresh
         self.bg_thresh = bg_thresh
         self.use_random = use_random
-        self.is_cls_agnostic = is_cls_agnostic
         self.cascade_iou = cascade_iou
         self.num_classes = num_classes
 
@@ -99,6 +150,18 @@ def __call__(self,
 @serializable
 class MaskAssigner(object):
     __shared__ = ['num_classes', 'mask_resolution']
+    """
+    Mask targets assignment module
+
+    The assignment consists of two steps:
+        1. Select RoIs labeled as foreground.
+        2. Encode the RoIs and corresponding gt polygons to generate 
+           mask target
+
+    Args:
+        num_classes (int): The number of classes
+        mask_resolution (int): The resolution of mask target, default 14
+    """
 
     def __init__(self, num_classes=80, mask_resolution=14):
         super(MaskAssigner, self).__init__()