# configs/unihead/two-stage-det-unihead-resnet50-coco-1x.yaml

# Number of classes for the bbox head; reused further down via *num_classes.
# NOTE(review): 81 presumably counts the background class as well — confirm.
num_classes: &num_classes 81
# Runtime settings. task_names is set to `det` (presumably the detection task — confirm).
runtime:
  task_names: det
# Flip augmentation; referenced through *flip in the training transformer list.
flip: &flip
  type: flip
  kwargs:
    flip_p: 0.5  # presumably the probability of applying the flip — confirm

# Resize used by the training pipeline (anchor: train_resize).
# NOTE(review): `keep_ar_resize` presumably preserves the aspect ratio — confirm.
resize: &train_resize
  type: keep_ar_resize
  kwargs:
    scales:
      - 800
    max_size: 1333
    separate_wh: true

# Resize used by the test pipeline (anchor: test_resize); same values as the
# training resize.
test_resize: &test_resize
  type: keep_ar_resize
  kwargs:
    scales:
      - 800
    max_size: 1333
    separate_wh: true

# Tensor-conversion transform; referenced through *to_tensor by both the train
# and test transformer lists.
to_tensor: &to_tensor
  type: to_tensor

# Normalization transform; mean/std are the ImageNet pretrained statistics.
normalize: &normalize
  type: normalize
  kwargs:
    mean:
      - 0.485
      - 0.456
      - 0.406
    std:
      - 0.229
      - 0.224
      - 0.225

dataset:  # Required.
  # Training split: COCO train2017 annotations and images.
  train:
    dataset:
      type: coco
      kwargs:
        source: train  # dataset id
        meta_file: data/coco/annotations/instances_train2017.json
        image_reader:
          type: fs_opencv
          kwargs:
            image_dir: data/coco/train2017
            color_mode: RGB
        # Transforms, listed in order: flip, resize, to_tensor, normalize.
        transformer:
          - *flip
          - *train_resize
          - *to_tensor
          - *normalize
  # Test split: COCO val2017 annotations and images (no flip transform).
  test:
    dataset:
      type: coco
      kwargs:
        source: val
        meta_file: data/coco/annotations/instances_val2017.json
        image_reader:
          type: fs_opencv
          kwargs:
            image_dir: data/coco/val2017
            color_mode: RGB
        transformer:
          - *test_resize
          - *to_tensor
          - *normalize
        evaluator:
          type: COCO  # choices = {'COCO', 'VOC', 'MR'}
          kwargs:
            gt_file: data/coco/annotations/instances_val2017.json
            iou_types:
              - bbox
  batch_sampler:
    type: aspect_ratio_group
    kwargs:
      sampler:
        type: dist
        kwargs: {}
      batch_size: 4
      aspect_grouping:
        - 1
  dataloader:
    type: base
    kwargs:
      num_workers: 4
      alignment: 32


trainer:  # Required.
  max_epoch: 13  # total epochs for the training
  test_freq: 1
  save_freq: 13
  # optimizer = AdamW(params, lr=6.25e-06, betas=(0.9, 0.999), weight_decay=0.05)
  optimizer:
    type: AdamW
    kwargs:
      lr: 6.25e-06
      betas:
        - 0.9
        - 0.999
      weight_decay: 0.05
  # lr_scheduler = MultiStepLR(optimizer, milestones=[9, 12], gamma=0.1)
  lr_scheduler:
    # Set warmup_iter to 0 to disable warmup.
    # With warmup, target_lr = init_lr * total_batch_size.
    warmup_iter: 500
    type: MultiStepLR
    kwargs:
      milestones:  # epochs to decay lr
        - 9
        - 12
      gamma: 0.1  # decay rate

# Checkpoint / result output locations and resume policy.
saver: # Required.
  save_dir: checkpoints    # dir to save checkpoints
  # Pretrained weights to initialize from (ResNet-50 ImageNet checkpoint, judging by the path).
  pretrain_model: pretrain/imagenet/resnet50-19c8e357.pth
  results_dir: results_dir  # dir to save detection results, e.g., bboxes, masks, keypoints
  auto_resume: true # find the last checkpoint in save_dir and resume from it automatically
                     # this option has the highest priority (auto_resume > opts > resume_model > pretrain_model)


# Hooks enabled for this run: automatic checkpointing and train/val logging.
hooks:
  - type: auto_checkpoint
  - type: train_val_logger
    kwargs:
      freq: 50        # log printing frequency
      skip_first_k: 5   # ignore the elapsed time of the first 5 iterations
      logdir: ./default/log       # tensorboard log path
      summary_writer: tensorboard

net:
# backbone = resnet50(frozen_layers, out_layers, out_strides)
- name: backbone
  type: resnet50
  kwargs:
    # layer0...1 is fixed
    frozen_layers:
      - 0
      - 1
    # layer1...4, commonly named Conv2...5
    out_layers:
      - 1
      - 2
      - 3
      - 4
    # strides of the output features
    out_strides:
      - 4
      - 8
      - 16
      - 32
    normalize:
      type: freeze_bn
    initializer:
      method: msra
# FPN neck fed by the backbone outputs.
- name: neck
  prev: backbone
  type: FPN
  kwargs:
    outplanes: 256
    start_level: 2
    # if num_level > len(backbone.out_layers), additional conv will be stacked.
    num_level: 5
    # strides of output features, a.k.a. anchor strides for roi_head
    out_strides:
      - 4
      - 8
      - 16
      - 32
      - 64
    # method to downsample: for FPN it's pool, for RetinaNet it's conv
    downsample: pool
    # method to interpolate: nearest or bilinear
    upsample: nearest
    initializer:
      method: xavier
# RPN head operating on the neck (FPN) outputs.
- name: roi_head
  prev: neck
  type: NaiveRPN
  kwargs:
    feat_planes: 256      # channels of intermediate conv
    num_classes: 2        # number of classes including background. for rpn, it's 2; for RetinaNet, it's 81
    num_anchors: 3
    class_activation: sigmoid
    num_level: 5
    initializer:
      method: normal
      std: 0.01
# RPN post-processing: losses, anchor generation, supervision, and proposal
# prediction settings for train and test.
- name: rpn_post_process
  prev: roi_head
  type: rpn_post
  kwargs:
    # number of classes including background. for rpn, it's 2; for RetinaNet, it's 81
    num_classes: 2
    cfg:
      cls_loss:
        type: sigmoid_cross_entropy
      loc_loss:
        type: l1_loss
      anchor_generator:
        type: hand_craft
        kwargs:
          # anchor strides are provided as feature strides by the feature extractor
          anchor_ratios:
            - 0.5
            - 1
            - 2
          # scale of anchors relative to the feature map
          anchor_scales:
            - 8
      roi_supervisor:
        type: rpn
        kwargs:
          allowed_border: -1
          matcher:
            type: max_iou
            kwargs:
              positive_iou_thresh: 0.7
              negative_iou_thresh: 0.3
              ignore_iou_thresh: 0.5
              allow_low_quality_match: true
              # !this option is not supported yet, but there is a future plan for it
              low_quality_thresh: 0.3
          sampler:
            type: naive_a100  # support A100 training
            kwargs:
              batch_size: 256
              positive_percent: 0.5
      # Proposal prediction settings used during training.
      train:
        roi_predictor:
          type: base
          kwargs:
            pre_nms_score_thresh: 0.0
            pre_nms_top_n: 2000
            post_nms_top_n: 1000
            roi_min_size: 0
            nms:
              type: naive
              nms_iou_thresh: 0.7
            merger:
              type: rpn
              kwargs:
                top_n: 1000
      # Proposal prediction settings used during testing (smaller pre_nms_top_n).
      test:
        roi_predictor:
          type: base
          kwargs:
            pre_nms_score_thresh: 0.0
            pre_nms_top_n: 1000
            post_nms_top_n: 1000
            roi_min_size: 0
            nms:
              type: naive
              nms_iou_thresh: 0.7
            merger:
              type: rpn
              kwargs:
                top_n: 1000
# Second-stage bbox head (UniBboxFC) operating on the neck (FPN) outputs.
# Booleans are written in canonical lowercase form, matching the rest of the file.
- name: bbox_head
  prev: neck
  type: UniBboxFC
  kwargs:
    feat_planes: 1024
    num_classes: *num_classes
    num_point: &num_point 16  # reused below as bbox_predictor.num_loc_point
    offset_bias_val: 3.0
    cls_block_num: 2
    loc_block_num: 3
    dropout: 0.1
    initializer:
      method: msra
    cfg:
      cls_loss:
        type: softmax_cross_entropy
        kwargs:
          class_dim: -1  # last dim is the class dim
      centerness_loss:
        type: sigmoid_cross_entropy
        kwargs:
          loss_weight: 1.0
      loc_loss_iou:
        type: iou_loss
        kwargs:
          loss_type: giou
          loss_weight: 1.0
      fpn:
        # indices of fpn features used for this stage; these levels are supposed to be continuous
        fpn_levels: [0, 1, 2, 3]
        # target level of a RoI is floor(log2((w*h)**0.5/base_scale))
        base_scale: 56
      roipooling:
        # choices=['roialignpool', 'psroipool', 'roipool', 'psroimaskpool'].
        # note that 'psroipool' is for RFCN head
        method: 'roialignpool'
        pool_size: 7
        # number of sampling points in each bin. 0 means densely sampled
        sampling_ratio: 0
      bbox_normalize: &bbox_norm  # not used in UniHead
        means: [0, 0, 0, 0]
        stds: [0.1, 0.1, 0.2, 0.2]
      # whether the bbox regression location is shared across all classes
      share_location: &share_location false
      bbox_supervisor:
        type: uni_bbox_det
        kwargs:
          matcher:
            type: max_iou
            kwargs:
              ignore_iou_thresh: 0.5  # Required if ignore_regions are provided
              positive_iou_thresh: 0.5
              negative_iou_thresh: 0.5
              # positive if an anchor has the highest iou with any gt
              allow_low_quality_match: false
              low_quality_thresh: 0.5
          sampler:
            type: naive_a100
            kwargs:
              batch_size: 512
              positive_percent: 0.25
      bbox_predictor:
        type: uni_bbox_det
        kwargs:
          bbox_normalize: *bbox_norm
          share_location: *share_location
          num_loc_point: *num_point
          nms:
            type: naive
            nms_iou_thresh: 0.5
          bbox_score_thresh: 0.0
          top_n: 100