diff --git a/ABSTRACTIONS.md b/ABSTRACTIONS.md index 36947bc59..7428de677 100644 --- a/ABSTRACTIONS.md +++ b/ABSTRACTIONS.md @@ -31,7 +31,7 @@ a specific image, as well as the size of the image as a `(width, height)` tuple. It also contains a set of methods that allow to perform geometric transformations to the bounding boxes (such as cropping, scaling and flipping). The class accepts bounding boxes from two different input formats: -- `xyxy`, where each box is encoded as a `x1`, `y1`, `x2` and `y2` coordinates) +- `xyxy`, where each box is encoded as a `x1`, `y1`, `x2` and `y2` coordinates, and - `xywh`, where each box is encoded as `x1`, `y1`, `w` and `h`. Additionally, each `BoxList` instance can also hold arbitrary additional information @@ -39,7 +39,7 @@ for each bounding box, such as labels, visibility, probability scores etc. Here is an example on how to create a `BoxList` from a list of coordinates: ```python -from maskrcnn_baseline.structures.bounding_box import BoxList, FLIP_LEFT_RIGHT +from maskrcnn_benchmark.structures.bounding_box import BoxList, FLIP_LEFT_RIGHT width = 100 height = 200 @@ -49,7 +49,7 @@ boxes = [ [10, 10, 50, 50] ] # create a BoxList with 3 boxes -bbox = BoxList(boxes, size=(width, height), mode='xyxy') +bbox = BoxList(boxes, image_size=(width, height), mode='xyxy') # perform some box transformations, has similar API as PIL.Image bbox_scaled = bbox.resize((width * 2, height * 3)) diff --git a/configs/retina/retinanet_R-101-FPN_1x.yaml b/configs/retina/retinanet_R-101-FPN_1x.yaml new file mode 100644 index 000000000..b2f78d22f --- /dev/null +++ b/configs/retina/retinanet_R-101-FPN_1x.yaml @@ -0,0 +1,48 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800, ) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + + diff --git a/configs/retina/retinanet_R-50-FPN_1x.yaml b/configs/retina/retinanet_R-50-FPN_1x.yaml new file mode 100644 index 000000000..b851a6c99 --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x.yaml @@ -0,0 +1,46 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + 
PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 diff --git a/configs/retina/retinanet_R-50-FPN_1x_adjust_std011.yaml b/configs/retina/retinanet_R-50-FPN_1x_adjust_std011.yaml new file mode 100644 index 000000000..edfb95ec8 --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_adjust_std011.yaml @@ -0,0 +1,47 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_R-50-FPN_1x_adjust_std100.yaml b/configs/retina/retinanet_R-50-FPN_1x_adjust_std100.yaml new file mode 100644 index 000000000..c591e9ceb --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_adjust_std100.yaml @@ -0,0 +1,48 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + BBOX_REG_BETA: 1.0 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_R-50-FPN_1x_adjustl1.yaml b/configs/retina/retinanet_R-50-FPN_1x_adjustl1.yaml new file mode 100644 index 000000000..edfb95ec8 --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_adjustl1.yaml @@ -0,0 +1,47 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 
+ RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_R-50-FPN_1x_beta100.yaml b/configs/retina/retinanet_R-50-FPN_1x_beta100.yaml new file mode 100644 index 000000000..e61c0ef07 --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_beta100.yaml @@ -0,0 +1,48 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + BBOX_REG_BETA: 1.0 + SELFADJUST_SMOOTH_L1: False diff --git a/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.2.yaml b/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.2.yaml new file mode 100644 index 000000000..e30ac293e --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.2.yaml @@ -0,0 +1,47 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + 
STRADDLE_THRESH: -1 + LOW_QUALITY_THRESHOLD: 0.4 diff --git a/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.3.yaml b/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.3.yaml new file mode 100644 index 000000000..e30ac293e --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.3.yaml @@ -0,0 +1,47 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + LOW_QUALITY_THRESHOLD: 0.4 diff --git a/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.4.yaml b/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.4.yaml new file mode 100644 index 000000000..e30ac293e --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_low_quality_0.4.yaml @@ -0,0 +1,47 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + LOW_QUALITY_THRESHOLD: 0.4 diff --git a/configs/retina/retinanet_R-50-FPN_1x_no_low_quality.yaml b/configs/retina/retinanet_R-50-FPN_1x_no_low_quality.yaml new file mode 100644 index 000000000..4ad4a2c8d --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_no_low_quality.yaml @@ -0,0 +1,47 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + 
POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + LOW_QUALITY_MATCHES: False diff --git a/configs/retina/retinanet_R-50-FPN_1x_no_low_quality_adjustl1.yaml b/configs/retina/retinanet_R-50-FPN_1x_no_low_quality_adjustl1.yaml new file mode 100644 index 000000000..d6315e31e --- /dev/null +++ b/configs/retina/retinanet_R-50-FPN_1x_no_low_quality_adjustl1.yaml @@ -0,0 +1,48 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +RETINANET: + RETINANET_ON: True + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + LOW_QUALITY_MATCHES: False + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_400.yaml b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_400.yaml new file mode 100644 index 000000000..37fae57e8 --- /dev/null +++ b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_400.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (400,) + MAX_SIZE_TRAIN: 667 + MIN_SIZE_TEST: 400 + MAX_SIZE_TEST: 667 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: 
True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_500.yaml b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_500.yaml new file mode 100644 index 000000000..a34b4cd40 --- /dev/null +++ b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_500.yaml @@ -0,0 +1,61 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + #TEST: ("coco_test-dev",) +INPUT: + MIN_SIZE_TRAIN: (500,) + MAX_SIZE_TRAIN: 833 + MIN_SIZE_TEST: 500 + MAX_SIZE_TEST: 833 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_600.yaml b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_600.yaml new file mode 100644 index 000000000..f43959133 --- /dev/null +++ b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_600.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (600,) + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 600 + MAX_SIZE_TEST: 1000 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + 
BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_700.yaml b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_700.yaml new file mode 100644 index 000000000..98dd797d1 --- /dev/null +++ b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_700.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (700,) + MAX_SIZE_TRAIN: 1167 + MIN_SIZE_TEST: 700 + MAX_SIZE_TEST: 1167 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.0025 + WEIGHT_DECAY: 0.0001 + STEPS: (360000, 480000) + MAX_ITER: 540000 + IMS_PER_BATCH: 4 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_800.yaml b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_800.yaml new file mode 100644 index 000000000..f582b97f4 --- /dev/null +++ b/configs/retina/retinanet_mask_R-101-FPN_1.5x_adjust_std011_800.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.0025 + WEIGHT_DECAY: 0.0001 + STEPS: (360000, 480000) + MAX_ITER: 540000 + IMS_PER_BATCH: 4 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + 
SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-101-FPN_2x_adjust_std011_ms.yaml b/configs/retina/retinanet_mask_R-101-FPN_2x_adjust_std011_ms.yaml new file mode 100644 index 000000000..3efd23feb --- /dev/null +++ b/configs/retina/retinanet_mask_R-101-FPN_2x_adjust_std011_ms.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (640, 800, 1000) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.0025 + WEIGHT_DECAY: 0.0001 + STEPS: (480000, 640000) + MAX_ITER: 720000 + IMS_PER_BATCH: 4 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-101-FPN_2x_adjust_std011_ms_gn.yaml b/configs/retina/retinanet_mask_R-101-FPN_2x_adjust_std011_ms_gn.yaml new file mode 100644 index 000000000..1d7374b05 --- /dev/null +++ b/configs/retina/retinanet_mask_R-101-FPN_2x_adjust_std011_ms_gn.yaml @@ -0,0 +1,61 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True + USE_GN: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (640, 800, 1000) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (240000, 320000) + MAX_ITER: 360000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + 
SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-50-FPN_1.5x.yaml b/configs/retina/retinanet_mask_R-50-FPN_1.5x.yaml new file mode 100644 index 000000000..e3ffb2543 --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_1.5x.yaml @@ -0,0 +1,58 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 diff --git a/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_400.yaml b/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_400.yaml new file mode 100644 index 000000000..1a6f71002 --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_400.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (400,) + MAX_SIZE_TRAIN: 667 + MIN_SIZE_TEST: 400 + MAX_SIZE_TEST: 667 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git 
a/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_500.yaml b/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_500.yaml new file mode 100644 index 000000000..1a6f71002 --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_500.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (400,) + MAX_SIZE_TRAIN: 667 + MIN_SIZE_TEST: 400 + MAX_SIZE_TEST: 667 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_600.yaml b/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_600.yaml new file mode 100644 index 000000000..e2eae3577 --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_600.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (600,) + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 600 + MAX_SIZE_TEST: 1000 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git 
a/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_800.yaml b/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_800.yaml new file mode 100644 index 000000000..635f2c904 --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_1.5x_adjust_std011_800.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-50-FPN_1x.yaml b/configs/retina/retinanet_mask_R-50-FPN_1x.yaml new file mode 100644 index 000000000..bb60b73ac --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_1x.yaml @@ -0,0 +1,58 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 diff --git a/configs/retina/retinanet_mask_R-50-FPN_2x_adjust_std011_ms.yaml b/configs/retina/retinanet_mask_R-50-FPN_2x_adjust_std011_ms.yaml new file mode 100644 index 
000000000..efbaea082 --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_2x_adjust_std011_ms.yaml @@ -0,0 +1,60 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (640, 800, 1000) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (240000, 320000) + MAX_ITER: 360000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_R-50-FPN_canonical5_1.5x.yaml b/configs/retina/retinanet_mask_R-50-FPN_canonical5_1.5x.yaml new file mode 100644 index 000000000..03715a62e --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_canonical5_1.5x.yaml @@ -0,0 +1,59 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + CANONICAL_LEVEL: 5 + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (180000, 240000) + MAX_ITER: 270000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 diff --git a/configs/retina/retinanet_mask_R-50-FPN_canonical5_1x.yaml b/configs/retina/retinanet_mask_R-50-FPN_canonical5_1x.yaml new file mode 100644 index 000000000..80edcbfcb --- /dev/null +++ b/configs/retina/retinanet_mask_R-50-FPN_canonical5_1x.yaml @@ -0,0 
+1,59 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125, 0.015625) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + CANONICAL_LEVEL: 5 + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 diff --git a/configs/retina/retinanet_mask_X-101-32x8d_FPN_2x_adjust_std011_ms_gn.yaml b/configs/retina/retinanet_mask_X-101-32x8d_FPN_2x_adjust_std011_ms_gn.yaml new file mode 100644 index 000000000..996fe4ebc --- /dev/null +++ b/configs/retina/retinanet_mask_X-101-32x8d_FPN_2x_adjust_std011_ms_gn.yaml @@ -0,0 +1,66 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/FAIR/20171220/X-101-32x8d" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-101-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + RESNETS: + STRIDE_IN_1X1: False + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + MASK_ON: True + USE_GN: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + #TEST: ("coco_test-dev",) +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800, 840, 880, 920, 960, 1000) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.0025 + WEIGHT_DECAY: 0.0001 + STEPS: (480000, 640000) + MAX_ITER: 720000 + IMS_PER_BATCH: 4 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 + BBOX_REG_BETA: 0.11 + SELFADJUST_SMOOTH_L1: True diff --git a/configs/retina/retinanet_mask_p2p7_R-50-FPN_1x.yaml b/configs/retina/retinanet_mask_p2p7_R-50-FPN_1x.yaml new file mode 100644 index 
000000000..91742b719 --- /dev/null +++ b/configs/retina/retinanet_mask_p2p7_R-50-FPN_1x.yaml @@ -0,0 +1,57 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p2p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 diff --git a/configs/retina/retinanet_sparsemask_R-50-FPN_1x.yaml b/configs/retina/retinanet_sparsemask_R-50-FPN_1x.yaml new file mode 100644 index 000000000..824adb64d --- /dev/null +++ b/configs/retina/retinanet_sparsemask_R-50-FPN_1x.yaml @@ -0,0 +1,59 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + RPN_ONLY: True + MASK_ON: False + SPARSE_MASK_ON: True + BACKBONE: + CONV_BODY: "R-50-FPN" + OUT_CHANNELS: 256 + RPN: + USE_FPN: True + FG_IOU_THRESHOLD: 0.5 + BG_IOU_THRESHOLD: 0.4 + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + BATCH_SIZE_PER_IMAGE: 256 + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.125, 0.0625, 0.03125) + #POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 4 gpus + BASE_LR: 0.005 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +RETINANET: + RETINANET_ON: True + BACKBONE: "p3p7" + SCALES_PER_OCTAVE: 3 + STRADDLE_THRESH: -1 + NUM_MASKS_TEST: 50 diff --git a/demo/Mask_R-CNN_demo.ipynb b/demo/Mask_R-CNN_demo.ipynb index 0d975eab7..ff1906630 100644 --- a/demo/Mask_R-CNN_demo.ipynb +++ b/demo/Mask_R-CNN_demo.ipynb @@ -203,7 +203,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/maskrcnn_benchmark/config/defaults.py 
b/maskrcnn_benchmark/config/defaults.py index 12b8eb5d6..ae7d9f544 100644 --- a/maskrcnn_benchmark/config/defaults.py +++ b/maskrcnn_benchmark/config/defaults.py @@ -19,13 +19,14 @@ # ----------------------------------------------------------------------------- _C = CN() - +_C.DEBUG = False _C.MODEL = CN() _C.MODEL.RPN_ONLY = False _C.MODEL.MASK_ON = False +_C.MODEL.SPARSE_MASK_ON = False _C.MODEL.DEVICE = "cuda" _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" - +_C.MODEL.USE_GN = False # If the WEIGHT starts with a catalog://, like :R-50, the code will look for # the path in paths_catalog. Else, it will use it as the specified absolute # path @@ -37,7 +38,7 @@ # ----------------------------------------------------------------------------- _C.INPUT = CN() # Size of the smallest side of the image during training -_C.INPUT.MIN_SIZE_TRAIN = 800 # (800,) +_C.INPUT.MIN_SIZE_TRAIN = (800,) # 800 # Maximum size of the side of the image during training _C.INPUT.MAX_SIZE_TRAIN = 1333 # Size of the smallest side of the image during testing @@ -192,7 +193,7 @@ _C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256) _C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14 _C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True - +_C.MODEL.ROI_MASK_HEAD.CANONICAL_LEVEL = 4 # ---------------------------------------------------------------------------- # # ResNe[X]t options (ResNets = {ResNet, ResNeXt} # Note that parts of a resnet may be used for both the backbone and the head @@ -221,6 +222,96 @@ _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 + +# ---------------------------------------------------------------------------- # +# RetinaNet Options (Follow the Detectron version) +# ---------------------------------------------------------------------------- # +_C.RETINANET = CN() + +# RetinaNet is used (instead of Fast/er/Mask R-CNN/R-FCN/RPN) if True +_C.RETINANET.RETINANET_ON = False + +# This is the number of foreground classes, background is not included. +_C.RETINANET.NUM_CLASSES = 81 + +# Anchor aspect ratios to use +_C.RETINANET.ANCHOR_SIZES = (32, 64, 128, 256, 512) +_C.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0) +_C.RETINANET.ANCHOR_STRIDES = (8, 16, 32, 64, 128) +_C.RETINANET.STRADDLE_THRESH = 0 + +# Anchor scales per octave +_C.RETINANET.OCTAVE = 2.0 +_C.RETINANET.SCALES_PER_OCTAVE = 3 + +# Convolutions to use in the cls and bbox tower +# NOTE: this doesn't include the last conv for logits +_C.RETINANET.NUM_CONVS = 4 + +# Weight for bbox_regression loss +_C.RETINANET.BBOX_REG_WEIGHT = 1.0 + +# Smooth L1 loss beta for bbox regression +_C.RETINANET.BBOX_REG_BETA = 0.11 + +# Use Self-Adjust Smooth L1 Loss +_C.RETINANET.SELFADJUST_SMOOTH_L1 = False + +# During inference, #locs to select based on cls score before NMS is performed +# per FPN level +_C.RETINANET.PRE_NMS_TOP_N = 1000 + +# IoU overlap ratio for labeling an anchor as positive +# Anchors with >= iou overlap are labeled positive +_C.RETINANET.POSITIVE_OVERLAP = 0.5 + +# IoU overlap ratio for labeling an anchor as negative +# Anchors with < iou overlap are labeled negative +_C.RETINANET.NEGATIVE_OVERLAP = 0.4 + +# Focal loss parameter: alpha +_C.RETINANET.LOSS_ALPHA = 0.25 + +# Focal loss parameter: gamma +_C.RETINANET.LOSS_GAMMA = 2.0 + +# Prior prob for the positives at the beginning of training. 
This is used to set +# the bias init for the logits layer +_C.RETINANET.PRIOR_PROB = 0.01 + +# Whether classification and bbox branch tower should be shared or not +_C.RETINANET.SHARE_CLS_BBOX_TOWER = False + +# Use class specific bounding box regression instead of the default class +# agnostic regression +_C.RETINANET.CLASS_SPECIFIC_BBOX = False + +# Whether softmax should be used in classification branch training +_C.RETINANET.SOFTMAX = False + +# Inference cls score threshold, anchors with score > INFERENCE_TH are +# considered for inference +_C.RETINANET.INFERENCE_TH = 0.05 + +# "p3p7": Use feature p3p7 for object detection and p3-p5 for mask prediction. +# "p2p7": Use feature p3p7 for object detection and p2-p5 for mask prediction. +_C.RETINANET.BACKBONE = "p3p7" + +_C.RETINANET.NUM_MASKS_TEST = 50 + +_C.RETINANET.LOW_QUALITY_MATCHES = True +_C.RETINANET.LOW_QUALITY_THRESHOLD = 0.0 + +# ---------------------------------------------------------------------------- # +# SparseMask Options (Follow the Detectron version) +# ---------------------------------------------------------------------------- # +_C.MODEL.SPARSE_MASK_HEAD = CN() +_C.MODEL.SPARSE_MASK_HEAD.PREDICTOR = "" +_C.MODEL.SPARSE_MASK_HEAD.FEATURE_EXTRACTOR = "SparseMaskFPNFeatureExtractor" +_C.MODEL.SPARSE_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256) +_C.MODEL.SPARSE_MASK_HEAD.RESOLUTION = 14 + + # ---------------------------------------------------------------------------- # # Solver # ---------------------------------------------------------------------------- # @@ -261,7 +352,7 @@ _C.TEST.IMS_PER_BATCH = 8 -# ---------------------------------------------------------------------------- # +_C.TEST.DETECTIONS_PER_IMG =100 # Misc options # ---------------------------------------------------------------------------- # _C.OUTPUT_DIR = "." 
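The RETINANET block added to defaults.py above introduces OCTAVE, SCALES_PER_OCTAVE, ANCHOR_SIZES and ASPECT_RATIOS. Assuming these follow the Detectron convention the "(Follow the Detectron version)" comment points to, each FPN level takes one base size from ANCHOR_SIZES and expands it into SCALES_PER_OCTAVE sizes spaced by a factor of OCTAVE ** (1 / SCALES_PER_OCTAVE), crossed with every aspect ratio. A minimal sketch under that assumption (the helper name is hypothetical, not code from this patch):

```python
from itertools import product

def retinanet_cell_anchors(base_size, octave=2.0, scales_per_octave=3,
                           aspect_ratios=(0.5, 1.0, 2.0)):
    # Hypothetical helper (not in this patch): enumerate the (w, h) pairs of
    # the anchors placed at one location of a single FPN level, assuming the
    # Detectron-style meaning of OCTAVE / SCALES_PER_OCTAVE / ASPECT_RATIOS.
    anchors = []
    for i, ratio in product(range(scales_per_octave), aspect_ratios):
        size = base_size * octave ** (i / scales_per_octave)  # 32, 40.3, 50.8 for base 32
        area = size * size
        w = (area / ratio) ** 0.5   # keep the area fixed, change the shape
        h = w * ratio
        anchors.append((round(w, 1), round(h, 1)))
    return anchors

# One entry of ANCHOR_SIZES per FPN level; e.g. the finest level (size 32):
anchors = retinanet_cell_anchors(32)
print(len(anchors), anchors[:3])
# -> 9 anchors per location: SCALES_PER_OCTAVE * len(ASPECT_RATIOS)
```

With the defaults above (OCTAVE 2.0, three scales per octave, three ratios) this yields 9 anchors per location on each of the five FPN levels, which is what the `SCALES_PER_OCTAVE: 3` lines in the retina configs rely on.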
diff --git a/maskrcnn_benchmark/config/paths_catalog.py b/maskrcnn_benchmark/config/paths_catalog.py index 67231baef..f712586f3 100644 --- a/maskrcnn_benchmark/config/paths_catalog.py +++ b/maskrcnn_benchmark/config/paths_catalog.py @@ -8,6 +8,22 @@ class DatasetCatalog(object): DATA_DIR = "datasets" DATASETS = { + "coco_test-dev": ( + "coco/test2017", + "coco/annotations/image_info_test-dev2017.json", + ), + "coco_2017_test": ( + "coco/test2017", + "coco/annotations/image_info_test2017.json", + ), + "coco_2017_train": ( + "coco/train2017", + "coco/annotations/instances_train2017.json", + ), + "coco_2017_val": ( + "coco/val2017", + "coco/annotations/instances_val2017.json", + ), "coco_2014_train": ( "coco/train2014", "coco/annotations/instances_train2014.json", diff --git a/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h new file mode 100644 index 000000000..308861e44 --- /dev/null +++ b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h @@ -0,0 +1,41 @@ +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor SigmoidFocalLoss_forward( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha) { + if (logits.type().is_cuda()) { +#ifdef WITH_CUDA + return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +at::Tensor SigmoidFocalLoss_backward( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha) { + if (logits.type().is_cuda()) { +#ifdef WITH_CUDA + return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/SparseSelect.h b/maskrcnn_benchmark/csrc/SparseSelect.h new file mode 100644 index 000000000..fe2d6962b --- /dev/null +++ b/maskrcnn_benchmark/csrc/SparseSelect.h @@ -0,0 +1,40 @@ +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor SparseSelect_forward( + const at::Tensor& features, + const at::Tensor& batches, + const at::Tensor& offsets, + const int kernel_size) { + if (features.type().is_cuda()) { +#ifdef WITH_CUDA + return SparseSelect_forward_cuda(features, batches, offsets, kernel_size); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return SparseSelect_forward_cpu(features, batches, offsets, kernel_size); +} + +int SparseSelect_backward( + at::Tensor& d_features, + const at::Tensor& batches, + const at::Tensor& offsets, + const int kernel_size, + const at::Tensor& d_outputs) { + if (d_outputs.type().is_cuda()) { +#ifdef WITH_CUDA + SparseSelect_backward_cuda(d_features, batches, offsets, kernel_size, d_outputs); + return 0; +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/cpu/SparseSelect_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/SparseSelect_cpu.cpp new file mode 100644 index 000000000..542e92970 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/SparseSelect_cpu.cpp @@ -0,0 +1,90 @@ +// Cheng-Yang Fu +// cyfu@cs.unc.edu +#include "cpu/vision.h" + +template +void SparseSelectForward_cpu_kernel( + const int 
nthreads, + const T* features, + const int* batches, + const int* offsets, + const int batch, + const int depth, + const int height, + const int width, + const int kernel_size, + const int kernel_half, + const int num, + T* outputs) { + + int n_masks = nthreads / (kernel_size * kernel_size * depth); + + + for (int mask_idx=0; mask_idx < n_masks; mask_idx++) { + for (int dim_idx =0; dim_idx < depth; dim_idx++) { + for (int h_offset=0; h_offset < kernel_size; h_offset++) { + for (int w_offset=0; w_offset < kernel_size; w_offset++) { + + int batch_idx = batches[mask_idx]; + int h_start = (offsets[mask_idx] / width) - kernel_half; + int w_start = (offsets[mask_idx] % width) - kernel_half; + + int feature_h_idx = h_start + h_offset; + int feature_w_idx = w_start + w_offset; + int output_index = mask_idx * depth * kernel_size * kernel_size + + dim_idx * kernel_size * kernel_size + + h_offset * kernel_size + w_offset; + + + if ((feature_h_idx <0) || (feature_h_idx >= height) || + (feature_w_idx <0) || (feature_w_idx >= width)) { + outputs[output_index] = 0; + }else{ + long feature_index = ((batch_idx * depth + dim_idx) * height + + feature_h_idx) * width + feature_w_idx; + outputs[output_index] = features[feature_index]; + } + + } + } + } + } +} // SigmoidFocalLossForward + + +at::Tensor SparseSelect_forward_cpu( + const at::Tensor& features, + const at::Tensor& batches, + const at::Tensor& offsets, + const int kernel_size) { + AT_ASSERTM(!features.type().is_cuda(), "features must be a CPU tensor"); + AT_ASSERTM(!batches.type().is_cuda(), "batches must be a CPU tensor"); + + const int batch = features.size(0); + const int depth = features.size(1); + const int height = features.size(2); + const int width = features.size(3); + const int num_samples = batches.size(0); + const int kernel_half = (kernel_size - 1) / 2; + + auto output = at::empty({num_samples, depth, kernel_size, kernel_size}, features.options()); + auto output_size = num_samples * features.size(1) * kernel_size * kernel_size; + + if (output.numel() == 0) { + return output; + } + + AT_DISPATCH_FLOATING_TYPES(features.type(), "SparseSelect_forward", [&] { + SparseSelectForward_cpu_kernel( + output_size, + features.contiguous().data(), + batches.contiguous().data(), + offsets.contiguous().data(), + batch, depth, height, width, kernel_size, kernel_half, + num_samples, + output.data()); + }); + return output; +} + + diff --git a/maskrcnn_benchmark/csrc/cpu/vision.h b/maskrcnn_benchmark/csrc/cpu/vision.h index 926112536..aa0b13820 100644 --- a/maskrcnn_benchmark/csrc/cpu/vision.h +++ b/maskrcnn_benchmark/csrc/cpu/vision.h @@ -3,6 +3,13 @@ #include +at::Tensor SparseSelect_forward_cpu( + const at::Tensor& features, + const at::Tensor& batches, + const at::Tensor& offsets, + const int kernel_size); + + at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu new file mode 100644 index 000000000..7d40767bb --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu @@ -0,0 +1,188 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
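The `SparseSelect` operator introduced above gathers a `kernel_size x kernel_size` window of every channel around each requested location, where `batches[i]` picks the image and `offsets[i]` encodes the window centre as `h * width + w`; positions falling outside the feature map are zero-filled. A plain-PyTorch equivalent is handy for unit-testing the extension. This is only a sketch, and `sparse_select_reference` is a hypothetical helper, not part of this change:

```python
import torch
import torch.nn.functional as F

def sparse_select_reference(features, batches, offsets, kernel_size):
    # features: (N, C, H, W); batches, offsets: (M,) integer tensors,
    # offsets[i] = h * W + w of the window centre in image batches[i].
    n, c, h, w = features.shape
    half = (kernel_size - 1) // 2
    # Zero padding reproduces the out-of-bounds handling of the kernels.
    padded = F.pad(features, (half, half, half, half))
    patches = []
    for b, off in zip(batches.tolist(), offsets.tolist()):
        cy, cx = off // w, off % w
        # In the padded map, the window starting at (cy, cx) covers
        # rows cy - half .. cy + half of the original feature map.
        patches.append(padded[b, :, cy:cy + kernel_size, cx:cx + kernel_size])
    return torch.stack(patches, dim=0)  # (M, C, kernel_size, kernel_size)
```

Comparing this against the compiled forward op (or the `SparseSelect` module) on random inputs is a quick correctness check for both the CPU and CUDA paths.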
+// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu +// Cheng-Yang Fu +// cyfu@cs.unc.edu +#include +#include + +#include +#include +#include + +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void SigmoidFocalLossForward(const int nthreads, + const T* logits, + const int* targets, + const int num_classes, + const float gamma, + const float alpha, + const int num, + T* losses) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int n = i / num_classes; + int d = i % num_classes; // current class[0~79]; + int t = targets[n]; // target class [1~80]; + + // Decide it is positive or negative case. + T c1 = (t == (d+1)); + T c2 = (t>=0 & t != (d+1)); + + T zn = (1.0 - alpha); + T zp = (alpha); + + // p = 1. / 1. + expf(-x); p = sigmoid(x) + T p = 1. / (1. + expf(-logits[i])); + + // (1-p)**gamma * log(p) where + T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); + + // p**gamma * log(1-p) + T term2 = powf(p, gamma) * + (-1. * logits[i] * (logits[i] >= 0) - + logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); + + losses[i] = 0.0; + losses[i] += -c1 * term1 * zp; + losses[i] += -c2 * term2 * zn; + + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossForward + + +template +__global__ void SigmoidFocalLossBackward(const int nthreads, + const T* logits, + const int* targets, + const T* d_losses, + const int num_classes, + const float gamma, + const float alpha, + const int num, + T* d_logits) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int n = i / num_classes; + int d = i % num_classes; // current class[0~79]; + int t = targets[n]; // target class [1~80], 0 is background; + + // Decide it is positive or negative case. + T c1 = (t == (d+1)); + T c2 = (t>=0 & t != (d+1)); + + T zn = (1.0 - alpha); + T zp = (alpha); + // p = 1. / 1. + expf(-x); p = sigmoid(x) + T p = 1. / (1. + expf(-logits[i])); + + // (1-p)**g * (1 - p - g*p*log(p) + T term1 = powf((1. - p), gamma) * + (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); + + // (p**g) * (g*(1-p)*log(1-p) - p) + T term2 = powf(p, gamma) * + ((-1. * logits[i] * (logits[i] >= 0) - + logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * + (1. 
- p) * gamma - p); + d_logits[i] = 0.0; + d_logits[i] += -c1 * term1 * zp; + d_logits[i] += -c2 * term2 * zn; + d_logits[i] = d_logits[i] * d_losses[i]; + + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossBackward + + +at::Tensor SigmoidFocalLoss_forward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha) { + AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + + const int num_samples = logits.size(0); + + auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); + auto losses_size = num_samples * logits.size(1); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L)); + dim3 block(512); + + if (losses.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return losses; + } + + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { + SigmoidFocalLossForward<<>>( + losses_size, + logits.contiguous().data(), + targets.contiguous().data(), + num_classes, + gamma, + alpha, + num_samples, + losses.data()); + }); + THCudaCheck(cudaGetLastError()); + return losses; +} + + +at::Tensor SigmoidFocalLoss_backward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha) { + AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); + + AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + + const int num_samples = logits.size(0); + AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); + + auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); + auto d_logits_size = num_samples * logits.size(1); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L)); + dim3 block(512); + + if (d_logits.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return d_logits; + } + + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { + SigmoidFocalLossBackward<<>>( + d_logits_size, + logits.contiguous().data(), + targets.contiguous().data(), + d_losses.contiguous().data(), + num_classes, + gamma, + alpha, + num_samples, + d_logits.data()); + }); + + THCudaCheck(cudaGetLastError()); + return d_logits; +} + diff --git a/maskrcnn_benchmark/csrc/cuda/SparseSelect_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SparseSelect_cuda.cu new file mode 100644 index 000000000..0beee17fa --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/SparseSelect_cuda.cu @@ -0,0 +1,184 @@ +// Cheng-Yang Fu +// cyfu@cs.unc.edu +#include +#include + +#include +#include +#include + +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void SparseSelectForward(const int nthreads, + const T* features, + const int* batches, + const int* offsets, + const int batch, + const int depth, + const int height, + const int width, + const int kernel_size, + const int kernel_half, + const int num, + T* outputs) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int w_offset = i % kernel_size; + int h_offset = (i / 
kernel_size) % kernel_size; + int dim_idx = (i / (kernel_size * kernel_size)) % depth; + int mask_idx = i / (kernel_size * kernel_size * depth); + + int batch_idx = batches[mask_idx]; + int h_start = (offsets[mask_idx] / width) - kernel_half; + int w_start = (offsets[mask_idx] % width) - kernel_half; + + int feature_h_idx = h_start + h_offset; + int feature_w_idx = w_start + w_offset; + int output_index = mask_idx * depth * kernel_size * kernel_size + + dim_idx * kernel_size * kernel_size + + h_offset * kernel_size + w_offset; + + + if ((feature_h_idx <0) || (feature_h_idx >= height) || + (feature_w_idx <0) || (feature_w_idx >= width)) { + outputs[output_index] = 0; + }else{ + long feature_index = ((batch_idx * depth + dim_idx) * height + + feature_h_idx) * width + feature_w_idx; + outputs[output_index] = features[feature_index]; + } + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossForward + + +template +__global__ void SparseSelectBackward(const int nthreads, + T* d_features, + const int* batches, + const int* offsets, + const int batch, + const int depth, + const int height, + const int width, + const int kernel_size, + const int kernel_half, + const int num, + const T* d_outputs) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int w_offset = i % kernel_size; + int h_offset = (i / kernel_size) % kernel_size; + int dim_idx = (i / (kernel_size * kernel_size)) % depth; + int mask_idx = i / (kernel_size * kernel_size * depth); + + int batch_idx = batches[mask_idx]; + int h_start = (offsets[mask_idx] / width) - kernel_half; + int w_start = (offsets[mask_idx] % width) - kernel_half; + + int feature_h_idx = h_start + h_offset; + int feature_w_idx = w_start + w_offset; + int output_index = mask_idx * depth * kernel_size * kernel_size + + dim_idx * kernel_size * kernel_size + + h_offset * kernel_size + w_offset; + + + if ((feature_h_idx <0) || (feature_h_idx >= height) || + (feature_w_idx <0) || (feature_w_idx >= width)) { + //do nothing + }else{ + long feature_index = ((batch_idx * depth + dim_idx) * height + + feature_h_idx) * width + feature_w_idx; + atomicAdd(d_features + feature_index, d_outputs[output_index]); + } + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossForward + + +at::Tensor SparseSelect_forward_cuda( + const at::Tensor& features, + const at::Tensor& batches, + const at::Tensor& offsets, + const int kernel_size) { + AT_ASSERTM(features.type().is_cuda(), "features must be a CUDA tensor"); + AT_ASSERTM(batches.type().is_cuda(), "batches must be a CUDA tensor"); + + const int batch = features.size(0); + const int depth = features.size(1); + const int height = features.size(2); + const int width = features.size(3); + const int num_samples = batches.size(0); + const int kernel_half = (kernel_size - 1) / 2; + + auto output = at::empty({num_samples, depth, kernel_size, kernel_size}, features.options()); + auto output_size = num_samples * features.size(1) * kernel_size * kernel_size; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return output; + } + + AT_DISPATCH_FLOATING_TYPES(features.type(), "SparseSelect_forward", [&] { + SparseSelectForward<<>>( + output_size, + features.contiguous().data(), + batches.contiguous().data(), + offsets.contiguous().data(), + batch, depth, height, width, kernel_size, kernel_half, + num_samples, + output.data()); + }); + THCudaCheck(cudaGetLastError()); + return output; +} + + +void 
SparseSelect_backward_cuda( + at::Tensor& d_features, + const at::Tensor& batches, + const at::Tensor& offsets, + const int kernel_size, + const at::Tensor& d_outputs) { + + AT_ASSERTM(d_outputs.type().is_cuda(), "d_outputs must be a CUDA tensor"); + AT_ASSERTM(d_features.type().is_cuda(), "d_features must be a CUDA tensor"); + AT_ASSERTM(batches.type().is_cuda(), "batches must be a CUDA tensor"); + + const int batch = d_features.size(0); + const int depth = d_features.size(1); + const int height = d_features.size(2); + const int width = d_features.size(3); + const int num_samples = batches.size(0); + const int kernel_half = (kernel_size - 1) / 2; + + d_features.zero_(); + auto output_size = num_samples * d_features.size(1) * kernel_size * kernel_size; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L)); + dim3 block(512); + + AT_DISPATCH_FLOATING_TYPES(d_features.type(), "SparseSelect_backward", [&] { + SparseSelectForward<<>>( + output_size, + d_features.contiguous().data(), + batches.contiguous().data(), + offsets.contiguous().data(), + batch, depth, height, width, kernel_size, kernel_half, + num_samples, + d_outputs.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); +} + + diff --git a/maskrcnn_benchmark/csrc/cuda/vision.h b/maskrcnn_benchmark/csrc/cuda/vision.h index 977cef7b5..a7a03ebfe 100644 --- a/maskrcnn_benchmark/csrc/cuda/vision.h +++ b/maskrcnn_benchmark/csrc/cuda/vision.h @@ -3,6 +3,34 @@ #include +at::Tensor SparseSelect_forward_cuda( + const at::Tensor& features, + const at::Tensor& batches, + const at::Tensor& offsets, + const int kernel_size); + +void SparseSelect_backward_cuda( + at::Tensor& d_features, + const at::Tensor& batches, + const at::Tensor& offsets, + const int kernel_size, + const at::Tensor& d_outputs); + +at::Tensor SigmoidFocalLoss_forward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha); + +at::Tensor SigmoidFocalLoss_backward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha); + at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, diff --git a/maskrcnn_benchmark/csrc/vision.cpp b/maskrcnn_benchmark/csrc/vision.cpp index ff002584c..2baa1baa3 100644 --- a/maskrcnn_benchmark/csrc/vision.cpp +++ b/maskrcnn_benchmark/csrc/vision.cpp @@ -2,7 +2,8 @@ #include "nms.h" #include "ROIAlign.h" #include "ROIPool.h" - +#include "SigmoidFocalLoss.h" +#include "SparseSelect.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("nms", &nms, "non-maximum suppression"); @@ -10,4 +11,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); + m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); + m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); + m.def("sparse_select_forward", &SparseSelect_forward, "SparseSelect_forward"); + m.def("sparse_select_backward", &SparseSelect_backward, "SparseSelect_backward"); } diff --git a/maskrcnn_benchmark/data/transforms/build.py b/maskrcnn_benchmark/data/transforms/build.py index 8645d4df4..1285f8419 100644 --- 
a/maskrcnn_benchmark/data/transforms/build.py +++ b/maskrcnn_benchmark/data/transforms/build.py @@ -7,10 +7,12 @@ def build_transforms(cfg, is_train=True): min_size = cfg.INPUT.MIN_SIZE_TRAIN max_size = cfg.INPUT.MAX_SIZE_TRAIN flip_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN + resize = T.MultiScaleResize(min_size, max_size) else: min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST flip_prob = 0 + resize = T.Resize(min_size, max_size) to_bgr255 = cfg.INPUT.TO_BGR255 normalize_transform = T.Normalize( @@ -19,7 +21,7 @@ def build_transforms(cfg, is_train=True): transform = T.Compose( [ - T.Resize(min_size, max_size), + resize, T.RandomHorizontalFlip(flip_prob), T.ToTensor(), normalize_transform, diff --git a/maskrcnn_benchmark/data/transforms/transforms.py b/maskrcnn_benchmark/data/transforms/transforms.py index 71d48d295..4b8ab1d45 100644 --- a/maskrcnn_benchmark/data/transforms/transforms.py +++ b/maskrcnn_benchmark/data/transforms/transforms.py @@ -59,6 +59,19 @@ def __call__(self, image, target): return image, target +class MultiScaleResize(object): + def __init__(self, min_sizes, max_size): + self.resizers = [] + for min_size in min_sizes: + self.resizers.append(Resize(min_size, max_size)) + + def __call__(self, image, target): + resizer = random.choice(self.resizers) + image, target = resizer(image, target) + + return image, target + + class RandomHorizontalFlip(object): def __init__(self, prob=0.5): self.prob = prob diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index af8049303..7e8d05ef4 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -6,7 +6,7 @@ import torch from torch.distributed import deprecated as dist -from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.utils.comm import get_world_size, get_rank from maskrcnn_benchmark.utils.metric_logger import MetricLogger @@ -82,7 +82,8 @@ def do_train( eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - if iteration % 20 == 0 or iteration == (max_iter - 1): + #if iteration % 20 == 0 or iteration == (max_iter - 1): + if True: logger.info( meters.delimiter.join( [ @@ -101,7 +102,7 @@ def do_train( ) ) if iteration % checkpoint_period == 0 and iteration > 0: - checkpointer.save("model_{:07d}".format(iteration), **arguments) + checkpointer.save("model_{:07d}".format(iteration+1), **arguments) checkpointer.save("model_{:07d}".format(iteration), **arguments) total_training_time = time.time() - start_training_time diff --git a/maskrcnn_benchmark/layers/__init__.py b/maskrcnn_benchmark/layers/__init__.py index 0b7f77c8b..cf89b2af4 100644 --- a/maskrcnn_benchmark/layers/__init__.py +++ b/maskrcnn_benchmark/layers/__init__.py @@ -10,6 +10,12 @@ from .roi_align import roi_align from .roi_pool import ROIPool from .roi_pool import roi_pool -from .smooth_l1_loss import smooth_l1_loss +from .smooth_l1_loss import smooth_l1_loss, SmoothL1Loss +from .sigmoid_focal_loss import SigmoidFocalLoss +from .sparse_select import SparseSelect +from .adjust_smooth_l1_loss import AdjustSmoothL1Loss -__all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", "FrozenBatchNorm2d"] +__all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", + "smooth_l1_loss", "SmoothL1Loss", "Conv2d", "ConvTranspose2d", + "interpolate", "FrozenBatchNorm2d", "SigmoidFocalLoss", + "SparseSelect", 
"AdjustSmoothL1Loss"] diff --git a/maskrcnn_benchmark/layers/adjust_smooth_l1_loss.py b/maskrcnn_benchmark/layers/adjust_smooth_l1_loss.py new file mode 100644 index 000000000..297cadb2a --- /dev/null +++ b/maskrcnn_benchmark/layers/adjust_smooth_l1_loss.py @@ -0,0 +1,58 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn +import logging +from torch.distributed import deprecated as dist + +class AdjustSmoothL1Loss(nn.Module): + + def __init__(self, num_features, momentum=0.1, beta=1. /9): + super(AdjustSmoothL1Loss, self).__init__() + self.num_features = num_features + self.momentum = momentum + self.beta = beta + self.register_buffer( + 'running_mean', torch.empty(num_features).fill_(beta) + ) + self.register_buffer('running_var', torch.zeros(num_features)) + self.logger = logging.getLogger("maskrcnn_benchmark.trainer") + + def forward(self, inputs, target, size_average=True): + + n = torch.abs(inputs -target) + with torch.no_grad(): + if torch.isnan(n.var(dim=0)).sum().item() == 0: + self.running_mean = self.running_mean.to(n.device) + self.running_mean *= (1 - self.momentum) + self.running_mean += (self.momentum * n.mean(dim=0)) + self.running_var = self.running_var.to(n.device) + self.running_var *= (1 - self.momentum) + self.running_var += (self.momentum * n.var(dim=0)) + + + beta = (self.running_mean - self.running_var) + + self.logger.info('AdjustSmoothL1(mean): {:.3}, {:.3}, {:.3}, {:.3}'.format( + self.running_mean[0].item(), + self.running_mean[1].item(), + self.running_mean[2].item(), + self.running_mean[3].item() + )) + self.logger.info('AdjustSmoothL1(var): {:.3}, {:.3}, {:.3}, {:.3}'.format( + self.running_var[0].item(), + self.running_var[1].item(), + self.running_var[2].item(), + self.running_var[3].item() + )) + beta = beta.clamp(max=self.beta, min=1e-3) + + #beta = (self.running_mean - self.running_var).clamp( + # max=self.beta, min=1e-3) + + beta = beta.view(-1, self.num_features).to(n.device) + cond = n < beta.expand_as(n) + loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) + if size_average: + return loss.mean() + return loss.sum() + diff --git a/maskrcnn_benchmark/layers/sigmoid_focal_loss.py b/maskrcnn_benchmark/layers/sigmoid_focal_loss.py new file mode 100644 index 000000000..fc7e37a74 --- /dev/null +++ b/maskrcnn_benchmark/layers/sigmoid_focal_loss.py @@ -0,0 +1,58 @@ +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from maskrcnn_benchmark import _C + + +class _SigmoidFocalLoss(Function): + @staticmethod + def forward(ctx, logits, targets, num_classes, gamma, alpha): + ctx.save_for_backward(logits, targets); + ctx.num_classes = num_classes + ctx.gamma = gamma + ctx.alpha = alpha + + losses = _C.sigmoid_focalloss_forward( + logits, targets, num_classes, gamma, alpha + ) + return losses + + @staticmethod + @once_differentiable + def backward(ctx, d_loss): + logits, targets = ctx.saved_tensors + num_classes = ctx.num_classes + gamma = ctx.gamma + alpha = ctx.alpha + d_loss = d_loss.contiguous() + d_logits = _C.sigmoid_focalloss_backward( + logits, targets, d_loss, num_classes, gamma, alpha + ) + return d_logits, None, None, None, None + + +sigmoid_focalloss = _SigmoidFocalLoss.apply + + +class SigmoidFocalLoss(nn.Module): + def __init__(self, num_classes, gamma, alpha): + super(SigmoidFocalLoss, self).__init__() + self.num_classes = num_classes + self.gamma = gamma + self.alpha = alpha + + def forward(self, 
logits, targets): + loss = sigmoid_focalloss( + logits, targets, self.num_classes, self.gamma, self.alpha + ) + return loss.sum() + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "num_classes=" + str(self.num_classes) + tmpstr += ", gamma=" + str(self.gamma) + tmpstr += ", alpha=" + str(self.alpha) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/smooth_l1_loss.py b/maskrcnn_benchmark/layers/smooth_l1_loss.py index 9c4664bb4..294deec4c 100644 --- a/maskrcnn_benchmark/layers/smooth_l1_loss.py +++ b/maskrcnn_benchmark/layers/smooth_l1_loss.py @@ -2,6 +2,15 @@ import torch +class SmoothL1Loss(torch.nn.Module): + def __init__(self, beta=1. /9): + super(SmoothL1Loss, self).__init__() + self.beta = beta + + def forward(self, input, target, size_average=True): + return smooth_l1_loss(input, target, self.beta, size_average) + + # TODO maybe push this to nn? def smooth_l1_loss(input, target, beta=1. / 9, size_average=True): """ diff --git a/maskrcnn_benchmark/layers/sparse_select.py b/maskrcnn_benchmark/layers/sparse_select.py new file mode 100644 index 000000000..09d24365e --- /dev/null +++ b/maskrcnn_benchmark/layers/sparse_select.py @@ -0,0 +1,51 @@ +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from maskrcnn_benchmark import _C + + +class _SparseSelect(Function): + @staticmethod + def forward(ctx, features, batches, offsets, kernel_size): + ctx.save_for_backward(features, batches, offsets); + ctx.kernel_size = kernel_size + + outputs = _C.sparse_select_forward( + features, batches, offsets, kernel_size + ) + return outputs + + @staticmethod + @once_differentiable + def backward(ctx, d_outputs): + features, batches, offsets = ctx.saved_tensors + kernel_size = ctx.kernel_size + d_outputs = d_outputs.contiguous() + d_features = features.clone() + _C.sparse_select_backward( + d_features, batches, offsets, kernel_size, d_outputs + ) + return d_features, None, None, None + + +sparse_select = _SparseSelect.apply + + +class SparseSelect(nn.Module): + def __init__(self, kernel_size): + super(SparseSelect, self).__init__() + self.kernel_size = kernel_size + + def forward(self, features, batches, offsets): + sparse_features = sparse_select( + features, batches, offsets, self.kernel_size + ) + return sparse_features + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "kernel_size=" + str(self.kernel_size) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/modeling/backbone/backbone.py b/maskrcnn_benchmark/modeling/backbone/backbone.py index 0af09683c..5e1e71f14 100644 --- a/maskrcnn_benchmark/modeling/backbone/backbone.py +++ b/maskrcnn_benchmark/modeling/backbone/backbone.py @@ -26,12 +26,61 @@ def build_resnet_fpn_backbone(cfg): ], out_channels=out_channels, top_blocks=fpn_module.LastLevelMaxPool(), + use_gn=cfg.MODEL.USE_GN ) model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) return model -_BACKBONES = {"resnet": build_resnet_backbone, "resnet-fpn": build_resnet_fpn_backbone} +def build_resnet_fpn_p3p7_backbone(cfg): + body = resnet.ResNet(cfg) + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + out_channels = cfg.MODEL.BACKBONE.OUT_CHANNELS + fpn = fpn_module.FPN( + in_channels_list=[ + 0, + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ], + out_channels=out_channels, + top_blocks=fpn_module.LastLevelP6P7(out_channels), + use_gn=cfg.MODEL.USE_GN + ) + model = 
nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) + return model + + +_BACKBONES = {"resnet": build_resnet_backbone, + "resnet-fpn": build_resnet_fpn_backbone, + "resnet-fpn-retina": build_resnet_fpn_p3p7_backbone, + } + + +def build_resnet_fpn_p2p7_backbone(cfg): + body = resnet.ResNet(cfg) + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + out_channels = cfg.MODEL.BACKBONE.OUT_CHANNELS + fpn = fpn_module.FPN( + in_channels_list=[ + in_channels_stage2, + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ], + out_channels=out_channels, + top_blocks=fpn_module.LastLevelP6P7(out_channels), + use_gn=cfg.MODEL.USE_GN + ) + model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) + return model + + +_BACKBONES = {"resnet": build_resnet_backbone, + "resnet-fpn": build_resnet_fpn_backbone, + "resnet-fpn-retina": build_resnet_fpn_p3p7_backbone, + } + def build_backbone(cfg): @@ -40,5 +89,14 @@ def build_backbone(cfg): ), "Only ResNet and ResNeXt models are currently implemented" # Models using FPN end with "-FPN" if cfg.MODEL.BACKBONE.CONV_BODY.endswith("-FPN"): - return build_resnet_fpn_backbone(cfg) + if cfg.RETINANET.RETINANET_ON: + if cfg.RETINANET.BACKBONE == "p3p7": + return build_resnet_fpn_p3p7_backbone(cfg) + elif cfg.RETINANET.BACKBONE == "p2p7": + return build_resnet_fpn_p2p7_backbone(cfg) + else: + raise Exception("Wrong Setting {}:{}".format( + 'cfg.RETINANET.BACKBONE', cfg.RETINANET.BACKBBACKBONE)) + else: + return build_resnet_fpn_backbone(cfg) return build_resnet_backbone(cfg) diff --git a/maskrcnn_benchmark/modeling/backbone/fpn.py b/maskrcnn_benchmark/modeling/backbone/fpn.py index c9ee8c674..b7dbd5a1f 100644 --- a/maskrcnn_benchmark/modeling/backbone/fpn.py +++ b/maskrcnn_benchmark/modeling/backbone/fpn.py @@ -11,7 +11,8 @@ class FPN(nn.Module): order, and must be consecutive """ - def __init__(self, in_channels_list, out_channels, top_blocks=None): + def __init__(self, in_channels_list, out_channels, top_blocks=None, + use_gn=False): """ Arguments: in_channels_list (list[int]): number of channels for each feature map that @@ -24,16 +25,37 @@ def __init__(self, in_channels_list, out_channels, top_blocks=None): super(FPN, self).__init__() self.inner_blocks = [] self.layer_blocks = [] + # If in_channels is 0, it would be used. 
+ self.valid_layers = [i > 0 for i in in_channels_list] for idx, in_channels in enumerate(in_channels_list, 1): inner_block = "fpn_inner{}".format(idx) layer_block = "fpn_layer{}".format(idx) - inner_block_module = nn.Conv2d(in_channels, out_channels, 1) - layer_block_module = nn.Conv2d(out_channels, out_channels, 3, 1, 1) + + if in_channels == 0: + continue + + if use_gn: + inner_block_module = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.GroupNorm(32, out_channels)) + layer_block_module = nn.Sequential( + nn.Conv2d(out_channels, out_channels, 3, 1, 1), + nn.GroupNorm(32, out_channels)) + else: + inner_block_module = nn.Conv2d(in_channels, out_channels, 1) + layer_block_module = nn.Conv2d(out_channels, out_channels, 3, 1, 1) + for module in [inner_block_module, layer_block_module]: - # Caffe2 implementation uses XavierFill, which in fact - # corresponds to kaiming_uniform_ in PyTorch - nn.init.kaiming_uniform_(module.weight, a=1) - nn.init.constant_(module.bias, 0) + for m in module.modules(): + if isinstance(m, nn.Conv2d): + # Caffe2 implementation uses XavierFill, which in fact + # corresponds to kaiming_uniform_ in PyTorch + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + if isinstance(m, nn.GroupNorm): + nn.init.constant_(m.weight, 1.0) + nn.init.constant_(m.bias, 0) + self.add_module(inner_block, inner_block_module) self.add_module(layer_block, layer_block_module) self.inner_blocks.append(inner_block) @@ -54,13 +76,14 @@ def forward(self, x): for feature, inner_block, layer_block in zip( x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] ): - inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") - inner_lateral = getattr(self, inner_block)(feature) - # TODO use size instead of scale to make it robust to different sizes - # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], - # mode='bilinear', align_corners=False) - last_inner = inner_lateral + inner_top_down - results.insert(0, getattr(self, layer_block)(last_inner)) + if len(inner_block): + inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") + inner_lateral = getattr(self, inner_block)(feature) + # TODO use size instead of scale to make it robust to different sizes + # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], + # mode='bilinear', align_corners=False) + last_inner = inner_lateral + inner_top_down + results.insert(0, getattr(self, layer_block)(last_inner)) if self.top_blocks is not None: last_results = self.top_blocks(results[-1]) @@ -72,3 +95,21 @@ def forward(self, x): class LastLevelMaxPool(nn.Module): def forward(self, x): return [F.max_pool2d(x, 1, 2, 0)] + + +class LastLevelP6P7(nn.Module): + """ + This module is used in RetinaNet to generate extra layers, P6 and P7. 
+ """ + def __init__(self, out_channels): + super(LastLevelP6P7, self).__init__() + self.p6 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) + self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) + for module in [self.p6, self.p7]: + nn.init.kaiming_uniform_(module.weight, a=1) + nn.init.constant_(module.bias, 0) + + def forward(self, x): + p6 = self.p6(x) + p7 = self.p7(F.relu(p6)) + return [p6, p7] diff --git a/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py index 1c9953f14..c0bd00444 100644 --- a/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py +++ b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py @@ -46,8 +46,8 @@ def __call__(self, matched_idxs): num_neg = min(negative.numel(), num_neg) # randomly select positive and negative examples - perm1 = torch.randperm(positive.numel())[:num_pos] - perm2 = torch.randperm(negative.numel())[:num_neg] + perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] + perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] pos_idx_per_image = positive[perm1] neg_idx_per_image = negative[perm2] diff --git a/maskrcnn_benchmark/modeling/detector/detectors.py b/maskrcnn_benchmark/modeling/detector/detectors.py index af2100cac..33ca7353d 100644 --- a/maskrcnn_benchmark/modeling/detector/detectors.py +++ b/maskrcnn_benchmark/modeling/detector/detectors.py @@ -1,8 +1,9 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. from .generalized_rcnn import GeneralizedRCNN +from .retinanet import RetinaNet - -_DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN} +_DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN, + "RetinaNet": RetinaNet} def build_detection_model(cfg): diff --git a/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py index 63b5868f1..9ac5de704 100644 --- a/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py +++ b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py @@ -10,6 +10,7 @@ from ..backbone import build_backbone from ..rpn.rpn import build_rpn +from ..rpn.retinanet import build_retinanet from ..roi_heads.roi_heads import build_roi_heads @@ -27,7 +28,10 @@ def __init__(self, cfg): super(GeneralizedRCNN, self).__init__() self.backbone = build_backbone(cfg) - self.rpn = build_rpn(cfg) + if not cfg.RETINANET.RETINANET_ON: + self.rpn = build_rpn(cfg) + else: + self.rpn = build_retinanet(cfg) self.roi_heads = build_roi_heads(cfg) def forward(self, images, targets=None): diff --git a/maskrcnn_benchmark/modeling/detector/retinanet.py b/maskrcnn_benchmark/modeling/detector/retinanet.py new file mode 100644 index 000000000..6fc12a408 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/retinanet.py @@ -0,0 +1,112 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
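For reference, the per-element loss computed by the `SigmoidFocalLoss` kernels added earlier in this diff can be reproduced with autograd-friendly PyTorch ops. This is only a numerical sanity-check sketch (the helper name is made up here); `targets` holds class indices in `1..num_classes`, with `0` meaning background, matching the kernel's convention:

```python
import torch

def sigmoid_focal_loss_reference(logits, targets, gamma=2.0, alpha=0.25):
    # logits: (N, num_classes); targets: (N,) int64, 0 = background,
    # 1..num_classes = foreground class.
    num_classes = logits.shape[1]
    dtype, device = logits.dtype, logits.device
    class_range = torch.arange(1, num_classes + 1, dtype=dtype, device=device).unsqueeze(0)
    t = targets.unsqueeze(1).to(dtype)
    p = torch.sigmoid(logits)
    # (1 - p)^gamma * log(p) for the matching class, p^gamma * log(1 - p) otherwise.
    term1 = (1 - p) ** gamma * torch.log(p.clamp(min=1e-38))
    term2 = p ** gamma * torch.log((1 - p).clamp(min=1e-38))
    pos = (t == class_range).to(dtype)                # c1 in the kernel
    neg = ((t >= 0) & (t != class_range)).to(dtype)   # c2 in the kernel
    return -pos * alpha * term1 - neg * (1 - alpha) * term2
```

Summing this reference and comparing it with the output of the `SigmoidFocalLoss` module on the same CUDA tensors should agree to within floating-point tolerance.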
+""" +Implements the Generalized R-CNN framework +""" + +import torch +from torch import nn + +from maskrcnn_benchmark.structures.image_list import to_image_list + +from ..backbone import build_backbone +from ..rpn.retinanet import build_retinanet +from maskrcnn_benchmark.modeling.roi_heads.mask_head.mask_head import build_roi_mask_head +from maskrcnn_benchmark.modeling.roi_heads.sparsemask_head.mask_head import build_sparse_mask_head +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +import copy + +class RetinaNet(nn.Module): + """ + Main class for RetinaNet + It consists of three main parts: + - backbone + - bbox_heads: BBox prediction. + - Mask_heads: + """ + + def __init__(self, cfg): + super(RetinaNet, self).__init__() + self.cfg = copy.deepcopy(cfg) + self.backbone = build_backbone(cfg) + self.rpn = build_retinanet(cfg) + self.mask = None + if cfg.MODEL.MASK_ON: + self.mask = build_roi_mask_head(cfg) + if cfg.MODEL.SPARSE_MASK_ON: + self.mask = build_sparse_mask_head(cfg) + + + def forward(self, images, targets=None): + """ + Arguments: + images (list[Tensor] or ImageList): images to be processed + targets (list[BoxList]): ground-truth boxes present in the image (optional) + + Returns: + result (list[BoxList] or dict[Tensor]): the output from the model. + During training, it returns a dict[Tensor] which contains the losses. + During testing, it returns list[BoxList] contains additional fields + like `scores`, `labels` and `mask` (for Mask R-CNN models). + + """ + if self.training and targets is None: + raise ValueError("In training mode, targets should be passed") + images = to_image_list(images) + features = self.backbone(images.tensors) + + # Retina RPN Output + rpn_features = features + if self.cfg.RETINANET.BACKBONE == "p2p7": + rpn_features = features[1:] + (anchors, detections), detector_losses = self.rpn(images, rpn_features, targets) + if self.training: + losses = {} + losses.update(detector_losses) + if self.mask: + if self.cfg.MODEL.MASK_ON: + # Padding the GT + proposals = [] + for (image_detections, image_targets) in zip( + detections, targets): + merge_list = [] + if not isinstance(image_detections, list): + merge_list.append(image_detections.copy_with_fields('labels')) + + if not isinstance(image_targets, list): + merge_list.append(image_targets.copy_with_fields('labels')) + + if len(merge_list) == 1: + proposals.append(merge_list[0]) + else: + proposals.append(cat_boxlist(merge_list)) + x, result, mask_losses = self.mask(features, proposals, targets) + elif self.cfg.MODEL.SPARSE_MASK_ON: + x, result, mask_losses = self.mask(features, anchors, targets) + + losses.update(mask_losses) + return losses + else: + if self.mask: + proposals = [] + for image_detections in detections: + num_of_detections = image_detections.bbox.shape[0] + if num_of_detections > self.cfg.RETINANET.NUM_MASKS_TEST > 0: + cls_scores = image_detections.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), num_of_detections - \ + self.cfg.RETINANET.NUM_MASKS_TEST + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + image_detections = image_detections[keep] + + proposals.append(image_detections) + + if self.cfg.MODEL.SPARSE_MASK_ON: + x, detections, mask_losses = self.mask( + features, proposals, targets + ) + else: + x, detections, mask_losses = self.mask(features, proposals, targets) + return detections + diff --git a/maskrcnn_benchmark/modeling/matcher.py b/maskrcnn_benchmark/modeling/matcher.py index e051d3f59..613241676 
100644 --- a/maskrcnn_benchmark/modeling/matcher.py +++ b/maskrcnn_benchmark/modeling/matcher.py @@ -20,7 +20,8 @@ class Matcher(object): BELOW_LOW_THRESHOLD = -1 BETWEEN_THRESHOLDS = -2 - def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False): + def __init__(self, high_threshold, low_threshold, + allow_low_quality_matches=False, low_quality_threshold=0.0): """ Args: high_threshold (float): quality values greater than or equal to @@ -38,6 +39,7 @@ def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=Fals self.high_threshold = high_threshold self.low_threshold = low_threshold self.allow_low_quality_matches = allow_low_quality_matches + self.low_quality_threshold = low_quality_threshold def __call__(self, match_quality_matrix): """ @@ -84,6 +86,11 @@ def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): """ # For each gt, find the prediction with which it has highest quality highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) + + if self.low_quality_threshold > 0.0: + select = highest_quality_foreach_gt >= self.low_quality_threshold + highest_quality_foreach_gt = highest_quality_foreach_gt[select] + match_quality_matrix = match_quality_matrix[select] # Find highest quality match available, even if it is low, including ties gt_pred_pairs_of_highest_quality = torch.nonzero( match_quality_matrix == highest_quality_foreach_gt[:, None] diff --git a/maskrcnn_benchmark/modeling/poolers.py b/maskrcnn_benchmark/modeling/poolers.py index bf0f4b8a6..eb0478a1c 100644 --- a/maskrcnn_benchmark/modeling/poolers.py +++ b/maskrcnn_benchmark/modeling/poolers.py @@ -52,7 +52,7 @@ class Pooler(nn.Module): which is available thanks to the BoxList. """ - def __init__(self, output_size, scales, sampling_ratio): + def __init__(self, output_size, scales, sampling_ratio, canonical_level=4): """ Arguments: output_size (list[tuple[int]] or list[int]): output size for the pooled region @@ -73,7 +73,9 @@ def __init__(self, output_size, scales, sampling_ratio): # downsamples by a factor of 2 at each level. 
lvl_min = -torch.log2(torch.tensor(scales[0], dtype=torch.float32)).item() lvl_max = -torch.log2(torch.tensor(scales[-1], dtype=torch.float32)).item() - self.map_levels = LevelMapper(lvl_min, lvl_max) + self.map_levels = LevelMapper( + lvl_min, lvl_max, canonical_level=canonical_level + ) def convert_to_roi_format(self, boxes): concat_boxes = cat([b.bbox for b in boxes], dim=0) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py index 79eb9ac25..e05fcbb1d 100644 --- a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py @@ -17,10 +17,10 @@ def __init__(self, config, pretrained=None): self.bbox_pred = nn.Linear(num_inputs, num_classes * 4) nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) - nn.init.constant_(self.cls_score.weight, 0) + nn.init.constant_(self.cls_score.bias, 0) nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) - nn.init.constant_(self.bbox_pred.weight, 0) + nn.init.constant_(self.bbox_pred.bias, 0) def forward(self, x): x = self.avgpool(x) diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py index 66f2c2665..162944a0e 100644 --- a/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py @@ -28,6 +28,7 @@ def __init__(self, cfg): output_size=(resolution, resolution), scales=scales, sampling_ratio=sampling_ratio, + canonical_level=cfg.MODEL.ROI_MASK_HEAD.CANONICAL_LEVEL, ) input_size = cfg.MODEL.BACKBONE.OUT_CHANNELS self.pooler = pooler diff --git a/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/loss.py new file mode 100644 index 000000000..639f51d6b --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/loss.py @@ -0,0 +1,123 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.utils import cat + +import visdom + +def project_masks_on_boxes(segmentation_masks, proposals, discretization_size): + """ + Given segmentation masks and the bounding boxes corresponding + to the location of the masks in the image, this function + crops and resizes the masks in the position defined by the + boxes. This prepares the masks for them to be fed to the + loss computation as the targets. 
+ + Arguments: + segmentation_masks: an instance of SegmentationMask + proposals: an instance of BoxList + """ + masks = [] + M = discretization_size + device = proposals.bbox.device + proposals = proposals.convert("xyxy") + assert segmentation_masks.size == proposals.size, "{}, {}".format( + segmentation_masks, proposals + ) + # TODO put the proposals on the CPU, as the representation for the + # masks is not efficient GPU-wise (possibly several small tensors for + # representing a single instance mask) + proposals = proposals.bbox.to(torch.device("cpu")) + for segmentation_mask, proposal in zip(segmentation_masks, proposals): + # crop the masks, resize them to the desired resolution and + # then convert them to the tensor representation, + # instead of the list representation that was used + cropped_mask = segmentation_mask.crop(proposal) + scaled_mask = cropped_mask.resize((M, M)) + mask = scaled_mask.convert(mode="mask") + masks.append(mask) + if len(masks) == 0: + return torch.empty(0, dtype=torch.float32, device=device) + return torch.stack(masks, dim=0).to(device, dtype=torch.float32) + + +class SparseMaskLossComputation(object): + def __init__(self, cfg, discretization_size): + """ + Arguments: + discretization_size (int) + """ + self.cfg = cfg.clone() + self.discretization_size = discretization_size + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # Mask RCNN needs "labels" and "masks "fields for creating the targets + target = target.copy_with_fields(["labels", "masks"]) + # get the targets corresponding GT for each proposal + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, targets): + labels = [] + masks = [] + for targets_per_image in targets: + + labels_per_image = targets_per_image.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + segmentation_masks = targets_per_image.get_field("masks") + + masks_per_image = project_masks_on_boxes( + segmentation_masks, targets_per_image, self.discretization_size + ) + + labels.append(labels_per_image) + masks.append(masks_per_image) + + return labels, masks + + def __call__(self, mask_logits, targets): + """ + Arguments: + mask_logits (Tensor) + targets (list[BoxList]) + + Return: + mask_loss (Tensor): scalar tensor containing the loss + """ + labels, mask_targets = self.prepare_targets(targets) + labels = cat(labels, dim=0) + mask_targets = cat(mask_targets, dim=0) + + + # torch.mean (in binary_cross_entropy_with_logits) doesn't + # accept empty tensors, so handle it separately + if mask_targets.numel() == 0: + return mask_logits.sum() * 0 + + mask_loss = F.binary_cross_entropy_with_logits( + mask_logits, mask_targets + ) + if self.cfg.DEBUG: + vis = visdom.Visdom(server='http://bvision9.cs.unc.edu') + vis.images(mask_targets.unsqueeze(1), win='mask_targets') + vis.images(F.sigmoid(mask_logits).unsqueeze(1), win='mask_logits') + + return mask_loss + + +def make_sparse_mask_loss_evaluator(cfg): + loss_evaluator = SparseMaskLossComputation( + cfg, + 28 + ) + + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_feature_extractors.py 
b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_feature_extractors.py new file mode 100644 index 000000000..6b0e5353b --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_feature_extractors.py @@ -0,0 +1,92 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.layers import SparseSelect + +class SparseMaskFPNFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg): + """ + Arguments: + num_classes (int): number of output classes + input_size (int): number of channels of the input once it's flattened + representation_size (int): size of the intermediate representation + """ + super(SparseMaskFPNFeatureExtractor, self).__init__() + + #resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION + #scales = cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES + #sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO + #pooler = Pooler( + # output_size=(resolution, resolution), + # scales=scales, + # sampling_ratio=sampling_ratio, + #) + input_size = cfg.MODEL.BACKBONE.OUT_CHANNELS + #self.pooler = pooler + layers = cfg.MODEL.SPARSE_MASK_HEAD.CONV_LAYERS + + next_feature = input_size + self.blocks = [] + self.gn = [] + for layer_idx, layer_features in enumerate(layers, 1): + layer_name = "sparsemask_fcn{}".format(layer_idx) + module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1) + # Caffe2 implementation uses MSRAFill, which in fact + # corresponds to kaiming_normal_ in PyTorch + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + nn.init.constant_(module.bias, 0) + self.add_module(layer_name, module) + next_feature = layer_features + self.blocks.append(layer_name) + + if cfg.MODEL.USE_GN: + layer_name = "sparsemask_gn{}".format(layer_idx) + module = nn.GroupNorm(32, layer_features) + nn.init.constant_(module.weight, 1) + nn.init.constant_(module.bias, 0) + self.add_module(layer_name, module) + self.gn.append(layer_name) + + def forward(self, features, batch_idx, layer_idx, locations): + + # Before Sparse Selection + index = torch.arange(0, len(batch_idx), dtype=torch.long).to(batch_idx.device) + post_index = [] + post_features = [] + for layer, feature in enumerate(features): + x = feature + for idx, layer_name in enumerate(self.blocks): + x = getattr(self, layer_name)(x) + if self.gn: + x = getattr(self, self.gn[idx])(x) + + x = F.relu(x) + + select = layer_idx == layer + if select.sum().item(): + select_features = SparseSelect(3)(x, batch_idx[select].int(), + locations[select].int()) + post_features.append(select_features) + post_index.append(index[select]) + + post_index = torch.cat(post_index) + post_features = torch.cat(post_features, 0) + post_features = post_features[post_index] + return post_features + + +_SPARSE_MASK_FEATURE_EXTRACTORS = { + "SparseMaskFPNFeatureExtractor": SparseMaskFPNFeatureExtractor, +} + + +def make_sparse_mask_feature_extractor(cfg): + func = _SPARSE_MASK_FEATURE_EXTRACTORS[cfg.MODEL.SPARSE_MASK_HEAD.FEATURE_EXTRACTOR] + return func(cfg) diff --git a/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_head.py b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_head.py new file mode 100644 index 000000000..4246303ac --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_head.py @@ -0,0 +1,98 @@ +import torch +from torch import nn +import numpy 
as np + +from maskrcnn_benchmark.structures.bounding_box import BoxList + +from .mask_feature_extractors import make_sparse_mask_feature_extractor +from .mask_predictors import make_sparse_mask_predictor +#from .inference import make_roi_mask_post_processor +from .loss import make_sparse_mask_loss_evaluator +from .mask_matcher import generate_best_matching + + +class SparseMaskHead(torch.nn.Module): + def __init__(self, cfg): + super(SparseMaskHead, self).__init__() + self.cfg = cfg.clone() + self.feature_extractor = make_sparse_mask_feature_extractor(cfg) + self.num_anchors = len(cfg.RETINANET.ASPECT_RATIOS) \ + * cfg.RETINANET.SCALES_PER_OCTAVE + self.predictor = make_sparse_mask_predictor(cfg) + #self.post_processor = make_roi_mask_post_processor(cfg) + self.loss_evaluator = make_sparse_mask_loss_evaluator(cfg) + self.discretize = 28 + + def change_to_anchor_masks(self, anchors, scale=1.5): + for i in range(len(anchors)): + temp = anchors[i].bbox + hw = temp[:, 2:] - temp[:, 0:2] + xy = (temp[:, 2:] + temp[:, :2]) * 0.5 + hw *= scale + temp[:, 0:2] = xy - hw*0.5 + temp[:, 2:] = xy + hw*0.5 + anchors[i].bbox = temp + return anchors + + def forward(self, features, anchors, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + anchors (list[BoxList]): anchor boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the original proposals + are returned. During testing, the predicted boxlists are returned + with the `mask` field set + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. + """ + device = features[0].device + if self.training: + with torch.no_grad(): + sparse_codes, sparse_anchors = generate_best_matching( + anchors, targets, self.training) + sparse_batch = [torch.LongTensor(s.size()).fill_(idx) for idx, s in enumerate(sparse_codes)] + sparse_batch = torch.cat(sparse_batch).to(device) + sparse_codes = torch.cat(sparse_codes) + layers = torch.cat( + [torch.empty(f.size(2)*f.size(3), + dtype=torch.long).fill_(l) for + l, f in enumerate(features)]) + sparse_layers = layers[sparse_codes / self.num_anchors] + layer_base = torch.LongTensor( + [0] + [f.size(2)*f.size(3) for f in + features]).cumsum(0).to(device) + sparse_off = (sparse_codes / self.num_anchors) - layer_base[sparse_layers] + sparse_anchor_idx = sparse_codes % self.num_anchors + else: + sparse_batch = torch.cat([a.get_field('sparse_batch') for a \ + in anchors]) + sparse_layers = torch.cat([a.get_field('sparse_layers') for a \ + in anchors]) + sparse_off = torch.cat([a.get_field('sparse_off') for a \ + in anchors]) + sparse_anchor_idx = torch.cat( + [ a.get_field('sparse_anchor_idx') for a in anchors]) + + sparse_anchors = [BoxList(a.get_field('sparse_anchors'), a.size, mode="xyxy") for a in anchors] + + x = self.feature_extractor( + features, sparse_batch, sparse_layers, sparse_off + ) + + logits = self.predictor(x, sparse_anchor_idx) + logits = logits.view(-1, self.discretize, self.discretize) + sparse_anchors = self.change_to_anchor_masks(sparse_anchors) + + + if not self.training: + return features, logits, {} + + loss_mask = self.loss_evaluator(logits, sparse_anchors) + return features, sparse_anchors, dict(loss_mask=loss_mask) + +def build_sparse_mask_head(cfg): + return SparseMaskHead(cfg) diff --git a/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_matcher.py 
b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_matcher.py new file mode 100644 index 000000000..2e5c0f54d --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_matcher.py @@ -0,0 +1,29 @@ +import torch +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist + + +# Generate the best matching +def generate_best_matching(anchors, targets, training=True): + results = [] + sparse_anchors = [] + for anchors_per_image, targets_per_image in zip(anchors, targets): + anchors_per_image = cat_boxlist(anchors_per_image) + match_quality_matrix = boxlist_iou(targets_per_image, anchors_per_image) + iou, matches_idx = match_quality_matrix.max(dim=1) + select = iou >= 0.5 + matches_idx = matches_idx[select] + + results.append(matches_idx) + sparse_anchors_per_image = anchors_per_image[matches_idx] + if training : + sparse_anchors_per_image.add_field( + 'masks', + targets_per_image.get_field('masks')[select]) + sparse_anchors_per_image.add_field( + 'labels', + targets_per_image.get_field('labels')[select]) + + sparse_anchors.append(sparse_anchors_per_image) + + return results, sparse_anchors diff --git a/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_predictors.py new file mode 100644 index 000000000..507735bb1 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/sparsemask_head/mask_predictors.py @@ -0,0 +1,53 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.layers import ConvTranspose2d + + +class SparseMaskFCPredictor(nn.Module): + def __init__(self, cfg): + super(SparseMaskFCPredictor, self).__init__() + self.num_anchors = 9 + num_inputs = 256 + + self.mask_fc_logits = [] + for i in range(self.num_anchors): + module = Conv2d(num_inputs, 28*28, 3, 1, 0) + layer_name = "sparsemask_pred{}".format(i) + self.add_module(layer_name, module) + self.mask_fc_logits.append(layer_name) + + for name, param in self.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0) + elif "weight" in name: + nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") + + def forward(self, x, anchor_idx): + _, depth, height, width= x.shape + index = torch.arange(0, len(anchor_idx), + dtype=torch.long).to(x[0].device) + results = [] + post_index = [] + for a, layer_name in enumerate(self.mask_fc_logits): + select = a == anchor_idx + if select.sum().item(): + results.append( + getattr(self, layer_name)(x[select].view(-1, depth, height, width))) + post_index.append(index[select]) + + post_index = torch.cat(post_index) + results = torch.cat(results, 0) + results = results[post_index] + return results + + +_SPARSE_MASK_PREDICTOR = {"SparseMaskFCPredictor": SparseMaskFCPredictor} + +def make_sparse_mask_predictor(cfg): + #func = _ROI_MASK_PREDICTOR[cfg.MODEL.ROI_MASK_HEAD.PREDICTOR] + func = _SPARSE_MASK_PREDICTOR["SparseMaskFCPredictor"] + return func(cfg) diff --git a/maskrcnn_benchmark/modeling/rpn/anchor_generator.py b/maskrcnn_benchmark/modeling/rpn/anchor_generator.py index c3c32a905..db89911a7 100644 --- a/maskrcnn_benchmark/modeling/rpn/anchor_generator.py +++ b/maskrcnn_benchmark/modeling/rpn/anchor_generator.py @@ -54,8 +54,13 @@ def __init__( else: if len(anchor_strides) != len(sizes): raise RuntimeError("FPN should have 
#anchor_strides == #sizes") + cell_anchors = [ - generate_anchors(anchor_stride, (size,), aspect_ratios).float() + generate_anchors( + anchor_stride, + size if type(size) is tuple else (size,), + aspect_ratios + ).float() for anchor_stride, size in zip(anchor_strides, sizes) ] self.strides = anchor_strides @@ -139,6 +144,28 @@ def make_anchor_generator(config): return anchor_generator +def make_anchor_generator_retinanet(config): + anchor_sizes = config.RETINANET.ANCHOR_SIZES + aspect_ratios = config.RETINANET.ASPECT_RATIOS + anchor_strides = config.RETINANET.ANCHOR_STRIDES + straddle_thresh = config.RETINANET.STRADDLE_THRESH + octave = config.RETINANET.OCTAVE + scales_per_octave = config.RETINANET.SCALES_PER_OCTAVE + + assert len(anchor_strides) == len(anchor_sizes), "Only support FPN now" + new_anchor_sizes = [] + for size in anchor_sizes: + per_layer_anchor_sizes = [] + for scale_per_octave in range(scales_per_octave): + octave_scale = octave ** (scale_per_octave / float(scales_per_octave)) + per_layer_anchor_sizes.append(octave_scale * size) + new_anchor_sizes.append(tuple(per_layer_anchor_sizes)) + + anchor_generator = AnchorGenerator( + tuple(new_anchor_sizes), aspect_ratios, anchor_strides, straddle_thresh + ) + return anchor_generator + # Copyright (c) 2017-present, Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet.py b/maskrcnn_benchmark/modeling/rpn/retinanet.py new file mode 100644 index 000000000..c70e2d355 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet.py @@ -0,0 +1,189 @@ +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from .retinanet_loss import make_retinanet_loss_evaluator +from .anchor_generator import make_anchor_generator_retinanet +from .retinanet_infer import make_retinanet_postprocessor +from .retinanet_detail_infer import make_retinanet_detail_postprocessor + + +class RetinaNetHead(torch.nn.Module): + """ + Adds a RetinNet head with classification and regression heads + """ + + def __init__(self, cfg): + """ + Arguments: + in_channels (int): number of channels of the input feature + num_anchors (int): number of anchors to be predicted + """ + super(RetinaNetHead, self).__init__() + # TODO: Implement the sigmoid version first. 
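For reference, a minimal sketch of what `make_anchor_generator_retinanet` above does with the per-octave scales; the concrete values below (`octave=2`, `scales_per_octave=3`, base sizes 32..512) are assumptions matching common RetinaNet defaults, not values read from this diff.

```python
# Illustration only: expand each base anchor size into one size per octave scale,
# mirroring the loop in make_anchor_generator_retinanet.
def expand_anchor_sizes(anchor_sizes, octave=2.0, scales_per_octave=3):
    new_anchor_sizes = []
    for size in anchor_sizes:
        per_layer = tuple(
            size * octave ** (i / float(scales_per_octave))
            for i in range(scales_per_octave)
        )
        new_anchor_sizes.append(per_layer)
    return tuple(new_anchor_sizes)

# expand_anchor_sizes((32, 64, 128, 256, 512))[0] -> (32.0, 40.31..., 50.79...)
```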
+ num_classes = cfg.RETINANET.NUM_CLASSES - 1 + in_channels = cfg.MODEL.BACKBONE.OUT_CHANNELS + num_anchors = len(cfg.RETINANET.ASPECT_RATIOS) \ + * cfg.RETINANET.SCALES_PER_OCTAVE + + cls_tower = [] + bbox_tower = [] + for i in range(cfg.RETINANET.NUM_CONVS): + cls_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + if cfg.MODEL.USE_GN: + cls_tower.append(nn.GroupNorm(32, in_channels)) + + cls_tower.append(nn.ReLU()) + bbox_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + if cfg.MODEL.USE_GN: + bbox_tower.append(nn.GroupNorm(32, in_channels)) + + bbox_tower.append(nn.ReLU()) + + self.add_module('cls_tower', nn.Sequential(*cls_tower)) + self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) + self.cls_logits = nn.Conv2d( + in_channels, num_anchors * num_classes, kernel_size=3, stride=1, + padding=1 + ) + self.bbox_pred = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=3, stride=1, + padding=1 + ) + + # Initialization + for modules in [self.cls_tower, self.bbox_tower, self.cls_logits, + self.bbox_pred]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + if isinstance(l, nn.GroupNorm): + torch.nn.init.constant_(l.weight, 1.0) + torch.nn.init.constant_(l.bias, 0) + + # retinanet_bias_init + prior_prob = cfg.RETINANET.PRIOR_PROB + bias_value = -np.log((1 - prior_prob) / prior_prob) + torch.nn.init.constant_(self.cls_logits.bias, bias_value) + + def forward(self, x): + logits = [] + bbox_reg = [] + for feature in x: + logits.append(self.cls_logits(self.cls_tower(feature))) + bbox_reg.append(self.bbox_pred(self.bbox_tower(feature))) + return logits, bbox_reg + + +class RetinaNetModule(torch.nn.Module): + """ + Module for RetinaNet computation. Takes feature maps from the backbone and RPN + proposals and losses. + """ + + def __init__(self, cfg): + super(RetinaNetModule, self).__init__() + + self.cfg = cfg.clone() + + anchor_generator = make_anchor_generator_retinanet(cfg) + head = RetinaNetHead(cfg) + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + + if self.cfg.MODEL.SPARSE_MASK_ON: + box_selector_test = make_retinanet_detail_postprocessor( + cfg, 100, box_coder) + else: + box_selector_test = make_retinanet_postprocessor( + cfg, 100, box_coder) + box_selector_train = None + if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.SPARSE_MASK_ON: + box_selector_train = make_retinanet_postprocessor( + cfg, 100, box_coder) + + loss_evaluator = make_retinanet_loss_evaluator(cfg, box_coder) + + self.anchor_generator = anchor_generator + self.head = head + self.box_selector_test = box_selector_test + self.box_selector_train = box_selector_train + self.loss_evaluator = loss_evaluator + + def forward(self, images, features, targets=None): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + correspond to different feature levels + targets (list[BoxList): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. 
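A short note on the `retinanet_bias_init` block above: setting the classification bias to `-log((1 - p) / p)` makes the initial sigmoid scores roughly equal to the prior probability, so nearly every anchor starts out as background and the focal loss is not dominated by easy negatives early in training. A minimal sketch, assuming `PRIOR_PROB = 0.01`:

```python
import numpy as np

prior_prob = 0.01                                    # assumed cfg.RETINANET.PRIOR_PROB
bias_value = -np.log((1 - prior_prob) / prior_prob)  # ~ -4.595
initial_score = 1.0 / (1.0 + np.exp(-bias_value))    # sigmoid(bias) ~ 0.01
print(bias_value, initial_score)
```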
+ """ + box_cls, box_regression = self.head(features) + anchors = self.anchor_generator(images, features) + + if self.training: + return self._forward_train(anchors, box_cls, box_regression, targets) + else: + return self._forward_test(anchors, box_cls, box_regression) + + def _forward_train(self, anchors, box_cls, box_regression, targets): + + loss_box_cls, loss_box_reg = self.loss_evaluator( + anchors, box_cls, box_regression, targets + ) + losses = { + "loss_retina_cls": loss_box_cls, + "loss_retina_reg": loss_box_reg, + } + detections = None + if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.SPARSE_MASK_ON: + with torch.no_grad(): + detections = self.box_selector_train( + anchors, box_cls, box_regression + ) + + return (anchors, detections), losses + + def _forward_test(self, anchors, box_cls, box_regression): + boxes = self.box_selector_test(anchors, box_cls, box_regression) + ''' + if self.cfg.MODEL.RPN_ONLY: + # For end-to-end models, the RPN proposals are an intermediate state + # and don't bother to sort them in decreasing score order. For RPN-only + # models, the proposals are the final output and we return them in + # high-to-low confidence order. + inds = [ + box.get_field("objectness").sort(descending=True)[1] for box in boxes + ] + boxes = [box[ind] for box, ind in zip(boxes, inds)] + ''' + return (anchors, boxes), {} + + +def build_retinanet(cfg): + return RetinaNetModule(cfg) diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet_detail_infer.py b/maskrcnn_benchmark/modeling/rpn/retinanet_detail_infer.py new file mode 100644 index 000000000..d48e28b29 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet_detail_infer.py @@ -0,0 +1,225 @@ +import torch + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + +from ..utils import cat + + +class RetinaNetDetailPostProcessor(torch.nn.Module): + """ + Performs post-processing on the outputs of the RetinaNet boxes. + This is only used in the testing. 
+ """ + def __init__( + self, + pre_nms_thresh, + pre_nms_top_n, + nms_thresh, + fpn_post_nms_top_n, + min_size, + box_coder=None, + ): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + box_coder (BoxCoder) + """ + super(RetinaNetDetailPostProcessor, self).__init__() + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + + + def forward_for_single_feature_map(self, anchors, box_cls, box_regression): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + device = box_cls.device + N, _ , H, W = box_cls.shape + A = int(box_regression.size(1) / 4) + C = int(box_cls.size(1) / A) + + # put in the same format as anchors + box_cls = box_cls.view(N, -1, C, H, W).permute(0, 3, 4, 1, 2) + box_cls = box_cls.reshape(N, -1, C) + box_cls = box_cls.sigmoid() + + box_regression = box_regression.view(N, -1, 4, H, W) + box_regression = box_regression.permute(0, 3, 4, 1, 2) + box_regression = box_regression.reshape(N, -1, 4) + + num_anchors = A * H * W + + results = [[] for _ in range(N)] + pre_nms_thresh = self.pre_nms_thresh + candidate_inds = box_cls > self.pre_nms_thresh + if candidate_inds.sum().item() == 0: + return results + + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + + for batch_idx, (per_box_cls, per_box_regression, per_pre_nms_top_n, \ + per_candidate_inds, per_anchors) in enumerate(zip( + box_cls, + box_regression, + pre_nms_top_n, + candidate_inds, + anchors)): + + # Sort and select TopN + per_box_cls = per_box_cls[per_candidate_inds] + per_candidate_nonzeros = per_candidate_inds.nonzero() + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + per_class += 1 + if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + per_box_loc = per_box_loc[top_k_indices] + per_class = per_class[top_k_indices] + + detections = self.box_coder.decode( + per_box_regression[per_box_loc, :].view(-1, 4), + per_anchors.bbox[per_box_loc, :].view(-1, 4) + ) + + boxlist = BoxList(detections, per_anchors.size, mode="xyxy") + boxlist.add_field("labels", per_class) + boxlist.add_field("scores", per_box_cls) + boxlist.add_field("sparse_off", per_box_loc / 9) + boxlist.add_field("sparse_anchor_idx", per_box_loc % 9) + boxlist.add_field("sparse_anchors", + per_anchors.bbox[per_box_loc, :].view(-1, 4)) + boxlist.add_field("sparse_batch", + per_box_loc.clone().fill_(batch_idx)) + + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + results[batch_idx] = boxlist + + return results + + def forward(self, anchors, box_cls, box_regression, targets=None): + """ + Arguments: + anchors: list[list[BoxList]] + box_cls: list[tensor] + box_regression: list[tensor] + + Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + num_levels = len(box_cls) + anchors = list(zip(*anchors)) + for a, o, b in zip(anchors, box_cls, box_regression): + sampled_boxes.append(self.forward_for_single_feature_map(a, o, b)) + + for layer in 
range(len(sampled_boxes)): + for sampled_boxes_per_image in sampled_boxes[layer]: + sampled_boxes_per_image.add_field( + 'sparse_layers', + sampled_boxes_per_image.get_field('labels').clone().fill_(layer) + ) + + + boxlists = list(zip(*sampled_boxes)) + boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + + boxlists = self.select_over_all_levels(boxlists) + + return boxlists + + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + results = [] + for i in range(num_images): + if len(boxlists[i]) == 0: + results.append([]) + continue + + scores = boxlists[i].get_field("scores") + labels = boxlists[i].get_field("labels") + boxes = boxlists[i].bbox + boxlist = boxlists[i] + result = [] + # skip the background + for j in range(1, 81): + inds = (labels == j).nonzero().view(-1) + if len(inds) == 0: + continue + + boxlist_for_class = boxlist[inds] + #scores_j = scores[inds] + #boxes_j = boxes[inds, :].view(-1, 4) + #boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + #boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms_thresh, + score_field="scores" + ) + num_labels = len(boxlist_for_class) + #boxlist_for_class.add_field( + # "labels", torch.full((num_labels,), j, + # dtype=torch.int64, + # device=scores.device) + #) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.fpn_post_nms_top_n > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), + number_of_detections - self.fpn_post_nms_top_n + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + results.append(result) + + return results + + +def make_retinanet_detail_postprocessor( + config, fpn_post_nms_top_n, rpn_box_coder): + + pre_nms_thresh = 0.05 + pre_nms_top_n = 1000 + nms_thresh = 0.4 + fpn_post_nms_top_n = fpn_post_nms_top_n + min_size = 0 + + # nms_thresh = config.MODEL.RPN.NMS_THRESH + # min_size = config.MODEL.RPN.MIN_SIZE + box_selector = RetinaNetDetailPostProcessor( + pre_nms_thresh=pre_nms_thresh, + pre_nms_top_n=pre_nms_top_n, + nms_thresh=nms_thresh, + fpn_post_nms_top_n=fpn_post_nms_top_n, + box_coder=rpn_box_coder, + min_size=min_size + ) + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet_infer.py b/maskrcnn_benchmark/modeling/rpn/retinanet_infer.py new file mode 100644 index 000000000..a75667403 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet_infer.py @@ -0,0 +1,226 @@ +import torch + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + +from ..utils import cat + + +class RetinaNetPostProcessor(torch.nn.Module): + """ + Performs post-processing on the outputs of the RetinaNet boxes. + This is only used in the testing. 
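`forward_for_single_feature_map` reorders the raw head outputs before decoding; here is a minimal sketch of that reshape on dummy tensors, with the sizes chosen only for illustration:

```python
import torch

N, A, C, H, W = 2, 9, 80, 16, 16          # assumed batch, anchors per location, classes, feature size
box_cls = torch.randn(N, A * C, H, W)     # raw output of the classification head

# (N, A*C, H, W) -> (N, H*W*A, C): scores now line up with the flattened anchor order
box_cls = box_cls.view(N, A, C, H, W).permute(0, 3, 4, 1, 2)
box_cls = box_cls.reshape(N, -1, C).sigmoid()
print(box_cls.shape)                      # torch.Size([2, 2304, 80])
```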
+ """ + def __init__( + self, + pre_nms_thresh, + pre_nms_top_n, + nms_thresh, + fpn_post_nms_top_n, + min_size, + box_coder=None, + ): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + box_coder (BoxCoder) + """ + super(RetinaNetPostProcessor, self).__init__() + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + + + def forward_for_single_feature_map(self, anchors, box_cls, box_regression, + pre_nms_thresh): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + device = box_cls.device + N, _ , H, W = box_cls.shape + A = int(box_regression.size(1) / 4) + C = int(box_cls.size(1) / A) + + # put in the same format as anchors + box_cls = box_cls.view(N, -1, C, H, W).permute(0, 3, 4, 1, 2) + box_cls = box_cls.reshape(N, -1, C) + box_cls = box_cls.sigmoid() + + box_regression = box_regression.view(N, -1, 4, H, W) + box_regression = box_regression.permute(0, 3, 4, 1, 2) + box_regression = box_regression.reshape(N, -1, 4) + + num_anchors = A * H * W + + results = [[] for _ in range(N)] + candidate_inds = box_cls > pre_nms_thresh + if candidate_inds.sum().item() == 0: + empty_boxlists = [] + for a in anchors: + empty_boxlist = BoxList(torch.Tensor(0, 4).to(device), a.size) + empty_boxlist.add_field( + "labels", torch.LongTensor([]).to(device)) + empty_boxlist.add_field( + "scores", torch.Tensor([]).to(device)) + empty_boxlists.append(empty_boxlist) + return empty_boxlists + + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + + for batch_idx, (per_box_cls, per_box_regression, per_pre_nms_top_n, \ + per_candidate_inds, per_anchors) in enumerate(zip( + box_cls, + box_regression, + pre_nms_top_n, + candidate_inds, + anchors)): + + # Sort and select TopN + per_box_cls = per_box_cls[per_candidate_inds] + per_candidate_nonzeros = per_candidate_inds.nonzero() + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + per_class += 1 + if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + per_box_loc = per_box_loc[top_k_indices] + per_class = per_class[top_k_indices] + + detections = self.box_coder.decode( + per_box_regression[per_box_loc, :].view(-1, 4), + per_anchors.bbox[per_box_loc, :].view(-1, 4) + ) + + boxlist = BoxList(detections, per_anchors.size, mode="xyxy") + boxlist.add_field("labels", per_class) + boxlist.add_field("scores", per_box_cls) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + results[batch_idx] = boxlist + + return results + + def forward(self, anchors, box_cls, box_regression, targets=None): + """ + Arguments: + anchors: list[list[BoxList]] + box_cls: list[tensor] + box_regression: list[tensor] + + Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + num_levels = len(box_cls) + anchors = list(zip(*anchors)) + for layer, (a, o, b) in enumerate( + zip(anchors, box_cls, box_regression)): + sampled_boxes.append( + self.forward_for_single_feature_map( + a, o, b, + 
self.pre_nms_thresh + ) + ) + + boxlists = list(zip(*sampled_boxes)) + boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + + boxlists = self.select_over_all_levels(boxlists) + + return boxlists + + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + results = [] + for i in range(num_images): + scores = boxlists[i].get_field("scores") + labels = boxlists[i].get_field("labels") + boxes = boxlists[i].bbox + boxlist = boxlists[i] + result = [] + # skip the background + for j in range(1, 81): + inds = (labels == j).nonzero().view(-1) + if len(inds) == 0: + continue + + scores_j = scores[inds] + boxes_j = boxes[inds, :].view(-1, 4) + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms_thresh, + score_field="scores" + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, + dtype=torch.int64, + device=scores.device) + ) + result.append(boxlist_for_class) + + if len(result) > 0: + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.fpn_post_nms_top_n > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), + number_of_detections - self.fpn_post_nms_top_n + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + results.append(result) + else: + empty_boxlist = BoxList(torch.zeros(1, 4).to('cuda'), boxlist.size) + empty_boxlist.add_field( + "labels", torch.LongTensor([1]).to('cuda')) + empty_boxlist.add_field( + "scores", torch.Tensor([0.01]).to('cuda')) + results.append(empty_boxlist) + return results + + +def make_retinanet_postprocessor( + config, fpn_post_nms_top_n, rpn_box_coder): + + pre_nms_thresh = 0.05 + pre_nms_top_n = config.RETINANET.PRE_NMS_TOP_N + nms_thresh = 0.4 + fpn_post_nms_top_n = fpn_post_nms_top_n + min_size = 0 + + # nms_thresh = config.MODEL.RPN.NMS_THRESH + # min_size = config.MODEL.RPN.MIN_SIZE + box_selector = RetinaNetPostProcessor( + pre_nms_thresh=pre_nms_thresh, + pre_nms_top_n=pre_nms_top_n, + nms_thresh=nms_thresh, + fpn_post_nms_top_n=fpn_post_nms_top_n, + box_coder=rpn_box_coder, + min_size=min_size + ) + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet_loss.py b/maskrcnn_benchmark/modeling/rpn/retinanet_loss.py new file mode 100644 index 000000000..961255209 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet_loss.py @@ -0,0 +1,171 @@ +""" +This file contains specific functions for computing losses on the RetinaNet +file +""" + +import torch +from torch.nn import functional as F + +from ..utils import cat + +#from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.layers import SmoothL1Loss +from maskrcnn_benchmark.layers import AdjustSmoothL1Loss +from maskrcnn_benchmark.layers import SigmoidFocalLoss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist + + +class RetinaNetLossComputation(object): + """ + This class computes the RetinaNet loss. 
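`select_over_all_levels` caps the number of detections per image with `torch.kthvalue` instead of a full sort; a small sketch of that trick on toy scores:

```python
import torch

scores = torch.tensor([0.9, 0.2, 0.75, 0.4, 0.6, 0.1, 0.8])   # toy per-detection scores
fpn_post_nms_top_n = 3
num_dets = scores.numel()

if num_dets > fpn_post_nms_top_n > 0:
    # k-th smallest score such that exactly fpn_post_nms_top_n detections survive
    image_thresh, _ = torch.kthvalue(scores.cpu(), num_dets - fpn_post_nms_top_n + 1)
    keep = torch.nonzero(scores >= image_thresh.item()).squeeze(1)
    print(scores[keep])                    # tensor([0.9000, 0.7500, 0.8000])
```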
+ """ + + def __init__(self, cfg, proposal_matcher, box_coder): + """ + Arguments: + proposal_matcher (Matcher) + box_coder (BoxCoder) + """ + # self.target_preparator = target_preparator + self.proposal_matcher = proposal_matcher + self.box_coder = box_coder + self.num_classes = cfg.RETINANET.NUM_CLASSES -1 + self.box_cls_loss_func = SigmoidFocalLoss( + self.num_classes, + cfg.RETINANET.LOSS_GAMMA, + cfg.RETINANET.LOSS_ALPHA + ) + if cfg.RETINANET.SELFADJUST_SMOOTH_L1: + self.regression_loss = AdjustSmoothL1Loss( + 4, + beta=cfg.RETINANET.BBOX_REG_BETA + ) + else: + self.regression_loss = SmoothL1Loss( + beta=cfg.RETINANET.BBOX_REG_BETA + ) + + def match_targets_to_anchors(self, anchor, target): + match_quality_matrix = boxlist_iou(target, anchor) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # RPN doesn't need any fields from target + # for creating the labels, so clear them all + target = target.copy_with_fields(['labels']) + # get the targets corresponding GT for each anchor + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, anchors, targets): + labels = [] + regression_targets = [] + for anchors_per_image, targets_per_image in zip(anchors, targets): + matched_targets = self.match_targets_to_anchors( + anchors_per_image, targets_per_image + ) + + matched_idxs = matched_targets.get_field("matched_idxs") + labels_per_image = matched_targets.get_field("labels").clone() + + # Background (negative examples) + bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[bg_indices] = 0 + + # discard indices that are between thresholds + # -1 will be ignored in SigmoidFocalLoss + inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS + labels_per_image[inds_to_discard] = -1 + + labels_per_image = labels_per_image.to(dtype=torch.float32) + # compute regression targets + regression_targets_per_image = self.box_coder.encode( + matched_targets.bbox, anchors_per_image.bbox + ) + + labels.append(labels_per_image) + regression_targets.append(regression_targets_per_image) + + return labels, regression_targets + + def __call__(self, anchors, box_cls, box_regression, targets): + """ + Arguments: + anchors (list[BoxList]) + objectness (list[Tensor]) + box_regression (list[Tensor]) + targets (list[BoxList]) + + Returns: + objectness_loss (Tensor) + box_loss (Tensor + """ + anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] + labels, regression_targets = self.prepare_targets(anchors, targets) + + # sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + # sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1) + # sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1) + + # sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0) + num_layers = len(box_cls) + box_cls_flattened = [] + box_regression_flattened = [] + # for each feature level, permute the outputs to make them be in the + # same format as the labels. 
Note that the labels are computed for + # all feature levels concatenated, so we keep the same representation + # for the objectness and the box_regression + for box_cls_per_level, box_regression_per_level in zip( + box_cls, box_regression + ): + N, A, H, W = box_cls_per_level.shape + C = self.num_classes + box_cls_per_level = box_cls_per_level.view(N, -1, C, H, W) + box_cls_per_level = box_cls_per_level.permute(0, 3, 4, 1, 2) + box_cls_per_level = box_cls_per_level.reshape(N, -1, C) + box_regression_per_level = box_regression_per_level.view(N, -1, 4, H, W) + box_regression_per_level = box_regression_per_level.permute(0, 3, 4, 1, 2) + box_regression_per_level = box_regression_per_level.reshape(N, -1, 4) + box_cls_flattened.append(box_cls_per_level) + box_regression_flattened.append(box_regression_per_level) + # concatenate on the first dimension (representing the feature levels), to + # take into account the way the labels were generated (with all feature maps + # being concatenated as well) + box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) + box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) + + labels = torch.cat(labels, dim=0) + regression_targets = torch.cat(regression_targets, dim=0) + pos_inds = labels > 0 + + retinanet_regression_loss = self.regression_loss( + box_regression[pos_inds], + regression_targets[pos_inds], + size_average=False, + ) / (pos_inds.sum() * 4) + labels = labels.int() + + retinanet_cls_loss =self.box_cls_loss_func( + box_cls, + labels + ) / ((labels > 0).sum() + N) + + return retinanet_cls_loss, retinanet_regression_loss + + +def make_retinanet_loss_evaluator(cfg, box_coder): + matcher = Matcher( + cfg.MODEL.RPN.FG_IOU_THRESHOLD, + cfg.MODEL.RPN.BG_IOU_THRESHOLD, + allow_low_quality_matches=cfg.RETINANET.LOW_QUALITY_MATCHES, + low_quality_threshold=cfg.RETINANET.LOW_QUALITY_THRESHOLD + ) + + loss_evaluator = RetinaNetLossComputation( + cfg, matcher, box_coder + ) + return loss_evaluator diff --git a/maskrcnn_benchmark/structures/bounding_box.py b/maskrcnn_benchmark/structures/bounding_box.py index bcdd6d0b2..0da142fd2 100644 --- a/maskrcnn_benchmark/structures/bounding_box.py +++ b/maskrcnn_benchmark/structures/bounding_box.py @@ -10,7 +10,7 @@ class BoxList(object): """ This class represents a set of bounding boxes. The bounding boxes are represented as a Nx4 Tensor. - In order ot uniquely determine the bounding boxes with respect + In order to uniquely determine the bounding boxes with respect to an image, we also store the corresponding image dimensions. They can contain extra information that is specific to each bounding box, such as labels. 
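A minimal sketch of the normalisation used for the two RetinaNet losses above: the regression loss is summed over positive anchors and divided by `4 * num_pos` (one term per box coordinate), while the classification loss is divided by `num_pos + N` so it stays finite even when a batch contains no positive anchors (the values below are toy inputs):

```python
import torch

N = 2                                              # images in the batch
labels = torch.tensor([0., 3., -1., 7., 0., 5.])   # toy per-anchor labels (-1 = ignored)
pos_inds = labels > 0

reg_norm = pos_inds.sum() * 4                      # divisor for the smooth-L1 box loss
cls_norm = (labels > 0).sum() + N                  # divisor for the sigmoid focal loss
print(reg_norm.item(), cls_norm.item())            # 12 5
```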
@@ -224,9 +224,16 @@ def clip_to_image(self, remove_empty=True): return self def area(self): - TO_REMOVE = 1 - box = self.bbox - area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) + if self.mode == 'xyxy': + TO_REMOVE = 1 + box = self.bbox + area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) + elif self.mode == 'xywh': + box = self.bbox + area = box[:, 2] * box[:, 3] + else: + raise RuntimeError("Should not be here") + return area def copy_with_fields(self, fields): diff --git a/run_test_R-50.sh b/run_test_R-50.sh new file mode 100755 index 000000000..1604d80f4 --- /dev/null +++ b/run_test_R-50.sh @@ -0,0 +1,12 @@ +#!/bin/bash +MODEL=$1 + +for ITER in 0089999 0080001 0070001 0060001 0050001 0040001 0030001 0020001 0010001 +do + python tools/test_net.py --config-file ./configs/retina/${MODEL}.yaml MODEL.WEIGHT ./models/${MODEL}/model_${ITER}.pth OUTPUT_DIR ./models/${MODEL}/${ITER} TEST.IMS_PER_BATCH 4 +done + +#for ITER in 89999 +#do +# python tools/test_net.py --config-file ./configs/retina/retinanet_R-50-FPN_1x.yaml MODEL.WEIGHT ./models/retinanet_R-50-FPN_1x_1101/model_00${ITER}.pth OUTPUT_DIR ./models/retinanet_R-50-FPN_1x_1101/${ITER} TEST.IMS_PER_BATCH 1 +#done diff --git a/run_test_R-50_dist.sh b/run_test_R-50_dist.sh new file mode 100755 index 000000000..0df6ad58e --- /dev/null +++ b/run_test_R-50_dist.sh @@ -0,0 +1,12 @@ +#!/bin/bash +MODEL=$1 + +for ITER in 0089999 0080001 0070001 0060001 0050001 0040001 0030001 0020001 0010001 +do + python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/test_net.py --config-file ./configs/retina/${MODEL}.yaml MODEL.WEIGHT ./models/${MODEL}/model_${ITER}.pth OUTPUT_DIR ./models/${MODEL}/${ITER} TEST.IMS_PER_BATCH 4 +done + +#for ITER in 89999 +#do +# python tools/test_net.py --config-file ./configs/retina/retinanet_R-50-FPN_1x.yaml MODEL.WEIGHT ./models/retinanet_R-50-FPN_1x_1101/model_00${ITER}.pth OUTPUT_DIR ./models/retinanet_R-50-FPN_1x_1101/${ITER} TEST.IMS_PER_BATCH 1 +#done diff --git a/tools/parse_log.py b/tools/parse_log.py new file mode 100644 index 000000000..a2f5db43c --- /dev/null +++ b/tools/parse_log.py @@ -0,0 +1,72 @@ +import re +import argparse +import numpy as np + +def parse(log_path): + with open(log_path) as f: + text = f.read() + + float_pattern = r'\d+\.\d+' + mean_pattern = r'AdjustSmoothL1\(mean\): ({}), ({}), ({}), ({})'.format( + float_pattern, float_pattern, float_pattern, float_pattern) + var_pattern = r'AdjustSmoothL1\(var\): ({}), ({}), ({}), ({})'.format( + float_pattern, float_pattern, float_pattern, float_pattern) + pattern = mean_pattern + r'.*\n.*' + var_pattern + r'.*\n.*' + \ + r'iter: (\d+) ' + \ + r'loss: ({}) \(({})\) '.format(float_pattern, float_pattern) + \ + r'loss_retina_cls: ({}) \(({})\) '.format(float_pattern, float_pattern) + \ + r'loss_retina_reg: ({}) \(({})\) '.format(float_pattern, float_pattern) + \ + r'loss_mask: ({}) \(({})\) '.format(float_pattern, float_pattern) + \ + r'time: ({}) \(({})\) '.format(float_pattern, float_pattern) + \ + r'data: ({}) \(({})\) '.format(float_pattern, float_pattern) + \ + r'lr: ({}) '.format(float_pattern) + \ + r'max mem: (\d+)' + reg_exp = re.compile(pattern) + + headers = ['smooth_l1_mean', 'smooth_l1_var', 'iter', 'loss', + 'loss_retina_cls', 'loss_retina_reg', 'loss_mask', + 'time', 'data', 'lr', 'max_mem'] + + iterations = list() + means = list() + variations = list() + running_losses = list() + for args in reg_exp.findall(text): + mean = [float(v) for v in args[0:4]] + var = [float(v) 
for v in args[5:8]] + iteration = int(args[8]) + point_loss = float(args[9]) + running_loss = float(args[10]) + point_loss_retina_cls = float(args[11]) + running_loss_retina_cls = float(args[12]) + point_loss_retina_reg = float(args[13]) + running_loss_retina_reg = float(args[14]) + point_loss_mask = float(args[15]) + running_loss_mask = float(args[16]) + point_time = float(args[17]) + running_time = float(args[18]) + point_data = float(args[19]) + running_data = float(args[20]) + lr = float(args[21]) + max_mem = int(args[22]) + + iterations.append(iteration) + means.append(mean) + variations.append(var) + running_losses.append(running_loss) + + iterations = np.asarray(iterations) + means = np.asarray(means) + variations = np.asarray(variations) + running_losses = np.asarray(running_losses) + print(iterations) + print(means) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Parse log file') + parser.add_argument('log_path', metavar='P', help='path to the log file') + args = parser.parse_args() + + parse(args.log_path) + diff --git a/tools/test_net.py b/tools/test_net.py index 0d23d8578..dda16fe35 100644 --- a/tools/test_net.py +++ b/tools/test_net.py @@ -79,7 +79,8 @@ def main(): model, data_loader_val, iou_types=iou_types, - box_only=cfg.MODEL.RPN_ONLY, + #box_only=cfg.MODEL.RPN_ONLY, + box_only=False if cfg.RETINANET.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, diff --git a/tools/train_net.py b/tools/train_net.py index 1952d4739..1e6ce9a7e 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -96,7 +96,8 @@ def test(cfg, model, distributed): model, data_loader_val, iou_types=iou_types, - box_only=cfg.MODEL.RPN_ONLY, + #box_only=cfg.MODEL.RPN_ONLY, + box_only=False if cfg.RETINANET.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
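Finally, a small sketch of the evaluation-flag change in `tools/test_net.py` and `tools/train_net.py` above: RetinaNet runs with `MODEL.RPN_ONLY: True`, but its outputs are full detections rather than class-agnostic proposals, so proposal-only (box-only) evaluation has to be switched off whenever `RETINANET.RETINANET_ON` is set:

```python
def choose_box_only(retinanet_on: bool, rpn_only: bool) -> bool:
    # mirrors: box_only=False if cfg.RETINANET.RETINANET_ON else cfg.MODEL.RPN_ONLY
    return False if retinanet_on else rpn_only

assert choose_box_only(True, True) is False    # RetinaNet: run full COCO bbox evaluation
assert choose_box_only(False, True) is True    # plain RPN-only model: proposal evaluation
```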