diff --git a/detrex/data/__init__.py b/detrex/data/__init__.py index ceda4f87..4add9476 100644 --- a/detrex/data/__init__.py +++ b/detrex/data/__init__.py @@ -14,3 +14,12 @@ # limitations under the License. from .detr_dataset_mapper import DetrDatasetMapper +from .dataset_mappers import ( + COCOInstanceNewBaselineDatasetMapper, + COCOPanopticNewBaselineDatasetMapper, + MaskFormerSemanticDatasetMapper, + MaskFormerInstanceDatasetMapper, + MaskFormerPanopticDatasetMapper, +) +from . import datasets +from .transforms import ColorAugSSDTransform \ No newline at end of file diff --git a/detrex/data/dataset_mappers/__init__.py b/detrex/data/dataset_mappers/__init__.py new file mode 100644 index 00000000..8594a686 --- /dev/null +++ b/detrex/data/dataset_mappers/__init__.py @@ -0,0 +1,22 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .coco_instance_new_baseline_dataset_mapper import build_transform_gen as coco_instance_transform_gen +from .coco_panoptic_new_baseline_dataset_mapper import build_transform_gen as coco_panoptic_transform_gen +from .coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper +from .coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper +from .mask_former_instance_dataset_mapper import MaskFormerInstanceDatasetMapper +from .mask_former_panoptic_dataset_mapper import MaskFormerPanopticDatasetMapper +from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper diff --git a/detrex/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py b/detrex/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py new file mode 100644 index 00000000..59383abd --- /dev/null +++ b/detrex/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py @@ -0,0 +1,179 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +# ------------------------------------------------------------------------------------------------ +# COCO Instance Segmentation with LSJ Augmentation +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py +# ------------------------------------------------------------------------------------------------ + +import copy +import logging +import numpy as np +import torch + +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T + +from pycocotools import mask as coco_mask + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +def build_transform_gen( + image_size, + min_scale, + max_scale, + random_flip: str = "horizontal", + is_train: bool = True, +): + """ + Create a list of default :class:`Augmentation`. + Now it includes resizing and flipping. + + Returns: + list[Augmentation] + """ + assert is_train, "Only support training augmentation." + assert random_flip in ["none", "horizontal", "vertical"], f"Only support none/horizontal/vertical flip, but got {random_flip}" + + augmentation = [] + + if random_flip != "none": + augmentation.append( + T.RandomFlip( + horizontal=random_flip == "horizontal", + vertical=random_flip == "vertical", + ) + ) + + augmentation.extend([ + T.ResizeScale( + min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size, + ), + T.FixedSizeCrop(crop_size=(image_size, image_size)) + ]) + + return augmentation + + +class COCOInstanceNewBaselineDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by MaskFormer. + + This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. + + The callable currently does the following: + + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + def __init__( + self, + is_train=True, + *, + augmentation, + image_format, + ): + self.augmentation = augmentation + logging.getLogger(__name__).info( + "[COCO_Instance_LSJ_Augment_Dataset_Mapper] Full TransformGens used in training: {}".format(str(self.augmentation)) + ) + + self.img_format = image_format + self.is_train = is_train + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
+ + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + padding_mask = np.ones(image.shape[:2]) + image, transforms = T.apply_transform_gens(self.augmentation, image) + + padding_mask = transforms.apply_segmentation(padding_mask) + padding_mask = ~ padding_mask.astype(bool) + + image_shape = image.shape[:2] + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) + + if not self.is_train: + # USER: Modify this if you want to keep them for some reason. + dataset_dict.pop("annotations", None) + return dataset_dict + + if "annotations" in dataset_dict: + for anno in dataset_dict["annotations"]: + anno.pop("keypoints", None) + + annos = [ + utils.transform_instance_annotations(obj, transforms, image_shape) + for obj in dataset_dict.pop("annotations") + if obj.get("iscrowd", 0) == 0 + ] + # NOTE: does not support BitMask due to augmentation + # Current BitMask cannot handle empty objects + instances = utils.annotations_to_instances(annos, image_shape) + # After transforms such as cropping are applied, the bounding box may no longer + # tightly bound the object. As an example, imagine a triangle object + # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight + # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to + # the intersection of original bounding box and the cropping box. + instances.gt_boxes = instances.gt_masks.get_bounding_boxes() + # Need to filter empty instances first (due to augmentation) + instances = utils.filter_empty_instances(instances) + # Generate masks from polygon + h, w = instances.image_size + # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float) + if hasattr(instances, 'gt_masks'): + gt_masks = instances.gt_masks + gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) + instances.gt_masks = gt_masks + # import ipdb; ipdb.set_trace() + dataset_dict["instances"] = instances + + return dataset_dict + diff --git a/detrex/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py b/detrex/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py new file mode 100644 index 00000000..2a65972f --- /dev/null +++ b/detrex/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py @@ -0,0 +1,173 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved +# ------------------------------------------------------------------------------------------------ +# COCO Panoptic Segmentation with LSJ Augmentation +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py +# ------------------------------------------------------------------------------------------------ + +import copy +import logging + +import numpy as np +import torch + +from detectron2.config import configurable +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T +from detectron2.data.transforms import TransformGen +from detectron2.structures import BitMasks, Boxes, Instances + +__all__ = ["COCOPanopticNewBaselineDatasetMapper"] + + +def build_transform_gen( + image_size, + min_scale, + max_scale, + random_flip: str = "horizontal", + is_train: bool = True, +): + """ + Create a list of default :class:`Augmentation` from config. + Now it includes resizing and flipping. + Returns: + list[Augmentation] + """ + assert is_train, "Only support training augmentation" + + augmentation = [] + + if random_flip != "none": + augmentation.append( + T.RandomFlip( + horizontal=random_flip == "horizontal", + vertical=random_flip == "vertical", + ) + ) + + augmentation.extend([ + T.ResizeScale( + min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size + ), + T.FixedSizeCrop(crop_size=(image_size, image_size)), + ]) + + return augmentation + + +# This is specifically designed for the COCO dataset. +class COCOPanopticNewBaselineDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by MaskFormer. + This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. + The callable currently does the following: + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + def __init__( + self, + is_train=True, + *, + augmentation, + image_format, + ): + """ + NOTE: this interface is experimental. + Args: + is_train: for training or inference + augmentations: a list of augmentations or deterministic transforms to apply + crop_gen: crop augmentation + tfm_gens: data augmentation + image_format: an image format supported by :func:`detection_utils.read_image`. + """ + self.augmentation = augmentation + logging.getLogger(__name__).info( + "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( + str(self.augmentation) + ) + ) + + self.img_format = image_format + self.is_train = is_train + + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + image, transforms = T.apply_transform_gens(self.augmentation, image) + image_shape = image.shape[:2] # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 
+ # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + + if not self.is_train: + # USER: Modify this if you want to keep them for some reason. + dataset_dict.pop("annotations", None) + return dataset_dict + + if "pan_seg_file_name" in dataset_dict: + pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") + segments_info = dataset_dict["segments_info"] + + # apply the same transformation to panoptic segmentation + pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) + + from panopticapi.utils import rgb2id + + pan_seg_gt = rgb2id(pan_seg_gt) + + instances = Instances(image_shape) + classes = [] + masks = [] + for segment_info in segments_info: + class_id = segment_info["category_id"] + if not segment_info["iscrowd"]: + classes.append(class_id) + masks.append(pan_seg_gt == segment_info["id"]) + + classes = np.array(classes) + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) + instances.gt_boxes = Boxes(torch.zeros((0, 4))) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + instances.gt_boxes = masks.get_bounding_boxes() + + dataset_dict["instances"] = instances + + return dataset_dict + diff --git a/detrex/data/dataset_mappers/mask_former_instance_dataset_mapper.py b/detrex/data/dataset_mappers/mask_former_instance_dataset_mapper.py new file mode 100644 index 00000000..51c12ca0 --- /dev/null +++ b/detrex/data/dataset_mappers/mask_former_instance_dataset_mapper.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +# ------------------------------------------------------------------------------------------------ +# MaskFormer Instance Dataset Mapper +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py +# ------------------------------------------------------------------------------------------------ + +import copy +import logging + +import numpy as np +import pycocotools.mask as mask_util +import torch +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T +from detectron2.structures import BitMasks, Instances, polygons_to_bitmask + +from detrex.data.transforms import ColorAugSSDTransform + +__all__ = ["MaskFormerInstanceDatasetMapper"] + + +def build_transform_gen( + min_size_train, + max_size_train, + min_size_train_sampling, + enabled_crop: bool, + crop_type: str, + crop_size: str, + color_aug_ssd: bool, + img_format: str, + is_train: bool = True +): + assert is_train, "Only support training augmentation." + + augmentations = [] + augmentations.append( + [ + T.ResizeShortestEdge( + min_size_train, + max_size_train, + min_size_train_sampling + ) + ] + ) + if enabled_crop: + augmentations.append( + T.RandomCrop( + crop_type=crop_type, + crop_size=crop_size, + ) + ) + if color_aug_ssd: + augmentations.append(ColorAugSSDTransform(img_format=img_format)) + augmentations.append(T.RandomFlip()) + return augmentations + + +class MaskFormerInstanceDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by MaskFormer for instance segmentation. + The callable currently does the following: + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + def __init__( + self, + is_train=True, + *, + augmentations, + image_format, + size_divisibility, + ): + """ + NOTE: this interface is experimental. + Args: + is_train: for training or inference + augmentations: a list of augmentations or deterministic transforms to apply + image_format: an image format supported by :func:`detection_utils.read_image`. + size_divisibility: pad image size to be divisible by this value + """ + self.is_train = is_train + self.tfm_gens = augmentations + self.img_format = image_format + self.size_divisibility = size_divisibility + + logger = logging.getLogger(__name__) + mode = "training" if is_train else "inference" + logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + Returns: + dict: a format that builtin models in detectron2 accept + """ + assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
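# ------------------------------------------------------------------------------------------------
# NOTE: illustrative usage sketch, not part of this patch. It only shows how the
# mapper above and its build_transform_gen helper are meant to be wired together;
# every numeric value (shortest-edge sizes, crop size, size_divisibility) is an
# assumption chosen for demonstration, not a value defined in this diff.
from detrex.data.dataset_mappers.mask_former_instance_dataset_mapper import (
    MaskFormerInstanceDatasetMapper,
    build_transform_gen,
)

# Intended training pipeline: ResizeShortestEdge (+ optional crop, optional
# SSD-style color jitter) followed by RandomFlip.
instance_augmentations = build_transform_gen(
    min_size_train=(512, 640, 768),    # assumed candidate shortest edges
    max_size_train=1333,               # assumed cap on the longest edge
    min_size_train_sampling="choice",  # forwarded as ResizeShortestEdge's sample_style
    enabled_crop=True,
    crop_type="absolute",
    crop_size=(512, 512),
    color_aug_ssd=True,
    img_format="RGB",
)

instance_mapper = MaskFormerInstanceDatasetMapper(
    is_train=True,
    augmentations=instance_augmentations,
    image_format="RGB",
    size_divisibility=512,             # assumed padding divisibility
)
# Applied to a Detectron2-format dict, the mapper is intended to produce an
# "image" tensor plus an "instances" field with per-object binary masks.
# ------------------------------------------------------------------------------------------------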
+ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + aug_input = T.AugInput(image) + aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) + image = aug_input.image + + # transform instnace masks + assert "annotations" in dataset_dict + for anno in dataset_dict["annotations"]: + anno.pop("keypoints", None) + + annos = [ + utils.transform_instance_annotations(obj, transforms, image.shape[:2]) + for obj in dataset_dict.pop("annotations") + if obj.get("iscrowd", 0) == 0 + ] + + if len(annos): + assert "segmentation" in annos[0] + segms = [obj["segmentation"] for obj in annos] + masks = [] + for segm in segms: + if isinstance(segm, list): + # polygon + masks.append(polygons_to_bitmask(segm, *image.shape[:2])) + elif isinstance(segm, dict): + # COCO RLE + masks.append(mask_util.decode(segm)) + elif isinstance(segm, np.ndarray): + assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( + segm.ndim + ) + # mask array + masks.append(segm) + else: + raise ValueError( + "Cannot convert segmentation of type '{}' to BitMasks!" + "Supported types are: polygons as list[list[float] or ndarray]," + " COCO-style RLE as a dict, or a binary segmentation mask " + " in a 2D numpy array of shape HxW.".format(type(segm)) + ) + + # Pad image and segmentation label here! + image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] + + classes = [int(obj["category_id"]) for obj in annos] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.size_divisibility > 0: + image_size = (image.shape[-2], image.shape[-1]) + padding_size = [ + 0, + self.size_divisibility - image_size[1], + 0, + self.size_divisibility - image_size[0], + ] + # pad image + image = F.pad(image, padding_size, value=128).contiguous() + # pad mask + masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] + + image_shape = (image.shape[-2], image.shape[-1]) # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = image + + # Prepare per-category binary masks + instances = Instances(image_shape) + instances.gt_classes = classes + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) + else: + masks = BitMasks(torch.stack(masks)) + instances.gt_masks = masks.tensor + + dataset_dict["instances"] = instances + + return dataset_dict \ No newline at end of file diff --git a/detrex/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py b/detrex/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py new file mode 100644 index 00000000..af8d1788 --- /dev/null +++ b/detrex/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# ------------------------------------------------------------------------------------------------ +# MaskFormer Instance Dataset Mapper +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py +# ------------------------------------------------------------------------------------------------ + + +import copy + +import numpy as np +import torch +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T +from detectron2.structures import BitMasks, Instances + +from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper + +__all__ = ["MaskFormerPanopticDatasetMapper"] + + +class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by MaskFormer for panoptic segmentation. + The callable currently does the following: + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + def __init__( + self, + is_train=True, + *, + dataset_names, + augmentations, + image_format, + ignore_label, + size_divisibility, + ): + """ + NOTE: this interface is experimental. + Args: + is_train: for training or inference + augmentations: a list of augmentations or deterministic transforms to apply + image_format: an image format supported by :func:`detection_utils.read_image`. + ignore_label: the label that is ignored to evaluation + size_divisibility: pad image size to be divisible by this value + """ + super().__init__( + is_train, + augmentations=augmentations, + image_format=image_format, + dataset_names=dataset_names, + ignore_label=ignore_label, + size_divisibility=size_divisibility, + ) + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + Returns: + dict: a format that builtin models in detectron2 accept + """ + assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
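# ------------------------------------------------------------------------------------------------
# NOTE: standalone illustration, not part of this patch. It mirrors, on synthetic
# data, what the body below does with a real panoptic PNG: decode the RGB id map
# with panopticapi's rgb2id, then turn every non-crowd entry of "segments_info"
# into a binary mask. All shapes, ids and category ids here are made up.
import numpy as np
import torch
from detectron2.structures import BitMasks, Instances
from panopticapi.utils import rgb2id

# A fake 4x4 panoptic annotation: segment ids 0 (void), 1 and 2, encoded as RGB.
pan_ids = np.array(
    [[0, 1, 1, 2],
     [0, 1, 1, 2],
     [0, 0, 2, 2],
     [0, 0, 2, 2]], dtype=np.uint32
)
pan_rgb = np.stack(
    [pan_ids % 256, (pan_ids // 256) % 256, pan_ids // 256 ** 2], axis=-1
).astype(np.uint8)
assert (rgb2id(pan_rgb) == pan_ids).all()  # rgb2id inverts the RGB id encoding

segments_info = [
    {"id": 1, "category_id": 17, "iscrowd": 0},  # hypothetical category ids
    {"id": 2, "category_id": 0, "iscrowd": 0},
]

classes, masks = [], []
for seg in segments_info:
    if not seg["iscrowd"]:
        classes.append(seg["category_id"])
        masks.append(pan_ids == seg["id"])

instances = Instances(pan_ids.shape)
instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
instances.gt_masks = BitMasks(
    torch.stack([torch.from_numpy(m.copy()) for m in masks])
).tensor
# ------------------------------------------------------------------------------------------------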
+ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + # semantic segmentation + if "sem_seg_file_name" in dataset_dict: + # PyTorch transformation not implemented for uint16, so converting it to double first + sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") + else: + sem_seg_gt = None + + # panoptic segmentation + if "pan_seg_file_name" in dataset_dict: + pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") + segments_info = dataset_dict["segments_info"] + else: + pan_seg_gt = None + segments_info = None + + if pan_seg_gt is None: + raise ValueError( + "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( + dataset_dict["file_name"] + ) + ) + + aug_input = T.AugInput(image, sem_seg=sem_seg_gt) + aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) + image = aug_input.image + if sem_seg_gt is not None: + sem_seg_gt = aug_input.sem_seg + + # apply the same transformation to panoptic segmentation + pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) + + from panopticapi.utils import rgb2id + + pan_seg_gt = rgb2id(pan_seg_gt) + + # Pad image and segmentation label here! + image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + if sem_seg_gt is not None: + sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) + pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) + + if self.size_divisibility > 0: + image_size = (image.shape[-2], image.shape[-1]) + padding_size = [ + 0, + self.size_divisibility - image_size[1], + 0, + self.size_divisibility - image_size[0], + ] + image = F.pad(image, padding_size, value=128).contiguous() + if sem_seg_gt is not None: + sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() + pan_seg_gt = F.pad( + pan_seg_gt, padding_size, value=0 + ).contiguous() # 0 is the VOID panoptic label + + image_shape = (image.shape[-2], image.shape[-1]) # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. 
+ dataset_dict["image"] = image + if sem_seg_gt is not None: + dataset_dict["sem_seg"] = sem_seg_gt.long() + + if "annotations" in dataset_dict: + raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") + + # Prepare per-category binary masks + pan_seg_gt = pan_seg_gt.numpy() + instances = Instances(image_shape) + classes = [] + masks = [] + for segment_info in segments_info: + class_id = segment_info["category_id"] + if not segment_info["iscrowd"]: + classes.append(class_id) + masks.append(pan_seg_gt == segment_info["id"]) + + classes = np.array(classes) + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + + dataset_dict["instances"] = instances + + return dataset_dict \ No newline at end of file diff --git a/detrex/data/dataset_mappers/mask_former_semantic_dataset_mapper.py b/detrex/data/dataset_mappers/mask_former_semantic_dataset_mapper.py new file mode 100644 index 00000000..ed76b6e3 --- /dev/null +++ b/detrex/data/dataset_mappers/mask_former_semantic_dataset_mapper.py @@ -0,0 +1,208 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# ------------------------------------------------------------------------------------------------ +# MaskFormer Instance Dataset Mapper +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py +# ------------------------------------------------------------------------------------------------ + + +import copy +import logging + +import numpy as np +import torch +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.data import MetadataCatalog +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T +from detectron2.structures import BitMasks, Instances + +from detrex.data.transforms import ColorAugSSDTransform + +__all__ = ["MaskFormerSemanticDatasetMapper"] + + +def build_transform_gen( + min_size_train, + max_size_train, + min_size_train_sampling, + enabled_crop: bool, + crop_params: dict, + color_aug_ssd: bool, + img_format: str, + is_train: bool = True +): + assert is_train, "Only support training augmentation." 
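# ------------------------------------------------------------------------------------------------
# NOTE: illustrative sketch, not part of this patch. It spells out the flat list
# of detectron2 TransformGens this helper is intended to assemble for semantic
# training (resize -> category-area-constrained crop -> SSD color aug -> flip);
# the numeric values and single_category_max_area are assumptions for illustration.
from detectron2.data import transforms as T
from detrex.data.transforms import ColorAugSSDTransform

semantic_augmentations = [
    T.ResizeShortestEdge(
        short_edge_length=(512, 640, 768),  # assumed candidate shortest edges
        max_size=2048,                      # assumed cap on the longest edge
        sample_style="choice",
    ),
    T.RandomCrop_CategoryAreaConstraint(
        crop_type="absolute",
        crop_size=(512, 512),
        single_category_max_area=1.0,       # default: no per-category area limit
    ),
    ColorAugSSDTransform(img_format="RGB"),
    T.RandomFlip(),
]
# This list is what MaskFormerSemanticDatasetMapper consumes as `augmentations`.
# ------------------------------------------------------------------------------------------------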
+ + augmentations = [] + augmentations.append( + [ + T.ResizeShortestEdge( + min_size_train, + max_size_train, + min_size_train_sampling + ) + ] + ) + if enabled_crop: + augmentations.append( + T.RandomCrop_CategoryAreaConstraint( + **crop_params, + ) + ) + if color_aug_ssd: + augmentations.append(ColorAugSSDTransform(img_format=img_format)) + augmentations.append(T.RandomFlip()) + + return augmentations + + +class MaskFormerSemanticDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by MaskFormer for semantic segmentation. + The callable currently does the following: + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + def __init__( + self, + is_train=True, + *, + dataset_names, + augmentations, + image_format, + ignore_label, + size_divisibility, + ): + """ + NOTE: this interface is experimental. + + Args: + is_train: for training or inference + augmentations: a list of augmentations or deterministic transforms to apply + image_format: an image format supported by :func:`detection_utils.read_image`. + ignore_label: the label that is ignored to evaluation + size_divisibility: pad image size to be divisible by this value + """ + self.is_train = is_train + self.tfm_gens = augmentations + self.img_format = image_format + self.ignore_label = ignore_label + self.size_divisibility = size_divisibility + + dataset_names = dataset_names, + meta = MetadataCatalog.get(dataset_names[0]), + self.ignore_label = meta.ignore_label + + logger = logging.getLogger(__name__) + mode = "training" if is_train else "inference" + logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") + + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + Returns: + dict: a format that builtin models in detectron2 accept + """ + assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" + + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + if "sem_seg_file_name" in dataset_dict: + # PyTorch transformation not implemented for uint16, so converting it to double first + sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") + else: + sem_seg_gt = None + + if sem_seg_gt is None: + raise ValueError( + "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( + dataset_dict["file_name"] + ) + ) + + aug_input = T.AugInput(image, sem_seg=sem_seg_gt) + aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) + image = aug_input.image + sem_seg_gt = aug_input.sem_seg + + # Pad image and segmentation label here! 
+ image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + if sem_seg_gt is not None: + sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) + + if self.size_divisibility > 0: + image_size = (image.shape[-2], image.shape[-1]) + padding_size = [ + 0, + self.size_divisibility - image_size[1], + 0, + self.size_divisibility - image_size[0], + ] + image = F.pad(image, padding_size, value=128).contiguous() + if sem_seg_gt is not None: + sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() + + image_shape = (image.shape[-2], image.shape[-1]) # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = image + + if sem_seg_gt is not None: + dataset_dict["sem_seg"] = sem_seg_gt.long() + + if "annotations" in dataset_dict: + raise ValueError("Semantic segmentation dataset should not have 'annotations'.") + + # Prepare per-category binary masks + if sem_seg_gt is not None: + sem_seg_gt = sem_seg_gt.numpy() + instances = Instances(image_shape) + classes = np.unique(sem_seg_gt) + # remove ignored region + classes = classes[classes != self.ignore_label] + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + + masks = [] + for class_id in classes: + masks.append(sem_seg_gt == class_id) + + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + + dataset_dict["instances"] = instances + + return dataset_dict \ No newline at end of file diff --git a/detrex/data/datasets/__init__.py b/detrex/data/datasets/__init__.py new file mode 100644 index 00000000..d5a53408 --- /dev/null +++ b/detrex/data/datasets/__init__.py @@ -0,0 +1,27 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. +# ------------------------------------------------------------------------------------------------ + +from . import ( + register_ade20k_full, + register_ade20k_panoptic, + register_coco_stuff_10k, + register_mapillary_vistas, + register_coco_panoptic_annos_semseg, + register_ade20k_instance, + register_mapillary_vistas_panoptic, +) diff --git a/detrex/data/datasets/register_ade20k_full.py b/detrex/data/datasets/register_ade20k_full.py new file mode 100644 index 00000000..5ae19e05 --- /dev/null +++ b/detrex/data/datasets/register_ade20k_full.py @@ -0,0 +1,984 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. +# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_full.py +# ------------------------------------------------------------------------------------------------ + +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +ADE20K_SEM_SEG_FULL_CATEGORIES = [ + {"name": "wall", "id": 2978, "trainId": 0}, + {"name": "building, edifice", "id": 312, "trainId": 1}, + {"name": "sky", "id": 2420, "trainId": 2}, + {"name": "tree", "id": 2855, "trainId": 3}, + {"name": "road, route", "id": 2131, "trainId": 4}, + {"name": "floor, flooring", "id": 976, "trainId": 5}, + {"name": "ceiling", "id": 447, "trainId": 6}, + {"name": "bed", "id": 165, "trainId": 7}, + {"name": "sidewalk, pavement", "id": 2377, "trainId": 8}, + {"name": "earth, ground", "id": 838, "trainId": 9}, + {"name": "cabinet", "id": 350, "trainId": 10}, + {"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11}, + {"name": "grass", "id": 1125, "trainId": 12}, + {"name": "windowpane, window", "id": 3055, "trainId": 13}, + {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14}, + {"name": "mountain, mount", "id": 1610, "trainId": 15}, + {"name": "plant, flora, plant life", "id": 1910, "trainId": 16}, + {"name": "table", "id": 2684, "trainId": 17}, + {"name": "chair", "id": 471, "trainId": 18}, + {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19}, + {"name": "door", "id": 774, "trainId": 20}, + {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21}, + {"name": "sea", "id": 2264, "trainId": 22}, + {"name": "painting, picture", "id": 1735, "trainId": 23}, + {"name": "water", "id": 2994, "trainId": 24}, + {"name": "mirror", "id": 1564, "trainId": 25}, + {"name": "house", "id": 1276, "trainId": 26}, + {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27}, + {"name": "shelf", "id": 2329, "trainId": 28}, + {"name": "armchair", "id": 57, "trainId": 29}, + {"name": "fence, fencing", "id": 907, "trainId": 30}, + {"name": "field", "id": 913, "trainId": 31}, + {"name": "lamp", "id": 1395, "trainId": 32}, + {"name": "rock, stone", "id": 2138, "trainId": 33}, + {"name": "seat", "id": 2272, "trainId": 34}, + {"name": "river", "id": 2128, "trainId": 35}, + {"name": "desk", "id": 724, "trainId": 36}, + {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37}, + {"name": "railing, rail", "id": 2053, "trainId": 38}, + {"name": "signboard, sign", "id": 2380, "trainId": 39}, + {"name": "cushion", "id": 689, "trainId": 40}, + {"name": "path", "id": 1788, "trainId": 41}, + {"name": "work surface", "id": 3087, "trainId": 42}, + 
{"name": "stairs, steps", "id": 2530, "trainId": 43}, + {"name": "column, pillar", "id": 581, "trainId": 44}, + {"name": "sink", "id": 2388, "trainId": 45}, + {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46}, + {"name": "snow", "id": 2454, "trainId": 47}, + {"name": "refrigerator, icebox", "id": 2096, "trainId": 48}, + {"name": "base, pedestal, stand", "id": 137, "trainId": 49}, + {"name": "bridge, span", "id": 294, "trainId": 50}, + {"name": "blind, screen", "id": 212, "trainId": 51}, + {"name": "runway", "id": 2185, "trainId": 52}, + {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53}, + {"name": "sand", "id": 2212, "trainId": 54}, + {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55}, + {"name": "pillow", "id": 1869, "trainId": 56}, + {"name": "screen door, screen", "id": 2251, "trainId": 57}, + {"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58}, + {"name": "skyscraper", "id": 2423, "trainId": 59}, + {"name": "grandstand, covered stand", "id": 1121, "trainId": 60}, + {"name": "box", "id": 266, "trainId": 61}, + {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62}, + {"name": "palm, palm tree", "id": 1744, "trainId": 63}, + {"name": "double door", "id": 783, "trainId": 64}, + {"name": "coffee table, cocktail table", "id": 571, "trainId": 65}, + {"name": "counter", "id": 627, "trainId": 66}, + {"name": "countertop", "id": 629, "trainId": 67}, + {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68}, + {"name": "kitchen island", "id": 1374, "trainId": 69}, + {"name": "boat", "id": 223, "trainId": 70}, + {"name": "waterfall, falls", "id": 3016, "trainId": 71}, + { + "name": "stove, kitchen stove, range, kitchen range, cooking stove", + "id": 2598, + "trainId": 72, + }, + {"name": "flower", "id": 978, "trainId": 73}, + {"name": "bookcase", "id": 239, "trainId": 74}, + {"name": "controls", "id": 608, "trainId": 75}, + {"name": "book", "id": 236, "trainId": 76}, + {"name": "stairway, staircase", "id": 2531, "trainId": 77}, + {"name": "streetlight, street lamp", "id": 2616, "trainId": 78}, + { + "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system", + "id": 591, + "trainId": 79, + }, + { + "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle", + "id": 327, + "trainId": 80, + }, + {"name": "swivel chair", "id": 2679, "trainId": 81}, + {"name": "light, light source", "id": 1451, "trainId": 82}, + {"name": "bench", "id": 181, "trainId": 83}, + {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84}, + {"name": "towel", "id": 2821, "trainId": 85}, + {"name": "fountain", "id": 1023, "trainId": 86}, + {"name": "embankment", "id": 855, "trainId": 87}, + { + "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box", + "id": 2733, + "trainId": 88, + }, + {"name": "van", "id": 2928, "trainId": 89}, + {"name": "hill", "id": 1240, "trainId": 90}, + {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91}, + {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92}, + {"name": "truck, motortruck", "id": 2880, "trainId": 93}, + {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94}, + {"name": "pole", "id": 1936, "trainId": 95}, + {"name": "tower", "id": 2828, "trainId": 96}, + {"name": "court", "id": 631, "trainId": 97}, + 
{"name": "ball", "id": 103, "trainId": 98}, + { + "name": "aircraft carrier, carrier, flattop, attack aircraft carrier", + "id": 3144, + "trainId": 99, + }, + {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100}, + {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101}, + {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102}, + {"name": "minibike, motorbike", "id": 1563, "trainId": 103}, + {"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104}, + {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105}, + {"name": "step, stair", "id": 2569, "trainId": 106}, + {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107}, + {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108}, + {"name": "doorframe, doorcase", "id": 778, "trainId": 109}, + {"name": "sconce", "id": 2243, "trainId": 110}, + {"name": "pond", "id": 1941, "trainId": 111}, + {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112}, + {"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113}, + {"name": "bag", "id": 95, "trainId": 114}, + {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115}, + {"name": "gazebo", "id": 1087, "trainId": 116}, + {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117}, + {"name": "land, ground, soil", "id": 1401, "trainId": 118}, + {"name": "board, plank", "id": 220, "trainId": 119}, + {"name": "arcade machine", "id": 47, "trainId": 120}, + {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121}, + {"name": "bar", "id": 123, "trainId": 122}, + {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123}, + {"name": "playground", "id": 1927, "trainId": 124}, + {"name": "ship", "id": 2337, "trainId": 125}, + {"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126}, + { + "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", + "id": 64, + "trainId": 127, + }, + {"name": "bottle", "id": 249, "trainId": 128}, + {"name": "cradle", "id": 642, "trainId": 129}, + {"name": "pot, flowerpot", "id": 1981, "trainId": 130}, + { + "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", + "id": 609, + "trainId": 131, + }, + {"name": "train, railroad train", "id": 2840, "trainId": 132}, + {"name": "stool", "id": 2586, "trainId": 133}, + {"name": "lake", "id": 1393, "trainId": 134}, + {"name": "tank, storage tank", "id": 2704, "trainId": 135}, + {"name": "ice, water ice", "id": 1304, "trainId": 136}, + {"name": "basket, handbasket", "id": 146, "trainId": 137}, + {"name": "manhole", "id": 1494, "trainId": 138}, + {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139}, + {"name": "canopy", "id": 389, "trainId": 140}, + {"name": "microwave, microwave oven", "id": 1551, "trainId": 141}, + {"name": "barrel, cask", "id": 131, "trainId": 142}, + {"name": "dirt track", "id": 738, "trainId": 143}, + {"name": "beam", "id": 161, "trainId": 144}, + {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145}, + {"name": "plate", "id": 1919, "trainId": 146}, + {"name": "screen, crt screen", "id": 3109, "trainId": 147}, + {"name": "ruins", "id": 2179, "trainId": 148}, + {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149}, + {"name": "blanket, cover", "id": 206, "trainId": 150}, + {"name": "plaything, toy", "id": 1930, 
"trainId": 151}, + {"name": "food, solid food", "id": 1002, "trainId": 152}, + {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153}, + {"name": "oven", "id": 1708, "trainId": 154}, + {"name": "stage", "id": 2526, "trainId": 155}, + {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156}, + {"name": "umbrella", "id": 2901, "trainId": 157}, + {"name": "sculpture", "id": 2262, "trainId": 158}, + {"name": "aqueduct", "id": 44, "trainId": 159}, + {"name": "container", "id": 597, "trainId": 160}, + {"name": "scaffolding, staging", "id": 2235, "trainId": 161}, + {"name": "hood, exhaust hood", "id": 1260, "trainId": 162}, + {"name": "curb, curbing, kerb", "id": 682, "trainId": 163}, + {"name": "roller coaster", "id": 2151, "trainId": 164}, + {"name": "horse, equus caballus", "id": 3107, "trainId": 165}, + {"name": "catwalk", "id": 432, "trainId": 166}, + {"name": "glass, drinking glass", "id": 1098, "trainId": 167}, + {"name": "vase", "id": 2932, "trainId": 168}, + {"name": "central reservation", "id": 461, "trainId": 169}, + {"name": "carousel", "id": 410, "trainId": 170}, + {"name": "radiator", "id": 2046, "trainId": 171}, + {"name": "closet", "id": 533, "trainId": 172}, + {"name": "machine", "id": 1481, "trainId": 173}, + {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174}, + {"name": "fan", "id": 894, "trainId": 175}, + {"name": "inflatable bounce game", "id": 1322, "trainId": 176}, + {"name": "pitch", "id": 1891, "trainId": 177}, + {"name": "paper", "id": 1756, "trainId": 178}, + {"name": "arcade, colonnade", "id": 49, "trainId": 179}, + {"name": "hot tub", "id": 1272, "trainId": 180}, + {"name": "helicopter", "id": 1229, "trainId": 181}, + {"name": "tray", "id": 2850, "trainId": 182}, + {"name": "partition, divider", "id": 1784, "trainId": 183}, + {"name": "vineyard", "id": 2962, "trainId": 184}, + {"name": "bowl", "id": 259, "trainId": 185}, + {"name": "bullring", "id": 319, "trainId": 186}, + {"name": "flag", "id": 954, "trainId": 187}, + {"name": "pot", "id": 1974, "trainId": 188}, + {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189}, + {"name": "shower", "id": 2356, "trainId": 190}, + {"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191}, + {"name": "bulletin board, notice board", "id": 318, "trainId": 192}, + {"name": "confessional booth", "id": 592, "trainId": 193}, + {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194}, + {"name": "forest", "id": 1017, "trainId": 195}, + {"name": "elevator door", "id": 851, "trainId": 196}, + {"name": "laptop, laptop computer", "id": 1407, "trainId": 197}, + {"name": "instrument panel", "id": 1332, "trainId": 198}, + {"name": "bucket, pail", "id": 303, "trainId": 199}, + {"name": "tapestry, tapis", "id": 2714, "trainId": 200}, + {"name": "platform", "id": 1924, "trainId": 201}, + {"name": "jacket", "id": 1346, "trainId": 202}, + {"name": "gate", "id": 1081, "trainId": 203}, + {"name": "monitor, monitoring device", "id": 1583, "trainId": 204}, + { + "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk", + "id": 2727, + "trainId": 205, + }, + {"name": "spotlight, spot", "id": 2509, "trainId": 206}, + {"name": "ring", "id": 2123, "trainId": 207}, + {"name": "control panel", "id": 602, "trainId": 208}, + {"name": "blackboard, chalkboard", "id": 202, "trainId": 209}, + {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210}, + {"name": "chest", "id": 490, "trainId": 
211}, + {"name": "clock", "id": 530, "trainId": 212}, + {"name": "sand dune", "id": 2213, "trainId": 213}, + {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214}, + {"name": "vault", "id": 2934, "trainId": 215}, + {"name": "table football", "id": 2687, "trainId": 216}, + {"name": "cannon", "id": 387, "trainId": 217}, + {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218}, + {"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219}, + {"name": "statue", "id": 2547, "trainId": 220}, + { + "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", + "id": 1474, + "trainId": 221, + }, + {"name": "exhibitor", "id": 877, "trainId": 222}, + {"name": "ladder", "id": 1391, "trainId": 223}, + {"name": "carport", "id": 414, "trainId": 224}, + {"name": "dam", "id": 698, "trainId": 225}, + {"name": "pulpit", "id": 2019, "trainId": 226}, + {"name": "skylight, fanlight", "id": 2422, "trainId": 227}, + {"name": "water tower", "id": 3010, "trainId": 228}, + {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229}, + {"name": "display board", "id": 753, "trainId": 230}, + {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231}, + {"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232}, + {"name": "ice rink", "id": 1301, "trainId": 233}, + {"name": "fruit", "id": 1033, "trainId": 234}, + {"name": "patio", "id": 1789, "trainId": 235}, + {"name": "vending machine", "id": 2939, "trainId": 236}, + {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237}, + {"name": "net", "id": 1652, "trainId": 238}, + { + "name": "backpack, back pack, knapsack, packsack, rucksack, haversack", + "id": 90, + "trainId": 239, + }, + {"name": "jar", "id": 1349, "trainId": 240}, + {"name": "track", "id": 2830, "trainId": 241}, + {"name": "magazine", "id": 1485, "trainId": 242}, + {"name": "shutter", "id": 2370, "trainId": 243}, + {"name": "roof", "id": 2155, "trainId": 244}, + {"name": "banner, streamer", "id": 118, "trainId": 245}, + {"name": "landfill", "id": 1402, "trainId": 246}, + {"name": "post", "id": 1957, "trainId": 247}, + {"name": "altarpiece, reredos", "id": 3130, "trainId": 248}, + {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249}, + {"name": "arch, archway", "id": 52, "trainId": 250}, + {"name": "table game", "id": 2688, "trainId": 251}, + {"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252}, + {"name": "document, written document, papers", "id": 762, "trainId": 253}, + {"name": "dome", "id": 772, "trainId": 254}, + {"name": "pier", "id": 1857, "trainId": 255}, + {"name": "shanties", "id": 2315, "trainId": 256}, + {"name": "forecourt", "id": 1016, "trainId": 257}, + {"name": "crane", "id": 643, "trainId": 258}, + {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259}, + {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260}, + {"name": "drawing", "id": 791, "trainId": 261}, + {"name": "cabin", "id": 349, "trainId": 262}, + { + "name": "ad, advertisement, advertizement, advertising, advertizing, advert", + "id": 6, + "trainId": 263, + }, + {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264}, + {"name": "monument", "id": 1587, "trainId": 265}, + {"name": "henhouse", "id": 1233, "trainId": 266}, + {"name": "cockpit", "id": 559, "trainId": 267}, + {"name": "heater, warmer", "id": 1223, "trainId": 268}, + {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269}, + {"name": "pool", "id": 1943, 
"trainId": 270}, + {"name": "elevator, lift", "id": 853, "trainId": 271}, + {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272}, + {"name": "labyrinth", "id": 1390, "trainId": 273}, + {"name": "text, textual matter", "id": 2748, "trainId": 274}, + {"name": "printer", "id": 2007, "trainId": 275}, + {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276}, + {"name": "mattress", "id": 1513, "trainId": 277}, + {"name": "straw", "id": 2600, "trainId": 278}, + {"name": "stalls", "id": 2538, "trainId": 279}, + {"name": "patio, terrace", "id": 1790, "trainId": 280}, + {"name": "billboard, hoarding", "id": 194, "trainId": 281}, + {"name": "bus stop", "id": 326, "trainId": 282}, + {"name": "trouser, pant", "id": 2877, "trainId": 283}, + {"name": "console table, console", "id": 594, "trainId": 284}, + {"name": "rack", "id": 2036, "trainId": 285}, + {"name": "notebook", "id": 1662, "trainId": 286}, + {"name": "shrine", "id": 2366, "trainId": 287}, + {"name": "pantry", "id": 1754, "trainId": 288}, + {"name": "cart", "id": 418, "trainId": 289}, + {"name": "steam shovel", "id": 2553, "trainId": 290}, + {"name": "porch", "id": 1951, "trainId": 291}, + {"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292}, + {"name": "figurine, statuette", "id": 918, "trainId": 293}, + {"name": "recycling bin", "id": 2086, "trainId": 294}, + {"name": "folding screen", "id": 997, "trainId": 295}, + {"name": "telescope", "id": 2731, "trainId": 296}, + {"name": "deck chair, beach chair", "id": 704, "trainId": 297}, + {"name": "kennel", "id": 1365, "trainId": 298}, + {"name": "coffee maker", "id": 569, "trainId": 299}, + {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300}, + {"name": "fish", "id": 948, "trainId": 301}, + {"name": "easel", "id": 839, "trainId": 302}, + {"name": "artificial golf green", "id": 63, "trainId": 303}, + {"name": "iceberg", "id": 1305, "trainId": 304}, + {"name": "candlestick, candle holder", "id": 378, "trainId": 305}, + {"name": "shower stall, shower bath", "id": 2362, "trainId": 306}, + {"name": "television stand", "id": 2734, "trainId": 307}, + { + "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle", + "id": 2982, + "trainId": 308, + }, + {"name": "skeleton", "id": 2398, "trainId": 309}, + {"name": "grand piano, grand", "id": 1119, "trainId": 310}, + {"name": "candy, confect", "id": 382, "trainId": 311}, + {"name": "grille door", "id": 1141, "trainId": 312}, + {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313}, + {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314}, + {"name": "shoe", "id": 2341, "trainId": 315}, + {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316}, + {"name": "shanty", "id": 2316, "trainId": 317}, + {"name": "structure", "id": 2626, "trainId": 318}, + {"name": "rocking chair, rocker", "id": 3104, "trainId": 319}, + {"name": "bird", "id": 198, "trainId": 320}, + {"name": "place mat", "id": 1896, "trainId": 321}, + {"name": "tomb", "id": 2800, "trainId": 322}, + {"name": "big top", "id": 190, "trainId": 323}, + {"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324}, + {"name": "lockers", "id": 1463, "trainId": 325}, + {"name": "cage", "id": 357, "trainId": 326}, + {"name": "finger", "id": 929, "trainId": 327}, + {"name": "bleachers", "id": 209, "trainId": 328}, + {"name": "ferris wheel", "id": 912, "trainId": 329}, + {"name": "hairdresser chair", "id": 1164, "trainId": 330}, 
+ {"name": "mat", "id": 1509, "trainId": 331}, + {"name": "stands", "id": 2539, "trainId": 332}, + {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333}, + {"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334}, + {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335}, + {"name": "dummy", "id": 818, "trainId": 336}, + {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337}, + {"name": "sand trap", "id": 2217, "trainId": 338}, + {"name": "shop, store", "id": 2347, "trainId": 339}, + {"name": "table cloth", "id": 2686, "trainId": 340}, + {"name": "service station", "id": 2300, "trainId": 341}, + {"name": "coffin", "id": 572, "trainId": 342}, + {"name": "drawer", "id": 789, "trainId": 343}, + {"name": "cages", "id": 358, "trainId": 344}, + {"name": "slot machine, coin machine", "id": 2443, "trainId": 345}, + {"name": "balcony", "id": 101, "trainId": 346}, + {"name": "volleyball court", "id": 2969, "trainId": 347}, + {"name": "table tennis", "id": 2692, "trainId": 348}, + {"name": "control table", "id": 606, "trainId": 349}, + {"name": "shirt", "id": 2339, "trainId": 350}, + {"name": "merchandise, ware, product", "id": 1533, "trainId": 351}, + {"name": "railway", "id": 2060, "trainId": 352}, + {"name": "parterre", "id": 1782, "trainId": 353}, + {"name": "chimney", "id": 495, "trainId": 354}, + {"name": "can, tin, tin can", "id": 371, "trainId": 355}, + {"name": "tanks", "id": 2707, "trainId": 356}, + {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357}, + {"name": "alga, algae", "id": 3156, "trainId": 358}, + {"name": "system", "id": 2683, "trainId": 359}, + {"name": "map", "id": 1499, "trainId": 360}, + {"name": "greenhouse", "id": 1135, "trainId": 361}, + {"name": "mug", "id": 1619, "trainId": 362}, + {"name": "barbecue", "id": 125, "trainId": 363}, + {"name": "trailer", "id": 2838, "trainId": 364}, + {"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365}, + {"name": "organ", "id": 1695, "trainId": 366}, + {"name": "dishrag, dishcloth", "id": 746, "trainId": 367}, + {"name": "island", "id": 1343, "trainId": 368}, + {"name": "keyboard", "id": 1370, "trainId": 369}, + {"name": "trench", "id": 2858, "trainId": 370}, + {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371}, + {"name": "steering wheel, wheel", "id": 2565, "trainId": 372}, + {"name": "pitcher, ewer", "id": 1892, "trainId": 373}, + {"name": "goal", "id": 1103, "trainId": 374}, + {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375}, + {"name": "beds", "id": 170, "trainId": 376}, + {"name": "wood", "id": 3073, "trainId": 377}, + {"name": "file cabinet", "id": 922, "trainId": 378}, + {"name": "newspaper, paper", "id": 1655, "trainId": 379}, + {"name": "motorboat", "id": 1602, "trainId": 380}, + {"name": "rope", "id": 2160, "trainId": 381}, + {"name": "guitar", "id": 1151, "trainId": 382}, + {"name": "rubble", "id": 2176, "trainId": 383}, + {"name": "scarf", "id": 2239, "trainId": 384}, + {"name": "barrels", "id": 132, "trainId": 385}, + {"name": "cap", "id": 394, "trainId": 386}, + {"name": "leaves", "id": 1424, "trainId": 387}, + {"name": "control tower", "id": 607, "trainId": 388}, + {"name": "dashboard", "id": 700, "trainId": 389}, + {"name": "bandstand", "id": 116, "trainId": 390}, + {"name": "lectern", "id": 1425, "trainId": 391}, + {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392}, + {"name": "baseboard, mopboard, 
skirting board", "id": 141, "trainId": 393}, + {"name": "shower room", "id": 2360, "trainId": 394}, + {"name": "smoke", "id": 2449, "trainId": 395}, + {"name": "faucet, spigot", "id": 897, "trainId": 396}, + {"name": "bulldozer", "id": 317, "trainId": 397}, + {"name": "saucepan", "id": 2228, "trainId": 398}, + {"name": "shops", "id": 2351, "trainId": 399}, + {"name": "meter", "id": 1543, "trainId": 400}, + {"name": "crevasse", "id": 656, "trainId": 401}, + {"name": "gear", "id": 1088, "trainId": 402}, + {"name": "candelabrum, candelabra", "id": 373, "trainId": 403}, + {"name": "sofa bed", "id": 2472, "trainId": 404}, + {"name": "tunnel", "id": 2892, "trainId": 405}, + {"name": "pallet", "id": 1740, "trainId": 406}, + {"name": "wire, conducting wire", "id": 3067, "trainId": 407}, + {"name": "kettle, boiler", "id": 1367, "trainId": 408}, + {"name": "bidet", "id": 188, "trainId": 409}, + { + "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher", + "id": 79, + "trainId": 410, + }, + {"name": "music stand", "id": 1633, "trainId": 411}, + {"name": "pipe, tube", "id": 1885, "trainId": 412}, + {"name": "cup", "id": 677, "trainId": 413}, + {"name": "parking meter", "id": 1779, "trainId": 414}, + {"name": "ice hockey rink", "id": 1297, "trainId": 415}, + {"name": "shelter", "id": 2334, "trainId": 416}, + {"name": "weeds", "id": 3027, "trainId": 417}, + {"name": "temple", "id": 2735, "trainId": 418}, + {"name": "patty, cake", "id": 1791, "trainId": 419}, + {"name": "ski slope", "id": 2405, "trainId": 420}, + {"name": "panel", "id": 1748, "trainId": 421}, + {"name": "wallet", "id": 2983, "trainId": 422}, + {"name": "wheel", "id": 3035, "trainId": 423}, + {"name": "towel rack, towel horse", "id": 2824, "trainId": 424}, + {"name": "roundabout", "id": 2168, "trainId": 425}, + {"name": "canister, cannister, tin", "id": 385, "trainId": 426}, + {"name": "rod", "id": 2148, "trainId": 427}, + {"name": "soap dispenser", "id": 2465, "trainId": 428}, + {"name": "bell", "id": 175, "trainId": 429}, + {"name": "canvas", "id": 390, "trainId": 430}, + {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431}, + {"name": "teacup", "id": 2722, "trainId": 432}, + {"name": "trellis", "id": 2857, "trainId": 433}, + {"name": "workbench", "id": 3088, "trainId": 434}, + {"name": "valley, vale", "id": 2926, "trainId": 435}, + {"name": "toaster", "id": 2782, "trainId": 436}, + {"name": "knife", "id": 1378, "trainId": 437}, + {"name": "podium", "id": 1934, "trainId": 438}, + {"name": "ramp", "id": 2072, "trainId": 439}, + {"name": "tumble dryer", "id": 2889, "trainId": 440}, + {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441}, + {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442}, + {"name": "lab bench", "id": 1383, "trainId": 443}, + {"name": "equipment", "id": 867, "trainId": 444}, + {"name": "rocky formation", "id": 2145, "trainId": 445}, + {"name": "plastic", "id": 1915, "trainId": 446}, + {"name": "calendar", "id": 361, "trainId": 447}, + {"name": "caravan", "id": 402, "trainId": 448}, + {"name": "check-in-desk", "id": 482, "trainId": 449}, + {"name": "ticket counter", "id": 2761, "trainId": 450}, + {"name": "brush", "id": 300, "trainId": 451}, + {"name": "mill", "id": 1554, "trainId": 452}, + {"name": "covered bridge", "id": 636, "trainId": 453}, + {"name": "bowling alley", "id": 260, "trainId": 454}, + {"name": "hanger", "id": 1186, "trainId": 455}, + {"name": "excavator", "id": 871, "trainId": 456}, + {"name": 
"trestle", "id": 2859, "trainId": 457}, + {"name": "revolving door", "id": 2103, "trainId": 458}, + {"name": "blast furnace", "id": 208, "trainId": 459}, + {"name": "scale, weighing machine", "id": 2236, "trainId": 460}, + {"name": "projector", "id": 2012, "trainId": 461}, + {"name": "soap", "id": 2462, "trainId": 462}, + {"name": "locker", "id": 1462, "trainId": 463}, + {"name": "tractor", "id": 2832, "trainId": 464}, + {"name": "stretcher", "id": 2617, "trainId": 465}, + {"name": "frame", "id": 1024, "trainId": 466}, + {"name": "grating", "id": 1129, "trainId": 467}, + {"name": "alembic", "id": 18, "trainId": 468}, + {"name": "candle, taper, wax light", "id": 376, "trainId": 469}, + {"name": "barrier", "id": 134, "trainId": 470}, + {"name": "cardboard", "id": 407, "trainId": 471}, + {"name": "cave", "id": 434, "trainId": 472}, + {"name": "puddle", "id": 2017, "trainId": 473}, + {"name": "tarp", "id": 2717, "trainId": 474}, + {"name": "price tag", "id": 2005, "trainId": 475}, + {"name": "watchtower", "id": 2993, "trainId": 476}, + {"name": "meters", "id": 1545, "trainId": 477}, + { + "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb", + "id": 1445, + "trainId": 478, + }, + {"name": "tracks", "id": 2831, "trainId": 479}, + {"name": "hair dryer", "id": 1161, "trainId": 480}, + {"name": "skirt", "id": 2411, "trainId": 481}, + {"name": "viaduct", "id": 2949, "trainId": 482}, + {"name": "paper towel", "id": 1769, "trainId": 483}, + {"name": "coat", "id": 552, "trainId": 484}, + {"name": "sheet", "id": 2327, "trainId": 485}, + {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486}, + {"name": "water wheel", "id": 3013, "trainId": 487}, + {"name": "pottery, clayware", "id": 1986, "trainId": 488}, + {"name": "magazine rack", "id": 1486, "trainId": 489}, + {"name": "teapot", "id": 2723, "trainId": 490}, + {"name": "microphone, mike", "id": 1549, "trainId": 491}, + {"name": "support", "id": 2649, "trainId": 492}, + {"name": "forklift", "id": 1020, "trainId": 493}, + {"name": "canyon", "id": 392, "trainId": 494}, + {"name": "cash register, register", "id": 422, "trainId": 495}, + {"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496}, + {"name": "remote control, remote", "id": 2099, "trainId": 497}, + {"name": "soap dish", "id": 2464, "trainId": 498}, + {"name": "windshield, windscreen", "id": 3058, "trainId": 499}, + {"name": "cat", "id": 430, "trainId": 500}, + {"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501}, + {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502}, + {"name": "videos", "id": 2955, "trainId": 503}, + {"name": "shovel", "id": 2355, "trainId": 504}, + {"name": "eaves", "id": 840, "trainId": 505}, + {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506}, + {"name": "shipyard", "id": 2338, "trainId": 507}, + {"name": "hen, biddy", "id": 1232, "trainId": 508}, + {"name": "traffic cone", "id": 2834, "trainId": 509}, + {"name": "washing machines", "id": 2991, "trainId": 510}, + {"name": "truck crane", "id": 2879, "trainId": 511}, + {"name": "cds", "id": 444, "trainId": 512}, + {"name": "niche", "id": 1657, "trainId": 513}, + {"name": "scoreboard", "id": 2246, "trainId": 514}, + {"name": "briefcase", "id": 296, "trainId": 515}, + {"name": "boot", "id": 245, "trainId": 516}, + {"name": "sweater, jumper", "id": 2661, "trainId": 517}, + {"name": "hay", "id": 1202, "trainId": 518}, + {"name": "pack", "id": 1714, "trainId": 519}, + {"name": 
"bottle rack", "id": 251, "trainId": 520}, + {"name": "glacier", "id": 1095, "trainId": 521}, + {"name": "pergola", "id": 1828, "trainId": 522}, + {"name": "building materials", "id": 311, "trainId": 523}, + {"name": "television camera", "id": 2732, "trainId": 524}, + {"name": "first floor", "id": 947, "trainId": 525}, + {"name": "rifle", "id": 2115, "trainId": 526}, + {"name": "tennis table", "id": 2738, "trainId": 527}, + {"name": "stadium", "id": 2525, "trainId": 528}, + {"name": "safety belt", "id": 2194, "trainId": 529}, + {"name": "cover", "id": 634, "trainId": 530}, + {"name": "dish rack", "id": 740, "trainId": 531}, + {"name": "synthesizer", "id": 2682, "trainId": 532}, + {"name": "pumpkin", "id": 2020, "trainId": 533}, + {"name": "gutter", "id": 1156, "trainId": 534}, + {"name": "fruit stand", "id": 1036, "trainId": 535}, + {"name": "ice floe, floe", "id": 1295, "trainId": 536}, + {"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537}, + {"name": "wheelchair", "id": 3037, "trainId": 538}, + {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539}, + {"name": "diploma", "id": 736, "trainId": 540}, + {"name": "fairground ride", "id": 893, "trainId": 541}, + {"name": "radio", "id": 2047, "trainId": 542}, + {"name": "hotplate", "id": 1274, "trainId": 543}, + {"name": "junk", "id": 1361, "trainId": 544}, + {"name": "wheelbarrow", "id": 3036, "trainId": 545}, + {"name": "stream", "id": 2606, "trainId": 546}, + {"name": "toll plaza", "id": 2797, "trainId": 547}, + {"name": "punching bag", "id": 2022, "trainId": 548}, + {"name": "trough", "id": 2876, "trainId": 549}, + {"name": "throne", "id": 2758, "trainId": 550}, + {"name": "chair desk", "id": 472, "trainId": 551}, + {"name": "weighbridge", "id": 3028, "trainId": 552}, + {"name": "extractor fan", "id": 882, "trainId": 553}, + {"name": "hanging clothes", "id": 1189, "trainId": 554}, + {"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555}, + {"name": "alarm clock, alarm", "id": 3122, "trainId": 556}, + {"name": "ski lift", "id": 2401, "trainId": 557}, + {"name": "chain", "id": 468, "trainId": 558}, + {"name": "garage", "id": 1061, "trainId": 559}, + {"name": "mechanical shovel", "id": 1523, "trainId": 560}, + {"name": "wine rack", "id": 3059, "trainId": 561}, + {"name": "tramway", "id": 2843, "trainId": 562}, + {"name": "treadmill", "id": 2853, "trainId": 563}, + {"name": "menu", "id": 1529, "trainId": 564}, + {"name": "block", "id": 214, "trainId": 565}, + {"name": "well", "id": 3032, "trainId": 566}, + {"name": "witness stand", "id": 3071, "trainId": 567}, + {"name": "branch", "id": 277, "trainId": 568}, + {"name": "duck", "id": 813, "trainId": 569}, + {"name": "casserole", "id": 426, "trainId": 570}, + {"name": "frying pan", "id": 1039, "trainId": 571}, + {"name": "desk organizer", "id": 727, "trainId": 572}, + {"name": "mast", "id": 1508, "trainId": 573}, + {"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574}, + {"name": "service elevator", "id": 2299, "trainId": 575}, + {"name": "dollhouse", "id": 768, "trainId": 576}, + {"name": "hammock", "id": 1172, "trainId": 577}, + {"name": "clothes hanging", "id": 537, "trainId": 578}, + {"name": "photocopier", "id": 1847, "trainId": 579}, + {"name": "notepad", "id": 1664, "trainId": 580}, + {"name": "golf cart", "id": 1110, "trainId": 581}, + {"name": "footpath", "id": 1014, "trainId": 582}, + {"name": "cross", "id": 662, "trainId": 583}, + {"name": "baptismal font", "id": 121, "trainId": 584}, + {"name": "boiler", "id": 
227, "trainId": 585}, + {"name": "skip", "id": 2410, "trainId": 586}, + {"name": "rotisserie", "id": 2165, "trainId": 587}, + {"name": "tables", "id": 2696, "trainId": 588}, + {"name": "water mill", "id": 3005, "trainId": 589}, + {"name": "helmet", "id": 1231, "trainId": 590}, + {"name": "cover curtain", "id": 635, "trainId": 591}, + {"name": "brick", "id": 292, "trainId": 592}, + {"name": "table runner", "id": 2690, "trainId": 593}, + {"name": "ashtray", "id": 65, "trainId": 594}, + {"name": "street box", "id": 2607, "trainId": 595}, + {"name": "stick", "id": 2574, "trainId": 596}, + {"name": "hangers", "id": 1188, "trainId": 597}, + {"name": "cells", "id": 456, "trainId": 598}, + {"name": "urinal", "id": 2913, "trainId": 599}, + {"name": "centerpiece", "id": 459, "trainId": 600}, + {"name": "portable fridge", "id": 1955, "trainId": 601}, + {"name": "dvds", "id": 827, "trainId": 602}, + {"name": "golf club", "id": 1111, "trainId": 603}, + {"name": "skirting board", "id": 2412, "trainId": 604}, + {"name": "water cooler", "id": 2997, "trainId": 605}, + {"name": "clipboard", "id": 528, "trainId": 606}, + {"name": "camera, photographic camera", "id": 366, "trainId": 607}, + {"name": "pigeonhole", "id": 1863, "trainId": 608}, + {"name": "chips", "id": 500, "trainId": 609}, + {"name": "food processor", "id": 1001, "trainId": 610}, + {"name": "post box", "id": 1958, "trainId": 611}, + {"name": "lid", "id": 1441, "trainId": 612}, + {"name": "drum", "id": 809, "trainId": 613}, + {"name": "blender", "id": 210, "trainId": 614}, + {"name": "cave entrance", "id": 435, "trainId": 615}, + {"name": "dental chair", "id": 718, "trainId": 616}, + {"name": "obelisk", "id": 1674, "trainId": 617}, + {"name": "canoe", "id": 388, "trainId": 618}, + {"name": "mobile", "id": 1572, "trainId": 619}, + {"name": "monitors", "id": 1584, "trainId": 620}, + {"name": "pool ball", "id": 1944, "trainId": 621}, + {"name": "cue rack", "id": 674, "trainId": 622}, + {"name": "baggage carts", "id": 99, "trainId": 623}, + {"name": "shore", "id": 2352, "trainId": 624}, + {"name": "fork", "id": 1019, "trainId": 625}, + {"name": "paper filer", "id": 1763, "trainId": 626}, + {"name": "bicycle rack", "id": 185, "trainId": 627}, + {"name": "coat rack", "id": 554, "trainId": 628}, + {"name": "garland", "id": 1066, "trainId": 629}, + {"name": "sports bag", "id": 2508, "trainId": 630}, + {"name": "fish tank", "id": 951, "trainId": 631}, + {"name": "towel dispenser", "id": 2822, "trainId": 632}, + {"name": "carriage", "id": 415, "trainId": 633}, + {"name": "brochure", "id": 297, "trainId": 634}, + {"name": "plaque", "id": 1914, "trainId": 635}, + {"name": "stringer", "id": 2619, "trainId": 636}, + {"name": "iron", "id": 1338, "trainId": 637}, + {"name": "spoon", "id": 2505, "trainId": 638}, + {"name": "flag pole", "id": 955, "trainId": 639}, + {"name": "toilet brush", "id": 2786, "trainId": 640}, + {"name": "book stand", "id": 238, "trainId": 641}, + {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642}, + {"name": "ticket office", "id": 2763, "trainId": 643}, + {"name": "broom", "id": 299, "trainId": 644}, + {"name": "dvd", "id": 822, "trainId": 645}, + {"name": "ice bucket", "id": 1288, "trainId": 646}, + {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647}, + {"name": "tureen", "id": 2894, "trainId": 648}, + {"name": "folders", "id": 992, "trainId": 649}, + {"name": "chess", "id": 489, "trainId": 650}, + {"name": "root", "id": 2157, "trainId": 651}, + {"name": "sewing machine", "id": 2309, 
"trainId": 652}, + {"name": "model", "id": 1576, "trainId": 653}, + {"name": "pen", "id": 1810, "trainId": 654}, + {"name": "violin", "id": 2964, "trainId": 655}, + {"name": "sweatshirt", "id": 2662, "trainId": 656}, + {"name": "recycling materials", "id": 2087, "trainId": 657}, + {"name": "mitten", "id": 1569, "trainId": 658}, + {"name": "chopping board, cutting board", "id": 503, "trainId": 659}, + {"name": "mask", "id": 1505, "trainId": 660}, + {"name": "log", "id": 1468, "trainId": 661}, + {"name": "mouse, computer mouse", "id": 1613, "trainId": 662}, + {"name": "grill", "id": 1138, "trainId": 663}, + {"name": "hole", "id": 1256, "trainId": 664}, + {"name": "target", "id": 2715, "trainId": 665}, + {"name": "trash bag", "id": 2846, "trainId": 666}, + {"name": "chalk", "id": 477, "trainId": 667}, + {"name": "sticks", "id": 2576, "trainId": 668}, + {"name": "balloon", "id": 108, "trainId": 669}, + {"name": "score", "id": 2245, "trainId": 670}, + {"name": "hair spray", "id": 1162, "trainId": 671}, + {"name": "roll", "id": 2149, "trainId": 672}, + {"name": "runner", "id": 2183, "trainId": 673}, + {"name": "engine", "id": 858, "trainId": 674}, + {"name": "inflatable glove", "id": 1324, "trainId": 675}, + {"name": "games", "id": 1055, "trainId": 676}, + {"name": "pallets", "id": 1741, "trainId": 677}, + {"name": "baskets", "id": 149, "trainId": 678}, + {"name": "coop", "id": 615, "trainId": 679}, + {"name": "dvd player", "id": 825, "trainId": 680}, + {"name": "rocking horse", "id": 2143, "trainId": 681}, + {"name": "buckets", "id": 304, "trainId": 682}, + {"name": "bread rolls", "id": 283, "trainId": 683}, + {"name": "shawl", "id": 2322, "trainId": 684}, + {"name": "watering can", "id": 3017, "trainId": 685}, + {"name": "spotlights", "id": 2510, "trainId": 686}, + {"name": "post-it", "id": 1960, "trainId": 687}, + {"name": "bowls", "id": 265, "trainId": 688}, + {"name": "security camera", "id": 2282, "trainId": 689}, + {"name": "runner cloth", "id": 2184, "trainId": 690}, + {"name": "lock", "id": 1461, "trainId": 691}, + {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692}, + {"name": "side", "id": 2372, "trainId": 693}, + {"name": "roulette", "id": 2166, "trainId": 694}, + {"name": "bone", "id": 232, "trainId": 695}, + {"name": "cutlery", "id": 693, "trainId": 696}, + {"name": "pool balls", "id": 1945, "trainId": 697}, + {"name": "wheels", "id": 3039, "trainId": 698}, + {"name": "spice rack", "id": 2494, "trainId": 699}, + {"name": "plant pots", "id": 1908, "trainId": 700}, + {"name": "towel ring", "id": 2827, "trainId": 701}, + {"name": "bread box", "id": 280, "trainId": 702}, + {"name": "video", "id": 2950, "trainId": 703}, + {"name": "funfair", "id": 1044, "trainId": 704}, + {"name": "breads", "id": 288, "trainId": 705}, + {"name": "tripod", "id": 2863, "trainId": 706}, + {"name": "ironing board", "id": 1342, "trainId": 707}, + {"name": "skimmer", "id": 2409, "trainId": 708}, + {"name": "hollow", "id": 1258, "trainId": 709}, + {"name": "scratching post", "id": 2249, "trainId": 710}, + {"name": "tricycle", "id": 2862, "trainId": 711}, + {"name": "file box", "id": 920, "trainId": 712}, + {"name": "mountain pass", "id": 1607, "trainId": 713}, + {"name": "tombstones", "id": 2802, "trainId": 714}, + {"name": "cooker", "id": 610, "trainId": 715}, + {"name": "card game, cards", "id": 3129, "trainId": 716}, + {"name": "golf bag", "id": 1108, "trainId": 717}, + {"name": "towel paper", "id": 2823, "trainId": 718}, + {"name": "chaise lounge", "id": 476, "trainId": 719}, + 
{"name": "sun", "id": 2641, "trainId": 720}, + {"name": "toilet paper holder", "id": 2788, "trainId": 721}, + {"name": "rake", "id": 2070, "trainId": 722}, + {"name": "key", "id": 1368, "trainId": 723}, + {"name": "umbrella stand", "id": 2903, "trainId": 724}, + {"name": "dartboard", "id": 699, "trainId": 725}, + {"name": "transformer", "id": 2844, "trainId": 726}, + {"name": "fireplace utensils", "id": 942, "trainId": 727}, + {"name": "sweatshirts", "id": 2663, "trainId": 728}, + { + "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone", + "id": 457, + "trainId": 729, + }, + {"name": "tallboy", "id": 2701, "trainId": 730}, + {"name": "stapler", "id": 2540, "trainId": 731}, + {"name": "sauna", "id": 2231, "trainId": 732}, + {"name": "test tube", "id": 2746, "trainId": 733}, + {"name": "palette", "id": 1738, "trainId": 734}, + {"name": "shopping carts", "id": 2350, "trainId": 735}, + {"name": "tools", "id": 2808, "trainId": 736}, + {"name": "push button, push, button", "id": 2025, "trainId": 737}, + {"name": "star", "id": 2541, "trainId": 738}, + {"name": "roof rack", "id": 2156, "trainId": 739}, + {"name": "barbed wire", "id": 126, "trainId": 740}, + {"name": "spray", "id": 2512, "trainId": 741}, + {"name": "ear", "id": 831, "trainId": 742}, + {"name": "sponge", "id": 2503, "trainId": 743}, + {"name": "racket", "id": 2039, "trainId": 744}, + {"name": "tins", "id": 2774, "trainId": 745}, + {"name": "eyeglasses", "id": 886, "trainId": 746}, + {"name": "file", "id": 919, "trainId": 747}, + {"name": "scarfs", "id": 2240, "trainId": 748}, + {"name": "sugar bowl", "id": 2636, "trainId": 749}, + {"name": "flip flop", "id": 963, "trainId": 750}, + {"name": "headstones", "id": 1218, "trainId": 751}, + {"name": "laptop bag", "id": 1406, "trainId": 752}, + {"name": "leash", "id": 1420, "trainId": 753}, + {"name": "climbing frame", "id": 526, "trainId": 754}, + {"name": "suit hanger", "id": 2639, "trainId": 755}, + {"name": "floor spotlight", "id": 975, "trainId": 756}, + {"name": "plate rack", "id": 1921, "trainId": 757}, + {"name": "sewer", "id": 2305, "trainId": 758}, + {"name": "hard drive", "id": 1193, "trainId": 759}, + {"name": "sprinkler", "id": 2517, "trainId": 760}, + {"name": "tools box", "id": 2809, "trainId": 761}, + {"name": "necklace", "id": 1647, "trainId": 762}, + {"name": "bulbs", "id": 314, "trainId": 763}, + {"name": "steel industry", "id": 2560, "trainId": 764}, + {"name": "club", "id": 545, "trainId": 765}, + {"name": "jack", "id": 1345, "trainId": 766}, + {"name": "door bars", "id": 775, "trainId": 767}, + { + "name": "control panel, instrument panel, control board, board, panel", + "id": 603, + "trainId": 768, + }, + {"name": "hairbrush", "id": 1163, "trainId": 769}, + {"name": "napkin holder", "id": 1641, "trainId": 770}, + {"name": "office", "id": 1678, "trainId": 771}, + {"name": "smoke detector", "id": 2450, "trainId": 772}, + {"name": "utensils", "id": 2915, "trainId": 773}, + {"name": "apron", "id": 42, "trainId": 774}, + {"name": "scissors", "id": 2242, "trainId": 775}, + {"name": "terminal", "id": 2741, "trainId": 776}, + {"name": "grinder", "id": 1143, "trainId": 777}, + {"name": "entry phone", "id": 862, "trainId": 778}, + {"name": "newspaper stand", "id": 1654, "trainId": 779}, + {"name": "pepper shaker", "id": 1826, "trainId": 780}, + {"name": "onions", "id": 1689, "trainId": 781}, + { + "name": "central processing unit, cpu, c p u , central processor, processor, mainframe", + "id": 3124, + "trainId": 782, + }, + {"name": "tape", "id": 2710, 
"trainId": 783}, + {"name": "bat", "id": 152, "trainId": 784}, + {"name": "coaster", "id": 549, "trainId": 785}, + {"name": "calculator", "id": 360, "trainId": 786}, + {"name": "potatoes", "id": 1982, "trainId": 787}, + {"name": "luggage rack", "id": 1478, "trainId": 788}, + {"name": "salt", "id": 2203, "trainId": 789}, + {"name": "street number", "id": 2612, "trainId": 790}, + {"name": "viewpoint", "id": 2956, "trainId": 791}, + {"name": "sword", "id": 2681, "trainId": 792}, + {"name": "cd", "id": 437, "trainId": 793}, + {"name": "rowing machine", "id": 2171, "trainId": 794}, + {"name": "plug", "id": 1933, "trainId": 795}, + {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796}, + {"name": "pepper", "id": 1824, "trainId": 797}, + {"name": "tongs", "id": 2803, "trainId": 798}, + {"name": "bonfire", "id": 234, "trainId": 799}, + {"name": "dog dish", "id": 764, "trainId": 800}, + {"name": "belt", "id": 177, "trainId": 801}, + {"name": "dumbbells", "id": 817, "trainId": 802}, + {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803}, + {"name": "hook", "id": 1262, "trainId": 804}, + {"name": "envelopes", "id": 864, "trainId": 805}, + {"name": "shower faucet", "id": 2359, "trainId": 806}, + {"name": "watch", "id": 2992, "trainId": 807}, + {"name": "padlock", "id": 1725, "trainId": 808}, + {"name": "swimming pool ladder", "id": 2667, "trainId": 809}, + {"name": "spanners", "id": 2484, "trainId": 810}, + {"name": "gravy boat", "id": 1133, "trainId": 811}, + {"name": "notice board", "id": 1667, "trainId": 812}, + {"name": "trash bags", "id": 2847, "trainId": 813}, + {"name": "fire alarm", "id": 932, "trainId": 814}, + {"name": "ladle", "id": 1392, "trainId": 815}, + {"name": "stethoscope", "id": 2573, "trainId": 816}, + {"name": "rocket", "id": 2140, "trainId": 817}, + {"name": "funnel", "id": 1046, "trainId": 818}, + {"name": "bowling pins", "id": 264, "trainId": 819}, + {"name": "valve", "id": 2927, "trainId": 820}, + {"name": "thermometer", "id": 2752, "trainId": 821}, + {"name": "cups", "id": 679, "trainId": 822}, + {"name": "spice jar", "id": 2493, "trainId": 823}, + {"name": "night light", "id": 1658, "trainId": 824}, + {"name": "soaps", "id": 2466, "trainId": 825}, + {"name": "games table", "id": 1057, "trainId": 826}, + {"name": "slotted spoon", "id": 2444, "trainId": 827}, + {"name": "reel", "id": 2093, "trainId": 828}, + {"name": "scourer", "id": 2248, "trainId": 829}, + {"name": "sleeping robe", "id": 2432, "trainId": 830}, + {"name": "desk mat", "id": 726, "trainId": 831}, + {"name": "dumbbell", "id": 816, "trainId": 832}, + {"name": "hammer", "id": 1171, "trainId": 833}, + {"name": "tie", "id": 2766, "trainId": 834}, + {"name": "typewriter", "id": 2900, "trainId": 835}, + {"name": "shaker", "id": 2313, "trainId": 836}, + {"name": "cheese dish", "id": 488, "trainId": 837}, + {"name": "sea star", "id": 2265, "trainId": 838}, + {"name": "racquet", "id": 2043, "trainId": 839}, + {"name": "butane gas cylinder", "id": 332, "trainId": 840}, + {"name": "paper weight", "id": 1771, "trainId": 841}, + {"name": "shaving brush", "id": 2320, "trainId": 842}, + {"name": "sunglasses", "id": 2646, "trainId": 843}, + {"name": "gear shift", "id": 1089, "trainId": 844}, + {"name": "towel rail", "id": 2826, "trainId": 845}, + {"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846}, +] + + +def _get_ade20k_full_meta(): + # Id 0 is reserved for ignore_label, we change ignore_label for 0 + # to 255 in our pre-processing, so all ids are shifted by 1. 
+    stuff_ids = [k["id"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES]
+    assert len(stuff_ids) == 847, len(stuff_ids)
+
+    # For semantic segmentation, this mapping maps dataset stuff ids to contiguous
+    # ids in [0, 846] (used in models); the dataset ids are used when processing results.
+    stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)}
+    stuff_classes = [k["name"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES]
+
+    ret = {
+        "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
+        "stuff_classes": stuff_classes,
+    }
+    return ret
+
+
+def register_all_ade20k_full(root):
+    root = os.path.join(root, "ADE20K_2021_17_01")
+    meta = _get_ade20k_full_meta()
+    for name, dirname in [("train", "training"), ("val", "validation")]:
+        image_dir = os.path.join(root, "images_detectron2", dirname)
+        gt_dir = os.path.join(root, "annotations_detectron2", dirname)
+        name = f"ade20k_full_sem_seg_{name}"
+        DatasetCatalog.register(
+            name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="tif", image_ext="jpg")
+        )
+        MetadataCatalog.get(name).set(
+            stuff_classes=meta["stuff_classes"][:],
+            image_root=image_dir,
+            sem_seg_root=gt_dir,
+            evaluator_type="sem_seg",
+            ignore_label=65535,  # NOTE: gt is saved in 16-bit TIFF images
+        )
+
+
+_root = os.getenv("DETECTRON2_DATASETS", "datasets")
+register_all_ade20k_full(_root)
diff --git a/detrex/data/datasets/register_ade20k_instance.py b/detrex/data/datasets/register_ade20k_instance.py
new file mode 100644
index 00000000..0d8a124d
--- /dev/null
+++ b/detrex/data/datasets/register_ade20k_instance.py
@@ -0,0 +1,73 @@
+# coding=utf-8
+# Copyright 2022 The IDEA Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py +# ------------------------------------------------------------------------------------------------ + +import json +import logging +import numpy as np +import os +from PIL import Image + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets.coco import load_coco_json, register_coco_instances +from detectron2.utils.file_io import PathManager + +ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, 
{'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] + + +_PREDEFINED_SPLITS = { + # point annotations without masks + "ade20k_instance_train": ( + "ADEChallengeData2016/images/training", + "ADEChallengeData2016/ade20k_instance_train.json", + ), + "ade20k_instance_val": ( + "ADEChallengeData2016/images/validation", + "ADEChallengeData2016/ade20k_instance_val.json", + ), +} + + +def _get_ade_instances_meta(): + thing_ids = [k["id"] for k in ADE_CATEGORIES] + assert len(thing_ids) == 100, len(thing_ids) + # Mapping from the incontiguous ADE category id to an id in [0, 99] + thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} + thing_classes = [k["name"] for k in ADE_CATEGORIES] + ret = { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + } + return ret + + +def register_all_ade20k_instance(root): + for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): + # Assume pre-defined datasets live in `./datasets`. + register_coco_instances( + key, + _get_ade_instances_meta(), + os.path.join(root, json_file) if "://" not in json_file else json_file, + os.path.join(root, image_root), + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_ade20k_instance(_root) diff --git a/detrex/data/datasets/register_ade20k_panoptic.py b/detrex/data/datasets/register_ade20k_panoptic.py new file mode 100644 index 00000000..da7b2e25 --- /dev/null +++ b/detrex/data/datasets/register_ade20k_panoptic.py @@ -0,0 +1,410 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_panoptic.py +# ------------------------------------------------------------------------------------------------ + +import json +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.utils.file_io import PathManager + +ADE20K_150_CATEGORIES = [ + {"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"}, + {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"}, + {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"}, + {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"}, + {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"}, + {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"}, + {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"}, + {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"}, + {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "}, + {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"}, + {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"}, + {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"}, + {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"}, + {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"}, + {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"}, + {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"}, + {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"}, + {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"}, + {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"}, + {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"}, + {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"}, + {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"}, + {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"}, + {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"}, + {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"}, + {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"}, + {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"}, + {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"}, + {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"}, + {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"}, + {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"}, + {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"}, + {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"}, + {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"}, + {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"}, + {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"}, + {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"}, + {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"}, + {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"}, + {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"}, + {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"}, + {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"}, + {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"}, + 
{"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"}, + { + "color": [6, 51, 255], + "id": 44, + "isthing": 1, + "name": "chest of drawers, chest, bureau, dresser", + }, + {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"}, + {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"}, + {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"}, + {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"}, + {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"}, + {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"}, + {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"}, + {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"}, + {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"}, + {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"}, + {"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"}, + { + "color": [255, 71, 0], + "id": 56, + "isthing": 1, + "name": "pool table, billiard table, snooker table", + }, + {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"}, + {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"}, + {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"}, + {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"}, + {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"}, + {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"}, + {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"}, + {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"}, + { + "color": [0, 255, 133], + "id": 65, + "isthing": 1, + "name": "toilet, can, commode, crapper, pot, potty, stool, throne", + }, + {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"}, + {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"}, + {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"}, + {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"}, + {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"}, + {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"}, + {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"}, + {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"}, + {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"}, + {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"}, + {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"}, + {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"}, + {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"}, + {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"}, + {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"}, + {"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"}, + {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"}, + {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"}, + {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"}, + {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"}, + {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"}, + {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"}, + {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": 
"booth"}, + {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"}, + {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"}, + {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"}, + {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"}, + {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"}, + {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"}, + { + "color": [0, 122, 255], + "id": 95, + "isthing": 1, + "name": "bannister, banister, balustrade, balusters, handrail", + }, + { + "color": [0, 255, 163], + "id": 96, + "isthing": 0, + "name": "escalator, moving staircase, moving stairway", + }, + { + "color": [255, 153, 0], + "id": 97, + "isthing": 1, + "name": "ottoman, pouf, pouffe, puff, hassock", + }, + {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"}, + {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"}, + { + "color": [143, 255, 0], + "id": 100, + "isthing": 0, + "name": "poster, posting, placard, notice, bill, card", + }, + {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"}, + {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"}, + {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"}, + {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"}, + { + "color": [133, 0, 255], + "id": 105, + "isthing": 0, + "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", + }, + {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"}, + { + "color": [184, 0, 255], + "id": 107, + "isthing": 1, + "name": "washer, automatic washer, washing machine", + }, + {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"}, + {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"}, + {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"}, + {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"}, + {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"}, + {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"}, + {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"}, + {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"}, + {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"}, + {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"}, + {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"}, + {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"}, + {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"}, + {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"}, + {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"}, + {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"}, + {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"}, + {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"}, + {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"}, + {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"}, + {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"}, + {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"}, + {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"}, + {"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"}, + {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"}, + {"color": [0, 153, 
255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"}, + {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"}, + {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"}, + {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"}, + {"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"}, + {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"}, + {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"}, + {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"}, + {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"}, + {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"}, + {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"}, + {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"}, + {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"}, + {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"}, + {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"}, + {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"}, + {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"}, +] + +ADE20k_COLORS = [k["color"] for k in ADE20K_150_CATEGORIES] + +MetadataCatalog.get("ade20k_sem_seg_train").set( + stuff_colors=ADE20k_COLORS[:], +) + +MetadataCatalog.get("ade20k_sem_seg_val").set( + stuff_colors=ADE20k_COLORS[:], +) + + +def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): + """ + Args: + image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". + gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". + json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". + Returns: + list[dict]: a list of dicts in Detectron2 standard format. (See + `Using Custom Datasets `_ ) + """ + + def _convert_category_id(segment_info, meta): + if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: + segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = True + else: + segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = False + return segment_info + + with PathManager.open(json_file) as f: + json_info = json.load(f) + + ret = [] + for ann in json_info["annotations"]: + image_id = ann["image_id"] + # TODO: currently we assume image and label has the same filename but + # different extension, and images have extension ".jpg" for COCO. Need + # to make image extension a user-provided argument if we extend this + # function to support other COCO-like datasets. + image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") + label_file = os.path.join(gt_dir, ann["file_name"]) + sem_label_file = os.path.join(semseg_dir, ann["file_name"]) + segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] + ret.append( + { + "file_name": image_file, + "image_id": image_id, + "pan_seg_file_name": label_file, + "sem_seg_file_name": sem_label_file, + "segments_info": segments_info, + } + ) + assert len(ret), f"No images found in {image_dir}!" 
+ assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] + assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] + assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] + return ret + + +def register_ade20k_panoptic( + name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None +): + """ + Register a "standard" version of ADE20k panoptic segmentation dataset named `name`. + The dictionaries in this registered dataset follows detectron2's standard format. + Hence it's called "standard". + Args: + name (str): the name that identifies a dataset, + e.g. "ade20k_panoptic_train" + metadata (dict): extra metadata associated with this dataset. + image_root (str): directory which contains all the images + panoptic_root (str): directory which contains panoptic annotation images in COCO format + panoptic_json (str): path to the json panoptic annotation file in COCO format + sem_seg_root (none): not used, to be consistent with + `register_coco_panoptic_separated`. + instances_json (str): path to the json instance annotation file + """ + panoptic_name = name + DatasetCatalog.register( + panoptic_name, + lambda: load_ade20k_panoptic_json( + panoptic_json, image_root, panoptic_root, semantic_root, metadata + ), + ) + MetadataCatalog.get(panoptic_name).set( + panoptic_root=panoptic_root, + image_root=image_root, + panoptic_json=panoptic_json, + json_file=instances_json, + evaluator_type="ade20k_panoptic_seg", + ignore_label=255, + label_divisor=1000, + **metadata, + ) + + +_PREDEFINED_SPLITS_ADE20K_PANOPTIC = { + "ade20k_panoptic_train": ( + "ADEChallengeData2016/images/training", + "ADEChallengeData2016/ade20k_panoptic_train", + "ADEChallengeData2016/ade20k_panoptic_train.json", + "ADEChallengeData2016/annotations_detectron2/training", + "ADEChallengeData2016/ade20k_instance_train.json", + ), + "ade20k_panoptic_val": ( + "ADEChallengeData2016/images/validation", + "ADEChallengeData2016/ade20k_panoptic_val", + "ADEChallengeData2016/ade20k_panoptic_val.json", + "ADEChallengeData2016/annotations_detectron2/validation", + "ADEChallengeData2016/ade20k_instance_val.json", + ), +} + + +def get_metadata(): + meta = {} + # The following metadata maps contiguous id from [0, #thing categories + + # #stuff categories) to their names and colors. We have to replica of the + # same name and color under "thing_*" and "stuff_*" because the current + # visualization function in D2 handles thing and class classes differently + # due to some heuristic used in Panoptic FPN. We keep the same naming to + # enable reusing existing visualization functions. + thing_classes = [k["name"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] + thing_colors = [k["color"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] + stuff_classes = [k["name"] for k in ADE20K_150_CATEGORIES] + stuff_colors = [k["color"] for k in ADE20K_150_CATEGORIES] + + meta["thing_classes"] = thing_classes + meta["thing_colors"] = thing_colors + meta["stuff_classes"] = stuff_classes + meta["stuff_colors"] = stuff_colors + + # Convert category id for training: + # category id: like semantic segmentation, it is the class id for each + # pixel. Since there are some classes not used in evaluation, the category + # id is not always contiguous and thus we have two set of category ids: + # - original category id: category id in the original dataset, mainly + # used for evaluation. 
+ # - contiguous category id: [0, #classes), in order to train the linear + # softmax classifier. + thing_dataset_id_to_contiguous_id = {} + stuff_dataset_id_to_contiguous_id = {} + + for i, cat in enumerate(ADE20K_150_CATEGORIES): + if cat["isthing"]: + thing_dataset_id_to_contiguous_id[cat["id"]] = i + # else: + # stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + # in order to use sem_seg evaluator + stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id + meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id + + return meta + + +def register_all_ade20k_panoptic(root): + metadata = get_metadata() + for ( + prefix, + (image_root, panoptic_root, panoptic_json, semantic_root, instance_json), + ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): + # The "standard" version of COCO panoptic segmentation dataset, + # e.g. used by Panoptic-DeepLab + register_ade20k_panoptic( + prefix, + metadata, + os.path.join(root, image_root), + os.path.join(root, panoptic_root), + os.path.join(root, semantic_root), + os.path.join(root, panoptic_json), + os.path.join(root, instance_json), + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_ade20k_panoptic(_root) diff --git a/detrex/data/datasets/register_coco_panoptic_annos_semseg.py b/detrex/data/datasets/register_coco_panoptic_annos_semseg.py new file mode 100644 index 00000000..2f6eaf01 --- /dev/null +++ b/detrex/data/datasets/register_coco_panoptic_annos_semseg.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. +# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py +# ------------------------------------------------------------------------------------------------ + +import json +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg +from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES +from detectron2.utils.file_io import PathManager + + +_PREDEFINED_SPLITS_COCO_PANOPTIC = { + "coco_2017_train_panoptic": ( + # This is the original panoptic annotation directory + "coco/panoptic_train2017", + "coco/annotations/panoptic_train2017.json", + # This directory contains semantic annotations that are + # converted from panoptic annotations. + # It is used by PanopticFPN. + # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py + # to create these directories. 
+ "coco/panoptic_semseg_train2017", + ), + "coco_2017_val_panoptic": ( + "coco/panoptic_val2017", + "coco/annotations/panoptic_val2017.json", + "coco/panoptic_semseg_val2017", + ), +} + + +def get_metadata(): + meta = {} + # The following metadata maps contiguous id from [0, #thing categories + + # #stuff categories) to their names and colors. We have to replica of the + # same name and color under "thing_*" and "stuff_*" because the current + # visualization function in D2 handles thing and class classes differently + # due to some heuristic used in Panoptic FPN. We keep the same naming to + # enable reusing existing visualization functions. + thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] + thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] + stuff_classes = [k["name"] for k in COCO_CATEGORIES] + stuff_colors = [k["color"] for k in COCO_CATEGORIES] + + meta["thing_classes"] = thing_classes + meta["thing_colors"] = thing_colors + meta["stuff_classes"] = stuff_classes + meta["stuff_colors"] = stuff_colors + + # Convert category id for training: + # category id: like semantic segmentation, it is the class id for each + # pixel. Since there are some classes not used in evaluation, the category + # id is not always contiguous and thus we have two set of category ids: + # - original category id: category id in the original dataset, mainly + # used for evaluation. + # - contiguous category id: [0, #classes), in order to train the linear + # softmax classifier. + thing_dataset_id_to_contiguous_id = {} + stuff_dataset_id_to_contiguous_id = {} + + for i, cat in enumerate(COCO_CATEGORIES): + if cat["isthing"]: + thing_dataset_id_to_contiguous_id[cat["id"]] = i + # else: + # stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + # in order to use sem_seg evaluator + stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id + meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id + + return meta + + +def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): + """ + Args: + image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". + gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". + json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". + Returns: + list[dict]: a list of dicts in Detectron2 standard format. (See + `Using Custom Datasets `_ ) + """ + + def _convert_category_id(segment_info, meta): + if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: + segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = True + else: + segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = False + return segment_info + + with PathManager.open(json_file) as f: + json_info = json.load(f) + + ret = [] + for ann in json_info["annotations"]: + image_id = int(ann["image_id"]) + # TODO: currently we assume image and label has the same filename but + # different extension, and images have extension ".jpg" for COCO. Need + # to make image extension a user-provided argument if we extend this + # function to support other COCO-like datasets. 
+ image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") + label_file = os.path.join(gt_dir, ann["file_name"]) + sem_label_file = os.path.join(semseg_dir, ann["file_name"]) + segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] + ret.append( + { + "file_name": image_file, + "image_id": image_id, + "pan_seg_file_name": label_file, + "sem_seg_file_name": sem_label_file, + "segments_info": segments_info, + } + ) + assert len(ret), f"No images found in {image_dir}!" + assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] + assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] + assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] + return ret + + +def register_coco_panoptic_annos_sem_seg( + name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json +): + panoptic_name = name + delattr(MetadataCatalog.get(panoptic_name), "thing_classes") + delattr(MetadataCatalog.get(panoptic_name), "thing_colors") + MetadataCatalog.get(panoptic_name).set( + thing_classes=metadata["thing_classes"], + thing_colors=metadata["thing_colors"], + # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], + ) + + # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" + semantic_name = name + "_with_sem_seg" + DatasetCatalog.register( + semantic_name, + lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), + ) + MetadataCatalog.get(semantic_name).set( + sem_seg_root=sem_seg_root, + panoptic_root=panoptic_root, + image_root=image_root, + panoptic_json=panoptic_json, + json_file=instances_json, + evaluator_type="coco_panoptic_seg", + ignore_label=255, + label_divisor=1000, + **metadata, + ) + + +def register_all_coco_panoptic_annos_sem_seg(root): + for ( + prefix, + (panoptic_root, panoptic_json, semantic_root), + ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): + prefix_instances = prefix[: -len("_panoptic")] + instances_meta = MetadataCatalog.get(prefix_instances) + image_root, instances_json = instances_meta.image_root, instances_meta.json_file + + register_coco_panoptic_annos_sem_seg( + prefix, + get_metadata(), + image_root, + os.path.join(root, panoptic_root), + os.path.join(root, panoptic_json), + os.path.join(root, semantic_root), + instances_json, + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_coco_panoptic_annos_sem_seg(_root) diff --git a/detrex/data/datasets/register_coco_stuff_10k.py b/detrex/data/datasets/register_coco_stuff_10k.py new file mode 100644 index 00000000..c08f066d --- /dev/null +++ b/detrex/data/datasets/register_coco_stuff_10k.py @@ -0,0 +1,243 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. 
and its affiliates. +# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_coco_stuff_10k.py +# ------------------------------------------------------------------------------------------------ + +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, + {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, + {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + 
{"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, + {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"id": 92, "name": "banner", "supercategory": "textile"}, + {"id": 93, "name": "blanket", "supercategory": "textile"}, + {"id": 94, "name": "branch", "supercategory": "plant"}, + {"id": 95, "name": "bridge", "supercategory": "building"}, + {"id": 96, "name": "building-other", "supercategory": "building"}, + {"id": 97, "name": "bush", "supercategory": "plant"}, + {"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"}, + {"id": 99, "name": "cage", "supercategory": "structural"}, + {"id": 100, "name": "cardboard", "supercategory": "raw-material"}, + {"id": 101, "name": "carpet", "supercategory": "floor"}, + {"id": 102, "name": "ceiling-other", "supercategory": "ceiling"}, + {"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"}, + {"id": 104, "name": "cloth", "supercategory": "textile"}, + {"id": 105, "name": "clothes", "supercategory": "textile"}, + {"id": 106, "name": "clouds", "supercategory": 
"sky"}, + {"id": 107, "name": "counter", "supercategory": "furniture-stuff"}, + {"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"}, + {"id": 109, "name": "curtain", "supercategory": "textile"}, + {"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"}, + {"id": 111, "name": "dirt", "supercategory": "ground"}, + {"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"}, + {"id": 113, "name": "fence", "supercategory": "structural"}, + {"id": 114, "name": "floor-marble", "supercategory": "floor"}, + {"id": 115, "name": "floor-other", "supercategory": "floor"}, + {"id": 116, "name": "floor-stone", "supercategory": "floor"}, + {"id": 117, "name": "floor-tile", "supercategory": "floor"}, + {"id": 118, "name": "floor-wood", "supercategory": "floor"}, + {"id": 119, "name": "flower", "supercategory": "plant"}, + {"id": 120, "name": "fog", "supercategory": "water"}, + {"id": 121, "name": "food-other", "supercategory": "food-stuff"}, + {"id": 122, "name": "fruit", "supercategory": "food-stuff"}, + {"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"}, + {"id": 124, "name": "grass", "supercategory": "plant"}, + {"id": 125, "name": "gravel", "supercategory": "ground"}, + {"id": 126, "name": "ground-other", "supercategory": "ground"}, + {"id": 127, "name": "hill", "supercategory": "solid"}, + {"id": 128, "name": "house", "supercategory": "building"}, + {"id": 129, "name": "leaves", "supercategory": "plant"}, + {"id": 130, "name": "light", "supercategory": "furniture-stuff"}, + {"id": 131, "name": "mat", "supercategory": "textile"}, + {"id": 132, "name": "metal", "supercategory": "raw-material"}, + {"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"}, + {"id": 134, "name": "moss", "supercategory": "plant"}, + {"id": 135, "name": "mountain", "supercategory": "solid"}, + {"id": 136, "name": "mud", "supercategory": "ground"}, + {"id": 137, "name": "napkin", "supercategory": "textile"}, + {"id": 138, "name": "net", "supercategory": "structural"}, + {"id": 139, "name": "paper", "supercategory": "raw-material"}, + {"id": 140, "name": "pavement", "supercategory": "ground"}, + {"id": 141, "name": "pillow", "supercategory": "textile"}, + {"id": 142, "name": "plant-other", "supercategory": "plant"}, + {"id": 143, "name": "plastic", "supercategory": "raw-material"}, + {"id": 144, "name": "platform", "supercategory": "ground"}, + {"id": 145, "name": "playingfield", "supercategory": "ground"}, + {"id": 146, "name": "railing", "supercategory": "structural"}, + {"id": 147, "name": "railroad", "supercategory": "ground"}, + {"id": 148, "name": "river", "supercategory": "water"}, + {"id": 149, "name": "road", "supercategory": "ground"}, + {"id": 150, "name": "rock", "supercategory": "solid"}, + {"id": 151, "name": "roof", "supercategory": "building"}, + {"id": 152, "name": "rug", "supercategory": "textile"}, + {"id": 153, "name": "salad", "supercategory": "food-stuff"}, + {"id": 154, "name": "sand", "supercategory": "ground"}, + {"id": 155, "name": "sea", "supercategory": "water"}, + {"id": 156, "name": "shelf", "supercategory": "furniture-stuff"}, + {"id": 157, "name": "sky-other", "supercategory": "sky"}, + {"id": 158, "name": "skyscraper", "supercategory": "building"}, + {"id": 159, "name": "snow", "supercategory": "ground"}, + {"id": 160, "name": "solid-other", "supercategory": "solid"}, + {"id": 161, "name": "stairs", "supercategory": "furniture-stuff"}, + {"id": 162, "name": "stone", "supercategory": "solid"}, + {"id": 163, "name": 
"straw", "supercategory": "plant"}, + {"id": 164, "name": "structural-other", "supercategory": "structural"}, + {"id": 165, "name": "table", "supercategory": "furniture-stuff"}, + {"id": 166, "name": "tent", "supercategory": "building"}, + {"id": 167, "name": "textile-other", "supercategory": "textile"}, + {"id": 168, "name": "towel", "supercategory": "textile"}, + {"id": 169, "name": "tree", "supercategory": "plant"}, + {"id": 170, "name": "vegetable", "supercategory": "food-stuff"}, + {"id": 171, "name": "wall-brick", "supercategory": "wall"}, + {"id": 172, "name": "wall-concrete", "supercategory": "wall"}, + {"id": 173, "name": "wall-other", "supercategory": "wall"}, + {"id": 174, "name": "wall-panel", "supercategory": "wall"}, + {"id": 175, "name": "wall-stone", "supercategory": "wall"}, + {"id": 176, "name": "wall-tile", "supercategory": "wall"}, + {"id": 177, "name": "wall-wood", "supercategory": "wall"}, + {"id": 178, "name": "water-other", "supercategory": "water"}, + {"id": 179, "name": "waterdrops", "supercategory": "water"}, + {"id": 180, "name": "window-blind", "supercategory": "window"}, + {"id": 181, "name": "window-other", "supercategory": "window"}, + {"id": 182, "name": "wood", "supercategory": "solid"}, +] + + +def _get_coco_stuff_meta(): + # Id 0 is reserved for ignore_label, we change ignore_label for 0 + # to 255 in our pre-processing. + stuff_ids = [k["id"] for k in COCO_CATEGORIES] + assert len(stuff_ids) == 171, len(stuff_ids) + + # For semantic segmentation, this mapping maps from contiguous stuff id + # (in [0, 91], used in models) to ids in the dataset (used for processing results) + stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} + stuff_classes = [k["name"] for k in COCO_CATEGORIES] + + ret = { + "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, + "stuff_classes": stuff_classes, + } + return ret + + +def register_all_coco_stuff_10k(root): + root = os.path.join(root, "coco", "coco_stuff_10k") + meta = _get_coco_stuff_meta() + for name, image_dirname, sem_seg_dirname in [ + ("train", "images_detectron2/train", "annotations_detectron2/train"), + ("test", "images_detectron2/test", "annotations_detectron2/test"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + name = f"coco_2017_{name}_stuff_10k_sem_seg" + DatasetCatalog.register( + name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") + ) + MetadataCatalog.get(name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_coco_stuff_10k(_root) diff --git a/detrex/data/datasets/register_mapillary_vistas.py b/detrex/data/datasets/register_mapillary_vistas.py new file mode 100644 index 00000000..354e2b75 --- /dev/null +++ b/detrex/data/datasets/register_mapillary_vistas.py @@ -0,0 +1,527 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. +# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_mapillary_vistas_panoptic.py +# ------------------------------------------------------------------------------------------------ + +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [ + { + "color": [165, 42, 42], + "instances": True, + "readable": "Bird", + "name": "animal--bird", + "evaluate": True, + }, + { + "color": [0, 192, 0], + "instances": True, + "readable": "Ground Animal", + "name": "animal--ground-animal", + "evaluate": True, + }, + { + "color": [196, 196, 196], + "instances": False, + "readable": "Curb", + "name": "construction--barrier--curb", + "evaluate": True, + }, + { + "color": [190, 153, 153], + "instances": False, + "readable": "Fence", + "name": "construction--barrier--fence", + "evaluate": True, + }, + { + "color": [180, 165, 180], + "instances": False, + "readable": "Guard Rail", + "name": "construction--barrier--guard-rail", + "evaluate": True, + }, + { + "color": [90, 120, 150], + "instances": False, + "readable": "Barrier", + "name": "construction--barrier--other-barrier", + "evaluate": True, + }, + { + "color": [102, 102, 156], + "instances": False, + "readable": "Wall", + "name": "construction--barrier--wall", + "evaluate": True, + }, + { + "color": [128, 64, 255], + "instances": False, + "readable": "Bike Lane", + "name": "construction--flat--bike-lane", + "evaluate": True, + }, + { + "color": [140, 140, 200], + "instances": True, + "readable": "Crosswalk - Plain", + "name": "construction--flat--crosswalk-plain", + "evaluate": True, + }, + { + "color": [170, 170, 170], + "instances": False, + "readable": "Curb Cut", + "name": "construction--flat--curb-cut", + "evaluate": True, + }, + { + "color": [250, 170, 160], + "instances": False, + "readable": "Parking", + "name": "construction--flat--parking", + "evaluate": True, + }, + { + "color": [96, 96, 96], + "instances": False, + "readable": "Pedestrian Area", + "name": "construction--flat--pedestrian-area", + "evaluate": True, + }, + { + "color": [230, 150, 140], + "instances": False, + "readable": "Rail Track", + "name": "construction--flat--rail-track", + "evaluate": True, + }, + { + "color": [128, 64, 128], + "instances": False, + "readable": "Road", + "name": "construction--flat--road", + "evaluate": True, + }, + { + "color": [110, 110, 110], + "instances": False, + "readable": "Service Lane", + "name": "construction--flat--service-lane", + "evaluate": True, + }, + { + "color": [244, 35, 232], + "instances": False, + "readable": "Sidewalk", + "name": "construction--flat--sidewalk", + "evaluate": True, + }, + { + "color": [150, 100, 100], + "instances": False, + "readable": "Bridge", + "name": "construction--structure--bridge", + "evaluate": True, + }, + { + "color": [70, 70, 70], + "instances": False, + "readable": "Building", + "name": "construction--structure--building", + "evaluate": True, + }, + { + "color": [150, 120, 90], + "instances": False, + "readable": "Tunnel", + "name": "construction--structure--tunnel", + "evaluate": 
True, + }, + { + "color": [220, 20, 60], + "instances": True, + "readable": "Person", + "name": "human--person", + "evaluate": True, + }, + { + "color": [255, 0, 0], + "instances": True, + "readable": "Bicyclist", + "name": "human--rider--bicyclist", + "evaluate": True, + }, + { + "color": [255, 0, 100], + "instances": True, + "readable": "Motorcyclist", + "name": "human--rider--motorcyclist", + "evaluate": True, + }, + { + "color": [255, 0, 200], + "instances": True, + "readable": "Other Rider", + "name": "human--rider--other-rider", + "evaluate": True, + }, + { + "color": [200, 128, 128], + "instances": True, + "readable": "Lane Marking - Crosswalk", + "name": "marking--crosswalk-zebra", + "evaluate": True, + }, + { + "color": [255, 255, 255], + "instances": False, + "readable": "Lane Marking - General", + "name": "marking--general", + "evaluate": True, + }, + { + "color": [64, 170, 64], + "instances": False, + "readable": "Mountain", + "name": "nature--mountain", + "evaluate": True, + }, + { + "color": [230, 160, 50], + "instances": False, + "readable": "Sand", + "name": "nature--sand", + "evaluate": True, + }, + { + "color": [70, 130, 180], + "instances": False, + "readable": "Sky", + "name": "nature--sky", + "evaluate": True, + }, + { + "color": [190, 255, 255], + "instances": False, + "readable": "Snow", + "name": "nature--snow", + "evaluate": True, + }, + { + "color": [152, 251, 152], + "instances": False, + "readable": "Terrain", + "name": "nature--terrain", + "evaluate": True, + }, + { + "color": [107, 142, 35], + "instances": False, + "readable": "Vegetation", + "name": "nature--vegetation", + "evaluate": True, + }, + { + "color": [0, 170, 30], + "instances": False, + "readable": "Water", + "name": "nature--water", + "evaluate": True, + }, + { + "color": [255, 255, 128], + "instances": True, + "readable": "Banner", + "name": "object--banner", + "evaluate": True, + }, + { + "color": [250, 0, 30], + "instances": True, + "readable": "Bench", + "name": "object--bench", + "evaluate": True, + }, + { + "color": [100, 140, 180], + "instances": True, + "readable": "Bike Rack", + "name": "object--bike-rack", + "evaluate": True, + }, + { + "color": [220, 220, 220], + "instances": True, + "readable": "Billboard", + "name": "object--billboard", + "evaluate": True, + }, + { + "color": [220, 128, 128], + "instances": True, + "readable": "Catch Basin", + "name": "object--catch-basin", + "evaluate": True, + }, + { + "color": [222, 40, 40], + "instances": True, + "readable": "CCTV Camera", + "name": "object--cctv-camera", + "evaluate": True, + }, + { + "color": [100, 170, 30], + "instances": True, + "readable": "Fire Hydrant", + "name": "object--fire-hydrant", + "evaluate": True, + }, + { + "color": [40, 40, 40], + "instances": True, + "readable": "Junction Box", + "name": "object--junction-box", + "evaluate": True, + }, + { + "color": [33, 33, 33], + "instances": True, + "readable": "Mailbox", + "name": "object--mailbox", + "evaluate": True, + }, + { + "color": [100, 128, 160], + "instances": True, + "readable": "Manhole", + "name": "object--manhole", + "evaluate": True, + }, + { + "color": [142, 0, 0], + "instances": True, + "readable": "Phone Booth", + "name": "object--phone-booth", + "evaluate": True, + }, + { + "color": [70, 100, 150], + "instances": False, + "readable": "Pothole", + "name": "object--pothole", + "evaluate": True, + }, + { + "color": [210, 170, 100], + "instances": True, + "readable": "Street Light", + "name": "object--street-light", + "evaluate": True, + }, + { + "color": 
[153, 153, 153], + "instances": True, + "readable": "Pole", + "name": "object--support--pole", + "evaluate": True, + }, + { + "color": [128, 128, 128], + "instances": True, + "readable": "Traffic Sign Frame", + "name": "object--support--traffic-sign-frame", + "evaluate": True, + }, + { + "color": [0, 0, 80], + "instances": True, + "readable": "Utility Pole", + "name": "object--support--utility-pole", + "evaluate": True, + }, + { + "color": [250, 170, 30], + "instances": True, + "readable": "Traffic Light", + "name": "object--traffic-light", + "evaluate": True, + }, + { + "color": [192, 192, 192], + "instances": True, + "readable": "Traffic Sign (Back)", + "name": "object--traffic-sign--back", + "evaluate": True, + }, + { + "color": [220, 220, 0], + "instances": True, + "readable": "Traffic Sign (Front)", + "name": "object--traffic-sign--front", + "evaluate": True, + }, + { + "color": [140, 140, 20], + "instances": True, + "readable": "Trash Can", + "name": "object--trash-can", + "evaluate": True, + }, + { + "color": [119, 11, 32], + "instances": True, + "readable": "Bicycle", + "name": "object--vehicle--bicycle", + "evaluate": True, + }, + { + "color": [150, 0, 255], + "instances": True, + "readable": "Boat", + "name": "object--vehicle--boat", + "evaluate": True, + }, + { + "color": [0, 60, 100], + "instances": True, + "readable": "Bus", + "name": "object--vehicle--bus", + "evaluate": True, + }, + { + "color": [0, 0, 142], + "instances": True, + "readable": "Car", + "name": "object--vehicle--car", + "evaluate": True, + }, + { + "color": [0, 0, 90], + "instances": True, + "readable": "Caravan", + "name": "object--vehicle--caravan", + "evaluate": True, + }, + { + "color": [0, 0, 230], + "instances": True, + "readable": "Motorcycle", + "name": "object--vehicle--motorcycle", + "evaluate": True, + }, + { + "color": [0, 80, 100], + "instances": False, + "readable": "On Rails", + "name": "object--vehicle--on-rails", + "evaluate": True, + }, + { + "color": [128, 64, 64], + "instances": True, + "readable": "Other Vehicle", + "name": "object--vehicle--other-vehicle", + "evaluate": True, + }, + { + "color": [0, 0, 110], + "instances": True, + "readable": "Trailer", + "name": "object--vehicle--trailer", + "evaluate": True, + }, + { + "color": [0, 0, 70], + "instances": True, + "readable": "Truck", + "name": "object--vehicle--truck", + "evaluate": True, + }, + { + "color": [0, 0, 192], + "instances": True, + "readable": "Wheeled Slow", + "name": "object--vehicle--wheeled-slow", + "evaluate": True, + }, + { + "color": [32, 32, 32], + "instances": False, + "readable": "Car Mount", + "name": "void--car-mount", + "evaluate": True, + }, + { + "color": [120, 10, 10], + "instances": False, + "readable": "Ego Vehicle", + "name": "void--ego-vehicle", + "evaluate": True, + }, + { + "color": [0, 0, 0], + "instances": False, + "readable": "Unlabeled", + "name": "void--unlabeled", + "evaluate": False, + }, +] + + +def _get_mapillary_vistas_meta(): + stuff_classes = [k["readable"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES if k["evaluate"]] + assert len(stuff_classes) == 65 + + stuff_colors = [k["color"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES if k["evaluate"]] + assert len(stuff_colors) == 65 + + ret = { + "stuff_classes": stuff_classes, + "stuff_colors": stuff_colors, + } + return ret + + +def register_all_mapillary_vistas(root): + root = os.path.join(root, "mapillary_vistas") + meta = _get_mapillary_vistas_meta() + for name, dirname in [("train", "training"), ("val", "validation")]: + image_dir = 
os.path.join(root, dirname, "images") + gt_dir = os.path.join(root, dirname, "labels") + name = f"mapillary_vistas_sem_seg_{name}" + DatasetCatalog.register( + name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") + ) + MetadataCatalog.get(name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=65, # different from other datasets, Mapillary Vistas sets ignore_label to 65 + **meta, + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_mapillary_vistas(_root) diff --git a/detrex/data/datasets/register_mapillary_vistas_panoptic.py b/detrex/data/datasets/register_mapillary_vistas_panoptic.py new file mode 100644 index 00000000..8872204a --- /dev/null +++ b/detrex/data/datasets/register_mapillary_vistas_panoptic.py @@ -0,0 +1,528 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. +# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_mapillary_vistas.py +# ------------------------------------------------------------------------------------------------ + +import json +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.utils.file_io import PathManager + + +MAPILLARY_VISTAS_SEM_SEG_CATEGORIES = [ + {'color': [165, 42, 42], + 'id': 1, + 'isthing': 1, + 'name': 'Bird', + 'supercategory': 'animal--bird'}, + {'color': [0, 192, 0], + 'id': 2, + 'isthing': 1, + 'name': 'Ground Animal', + 'supercategory': 'animal--ground-animal'}, + {'color': [196, 196, 196], + 'id': 3, + 'isthing': 0, + 'name': 'Curb', + 'supercategory': 'construction--barrier--curb'}, + {'color': [190, 153, 153], + 'id': 4, + 'isthing': 0, + 'name': 'Fence', + 'supercategory': 'construction--barrier--fence'}, + {'color': [180, 165, 180], + 'id': 5, + 'isthing': 0, + 'name': 'Guard Rail', + 'supercategory': 'construction--barrier--guard-rail'}, + {'color': [90, 120, 150], + 'id': 6, + 'isthing': 0, + 'name': 'Barrier', + 'supercategory': 'construction--barrier--other-barrier'}, + {'color': [102, 102, 156], + 'id': 7, + 'isthing': 0, + 'name': 'Wall', + 'supercategory': 'construction--barrier--wall'}, + {'color': [128, 64, 255], + 'id': 8, + 'isthing': 0, + 'name': 'Bike Lane', + 'supercategory': 'construction--flat--bike-lane'}, + {'color': [140, 140, 200], + 'id': 9, + 'isthing': 1, + 'name': 'Crosswalk - Plain', + 'supercategory': 'construction--flat--crosswalk-plain'}, + {'color': [170, 170, 170], + 'id': 10, + 'isthing': 0, + 'name': 'Curb Cut', + 'supercategory': 'construction--flat--curb-cut'}, + {'color': [250, 170, 160], + 'id': 11, + 'isthing': 0, + 'name': 'Parking', + 'supercategory': 'construction--flat--parking'}, + 
{'color': [96, 96, 96], + 'id': 12, + 'isthing': 0, + 'name': 'Pedestrian Area', + 'supercategory': 'construction--flat--pedestrian-area'}, + {'color': [230, 150, 140], + 'id': 13, + 'isthing': 0, + 'name': 'Rail Track', + 'supercategory': 'construction--flat--rail-track'}, + {'color': [128, 64, 128], + 'id': 14, + 'isthing': 0, + 'name': 'Road', + 'supercategory': 'construction--flat--road'}, + {'color': [110, 110, 110], + 'id': 15, + 'isthing': 0, + 'name': 'Service Lane', + 'supercategory': 'construction--flat--service-lane'}, + {'color': [244, 35, 232], + 'id': 16, + 'isthing': 0, + 'name': 'Sidewalk', + 'supercategory': 'construction--flat--sidewalk'}, + {'color': [150, 100, 100], + 'id': 17, + 'isthing': 0, + 'name': 'Bridge', + 'supercategory': 'construction--structure--bridge'}, + {'color': [70, 70, 70], + 'id': 18, + 'isthing': 0, + 'name': 'Building', + 'supercategory': 'construction--structure--building'}, + {'color': [150, 120, 90], + 'id': 19, + 'isthing': 0, + 'name': 'Tunnel', + 'supercategory': 'construction--structure--tunnel'}, + {'color': [220, 20, 60], + 'id': 20, + 'isthing': 1, + 'name': 'Person', + 'supercategory': 'human--person'}, + {'color': [255, 0, 0], + 'id': 21, + 'isthing': 1, + 'name': 'Bicyclist', + 'supercategory': 'human--rider--bicyclist'}, + {'color': [255, 0, 100], + 'id': 22, + 'isthing': 1, + 'name': 'Motorcyclist', + 'supercategory': 'human--rider--motorcyclist'}, + {'color': [255, 0, 200], + 'id': 23, + 'isthing': 1, + 'name': 'Other Rider', + 'supercategory': 'human--rider--other-rider'}, + {'color': [200, 128, 128], + 'id': 24, + 'isthing': 1, + 'name': 'Lane Marking - Crosswalk', + 'supercategory': 'marking--crosswalk-zebra'}, + {'color': [255, 255, 255], + 'id': 25, + 'isthing': 0, + 'name': 'Lane Marking - General', + 'supercategory': 'marking--general'}, + {'color': [64, 170, 64], + 'id': 26, + 'isthing': 0, + 'name': 'Mountain', + 'supercategory': 'nature--mountain'}, + {'color': [230, 160, 50], + 'id': 27, + 'isthing': 0, + 'name': 'Sand', + 'supercategory': 'nature--sand'}, + {'color': [70, 130, 180], + 'id': 28, + 'isthing': 0, + 'name': 'Sky', + 'supercategory': 'nature--sky'}, + {'color': [190, 255, 255], + 'id': 29, + 'isthing': 0, + 'name': 'Snow', + 'supercategory': 'nature--snow'}, + {'color': [152, 251, 152], + 'id': 30, + 'isthing': 0, + 'name': 'Terrain', + 'supercategory': 'nature--terrain'}, + {'color': [107, 142, 35], + 'id': 31, + 'isthing': 0, + 'name': 'Vegetation', + 'supercategory': 'nature--vegetation'}, + {'color': [0, 170, 30], + 'id': 32, + 'isthing': 0, + 'name': 'Water', + 'supercategory': 'nature--water'}, + {'color': [255, 255, 128], + 'id': 33, + 'isthing': 1, + 'name': 'Banner', + 'supercategory': 'object--banner'}, + {'color': [250, 0, 30], + 'id': 34, + 'isthing': 1, + 'name': 'Bench', + 'supercategory': 'object--bench'}, + {'color': [100, 140, 180], + 'id': 35, + 'isthing': 1, + 'name': 'Bike Rack', + 'supercategory': 'object--bike-rack'}, + {'color': [220, 220, 220], + 'id': 36, + 'isthing': 1, + 'name': 'Billboard', + 'supercategory': 'object--billboard'}, + {'color': [220, 128, 128], + 'id': 37, + 'isthing': 1, + 'name': 'Catch Basin', + 'supercategory': 'object--catch-basin'}, + {'color': [222, 40, 40], + 'id': 38, + 'isthing': 1, + 'name': 'CCTV Camera', + 'supercategory': 'object--cctv-camera'}, + {'color': [100, 170, 30], + 'id': 39, + 'isthing': 1, + 'name': 'Fire Hydrant', + 'supercategory': 'object--fire-hydrant'}, + {'color': [40, 40, 40], + 'id': 40, + 'isthing': 1, + 'name': 'Junction Box', + 
'supercategory': 'object--junction-box'}, + {'color': [33, 33, 33], + 'id': 41, + 'isthing': 1, + 'name': 'Mailbox', + 'supercategory': 'object--mailbox'}, + {'color': [100, 128, 160], + 'id': 42, + 'isthing': 1, + 'name': 'Manhole', + 'supercategory': 'object--manhole'}, + {'color': [142, 0, 0], + 'id': 43, + 'isthing': 1, + 'name': 'Phone Booth', + 'supercategory': 'object--phone-booth'}, + {'color': [70, 100, 150], + 'id': 44, + 'isthing': 0, + 'name': 'Pothole', + 'supercategory': 'object--pothole'}, + {'color': [210, 170, 100], + 'id': 45, + 'isthing': 1, + 'name': 'Street Light', + 'supercategory': 'object--street-light'}, + {'color': [153, 153, 153], + 'id': 46, + 'isthing': 1, + 'name': 'Pole', + 'supercategory': 'object--support--pole'}, + {'color': [128, 128, 128], + 'id': 47, + 'isthing': 1, + 'name': 'Traffic Sign Frame', + 'supercategory': 'object--support--traffic-sign-frame'}, + {'color': [0, 0, 80], + 'id': 48, + 'isthing': 1, + 'name': 'Utility Pole', + 'supercategory': 'object--support--utility-pole'}, + {'color': [250, 170, 30], + 'id': 49, + 'isthing': 1, + 'name': 'Traffic Light', + 'supercategory': 'object--traffic-light'}, + {'color': [192, 192, 192], + 'id': 50, + 'isthing': 1, + 'name': 'Traffic Sign (Back)', + 'supercategory': 'object--traffic-sign--back'}, + {'color': [220, 220, 0], + 'id': 51, + 'isthing': 1, + 'name': 'Traffic Sign (Front)', + 'supercategory': 'object--traffic-sign--front'}, + {'color': [140, 140, 20], + 'id': 52, + 'isthing': 1, + 'name': 'Trash Can', + 'supercategory': 'object--trash-can'}, + {'color': [119, 11, 32], + 'id': 53, + 'isthing': 1, + 'name': 'Bicycle', + 'supercategory': 'object--vehicle--bicycle'}, + {'color': [150, 0, 255], + 'id': 54, + 'isthing': 1, + 'name': 'Boat', + 'supercategory': 'object--vehicle--boat'}, + {'color': [0, 60, 100], + 'id': 55, + 'isthing': 1, + 'name': 'Bus', + 'supercategory': 'object--vehicle--bus'}, + {'color': [0, 0, 142], + 'id': 56, + 'isthing': 1, + 'name': 'Car', + 'supercategory': 'object--vehicle--car'}, + {'color': [0, 0, 90], + 'id': 57, + 'isthing': 1, + 'name': 'Caravan', + 'supercategory': 'object--vehicle--caravan'}, + {'color': [0, 0, 230], + 'id': 58, + 'isthing': 1, + 'name': 'Motorcycle', + 'supercategory': 'object--vehicle--motorcycle'}, + {'color': [0, 80, 100], + 'id': 59, + 'isthing': 0, + 'name': 'On Rails', + 'supercategory': 'object--vehicle--on-rails'}, + {'color': [128, 64, 64], + 'id': 60, + 'isthing': 1, + 'name': 'Other Vehicle', + 'supercategory': 'object--vehicle--other-vehicle'}, + {'color': [0, 0, 110], + 'id': 61, + 'isthing': 1, + 'name': 'Trailer', + 'supercategory': 'object--vehicle--trailer'}, + {'color': [0, 0, 70], + 'id': 62, + 'isthing': 1, + 'name': 'Truck', + 'supercategory': 'object--vehicle--truck'}, + {'color': [0, 0, 192], + 'id': 63, + 'isthing': 1, + 'name': 'Wheeled Slow', + 'supercategory': 'object--vehicle--wheeled-slow'}, + {'color': [32, 32, 32], + 'id': 64, + 'isthing': 0, + 'name': 'Car Mount', + 'supercategory': 'void--car-mount'}, + {'color': [120, 10, 10], + 'id': 65, + 'isthing': 0, + 'name': 'Ego Vehicle', + 'supercategory': 'void--ego-vehicle'} +] + + +def load_mapillary_vistas_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): + """ + Args: + image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". + gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". + json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 
+ Returns: + list[dict]: a list of dicts in Detectron2 standard format. (See + `Using Custom Datasets `_ ) + """ + + def _convert_category_id(segment_info, meta): + if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: + segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = True + else: + segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = False + return segment_info + + with PathManager.open(json_file) as f: + json_info = json.load(f) + + ret = [] + for ann in json_info["annotations"]: + image_id = ann["image_id"] + # TODO: currently we assume image and label has the same filename but + # different extension, and images have extension ".jpg" for COCO. Need + # to make image extension a user-provided argument if we extend this + # function to support other COCO-like datasets. + image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") + label_file = os.path.join(gt_dir, ann["file_name"]) + sem_label_file = os.path.join(semseg_dir, ann["file_name"]) + segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] + ret.append( + { + "file_name": image_file, + "image_id": image_id, + "pan_seg_file_name": label_file, + "sem_seg_file_name": sem_label_file, + "segments_info": segments_info, + } + ) + assert len(ret), f"No images found in {image_dir}!" + assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] + assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] + assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] + return ret + + +def register_mapillary_vistas_panoptic( + name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None +): + """ + Register a "standard" version of ADE20k panoptic segmentation dataset named `name`. + The dictionaries in this registered dataset follows detectron2's standard format. + Hence it's called "standard". + Args: + name (str): the name that identifies a dataset, + e.g. "ade20k_panoptic_train" + metadata (dict): extra metadata associated with this dataset. + image_root (str): directory which contains all the images + panoptic_root (str): directory which contains panoptic annotation images in COCO format + panoptic_json (str): path to the json panoptic annotation file in COCO format + sem_seg_root (none): not used, to be consistent with + `register_coco_panoptic_separated`. 
+ instances_json (str): path to the json instance annotation file + """ + panoptic_name = name + DatasetCatalog.register( + panoptic_name, + lambda: load_mapillary_vistas_panoptic_json( + panoptic_json, image_root, panoptic_root, semantic_root, metadata + ), + ) + MetadataCatalog.get(panoptic_name).set( + panoptic_root=panoptic_root, + image_root=image_root, + panoptic_json=panoptic_json, + json_file=instances_json, + evaluator_type="mapillary_vistas_panoptic_seg", + ignore_label=65, # different from other datasets, Mapillary Vistas sets ignore_label to 65 + label_divisor=1000, + **metadata, + ) + + +_PREDEFINED_SPLITS_ADE20K_PANOPTIC = { + "mapillary_vistas_panoptic_train": ( + "mapillary_vistas/training/images", + "mapillary_vistas/training/panoptic", + "mapillary_vistas/training/panoptic/panoptic_2018.json", + "mapillary_vistas/training/labels", + ), + "mapillary_vistas_panoptic_val": ( + "mapillary_vistas/validation/images", + "mapillary_vistas/validation/panoptic", + "mapillary_vistas/validation/panoptic/panoptic_2018.json", + "mapillary_vistas/validation/labels", + ), +} + + +def get_metadata(): + meta = {} + # The following metadata maps contiguous id from [0, #thing categories + + # #stuff categories) to their names and colors. We have to replica of the + # same name and color under "thing_*" and "stuff_*" because the current + # visualization function in D2 handles thing and class classes differently + # due to some heuristic used in Panoptic FPN. We keep the same naming to + # enable reusing existing visualization functions. + thing_classes = [k["name"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] + thing_colors = [k["color"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] + stuff_classes = [k["name"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] + stuff_colors = [k["color"] for k in MAPILLARY_VISTAS_SEM_SEG_CATEGORIES] + + meta["thing_classes"] = thing_classes + meta["thing_colors"] = thing_colors + meta["stuff_classes"] = stuff_classes + meta["stuff_colors"] = stuff_colors + + # Convert category id for training: + # category id: like semantic segmentation, it is the class id for each + # pixel. Since there are some classes not used in evaluation, the category + # id is not always contiguous and thus we have two set of category ids: + # - original category id: category id in the original dataset, mainly + # used for evaluation. + # - contiguous category id: [0, #classes), in order to train the linear + # softmax classifier. + thing_dataset_id_to_contiguous_id = {} + stuff_dataset_id_to_contiguous_id = {} + + for i, cat in enumerate(MAPILLARY_VISTAS_SEM_SEG_CATEGORIES): + if cat["isthing"]: + thing_dataset_id_to_contiguous_id[cat["id"]] = i + # else: + # stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + # in order to use sem_seg evaluator + stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id + meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id + + return meta + + +def register_all_mapillary_vistas_panoptic(root): + metadata = get_metadata() + for ( + prefix, + (image_root, panoptic_root, panoptic_json, semantic_root), + ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): + # The "standard" version of COCO panoptic segmentation dataset, + # e.g. 
used by Panoptic-DeepLab + register_mapillary_vistas_panoptic( + prefix, + metadata, + os.path.join(root, image_root), + os.path.join(root, panoptic_root), + os.path.join(root, semantic_root), + os.path.join(root, panoptic_json), + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_mapillary_vistas_panoptic(_root) diff --git a/detrex/data/transforms/__init__.py b/detrex/data/transforms/__init__.py new file mode 100644 index 00000000..e86aff31 --- /dev/null +++ b/detrex/data/transforms/__init__.py @@ -0,0 +1,16 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .color_augmentation import ColorAugSSDTransform \ No newline at end of file diff --git a/detrex/data/transforms/color_augmentation.py b/detrex/data/transforms/color_augmentation.py new file mode 100644 index 00000000..033add94 --- /dev/null +++ b/detrex/data/transforms/color_augmentation.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/facebookresearch/detectron2/blob/main/projects/PointRend/point_rend/color_augmentation.py +# ------------------------------------------------------------------------------------------------ + +import numpy as np +import random +import cv2 +from fvcore.transforms.transform import Transform + + +class ColorAugSSDTransform(Transform): + """ + A color related data augmentation used in Single Shot Multibox Detector (SSD). + Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, + Scott Reed, Cheng-Yang Fu, Alexander C. Berg. + SSD: Single Shot MultiBox Detector. ECCV 2016. 
+ Implementation based on: + https://github.com/weiliu89/caffe/blob + /4817bf8b4200b35ada8ed0dc378dceaf38c539e4 + /src/caffe/util/im_transforms.cpp + https://github.com/chainer/chainercv/blob + /7159616642e0be7c5b3ef380b848e16b7e99355b/chainercv + /links/model/ssd/transforms.py + """ + + def __init__( + self, + img_format, + brightness_delta=32, + contrast_low=0.5, + contrast_high=1.5, + saturation_low=0.5, + saturation_high=1.5, + hue_delta=18, + ): + super().__init__() + assert img_format in ["BGR", "RGB"] + self.is_rgb = img_format == "RGB" + del img_format + self._set_attributes(locals()) + + def apply_coords(self, coords): + return coords + + def apply_segmentation(self, segmentation): + return segmentation + + def apply_image(self, img, interp=None): + if self.is_rgb: + img = img[:, :, [2, 1, 0]] + img = self.brightness(img) + if random.randrange(2): + img = self.contrast(img) + img = self.saturation(img) + img = self.hue(img) + else: + img = self.saturation(img) + img = self.hue(img) + img = self.contrast(img) + if self.is_rgb: + img = img[:, :, [2, 1, 0]] + return img + + def convert(self, img, alpha=1, beta=0): + img = img.astype(np.float32) * alpha + beta + img = np.clip(img, 0, 255) + return img.astype(np.uint8) + + def brightness(self, img): + if random.randrange(2): + return self.convert( + img, beta=random.uniform(-self.brightness_delta, self.brightness_delta) + ) + return img + + def contrast(self, img): + if random.randrange(2): + return self.convert(img, alpha=random.uniform(self.contrast_low, self.contrast_high)) + return img + + def saturation(self, img): + if random.randrange(2): + img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + img[:, :, 1] = self.convert( + img[:, :, 1], alpha=random.uniform(self.saturation_low, self.saturation_high) + ) + return cv2.cvtColor(img, cv2.COLOR_HSV2BGR) + return img + + def hue(self, img): + if random.randrange(2): + img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + img[:, :, 0] = ( + img[:, :, 0].astype(int) + random.randint(-self.hue_delta, self.hue_delta) + ) % 180 + return cv2.cvtColor(img, cv2.COLOR_HSV2BGR) + return diff --git a/projects/maskdino/README.md b/projects/maskdino/README.md new file mode 100644 index 00000000..69e13882 --- /dev/null +++ b/projects/maskdino/README.md @@ -0,0 +1,301 @@ +Mask DINO +======== +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask-dino-towards-a-unified-transformer-based-1/instance-segmentation-on-coco-minival)](https://paperswithcode.com/sota/instance-segmentation-on-coco-minival?p=mask-dino-towards-a-unified-transformer-based-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask-dino-towards-a-unified-transformer-based-1/instance-segmentation-on-coco)](https://paperswithcode.com/sota/instance-segmentation-on-coco?p=mask-dino-towards-a-unified-transformer-based-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask-dino-towards-a-unified-transformer-based-1/semantic-segmentation-on-ade20k)](https://paperswithcode.com/sota/semantic-segmentation-on-ade20k?p=mask-dino-towards-a-unified-transformer-based-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask-dino-towards-a-unified-transformer-based-1/panoptic-segmentation-on-coco-minival)](https://paperswithcode.com/sota/panoptic-segmentation-on-coco-minival?p=mask-dino-towards-a-unified-transformer-based-1) 
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask-dino-towards-a-unified-transformer-based-1/panoptic-segmentation-on-coco-test-dev)](https://paperswithcode.com/sota/panoptic-segmentation-on-coco-test-dev?p=mask-dino-towards-a-unified-transformer-based-1) + +By [Feng Li*](https://fengli-ust.github.io/), [Hao Zhang*](https://scholar.google.com/citations?user=B8hPxMQAAAAJ&hl=zh-CN), [Huaizhe Xu](https://scholar.google.com/citations?user=zgaTShsAAAAJ&hl=en&scioq=Huaizhe+Xu), [Shilong Liu](https://www.lsl.zone/), [Lei Zhang](https://scholar.google.com/citations?hl=zh-CN&user=fIlGZToAAAAJ), [Lionel M. Ni](https://scholar.google.com/citations?hl=zh-CN&user=OzMYwDIAAAAJ), and [Heung-Yeung Shum](https://scholar.google.com.hk/citations?user=9akH-n8AAAAJ&hl=en). + +This repository is an official detrex implementation of the [Mask DINO: Towards A Unified Transformer-based +Framework for Object Detection and Segmentation](https://arxiv.org/abs/2206.02777) (DINO pronounced `daɪnoʊ' as in dinosaur). The source code is available at [MaskDINO](https://github.com/IDEA-Research/MaskDINO). + + +### Features + +* A unified architecture for object detection, panoptic, instance and semantic segmentation. +* Achieve task and data cooperation between detection and segmentation. +* State-of-the-art performance under the same setting. +* Support major detection and segmentation datasets: COCO, ADE20K, Cityscapes, + +### Code Updates + +* [2022/11] Our code is available! Achieve 51.7 and 59.0 AP with a ResNet-50 and SwinL without extra detection data on COCO, better detection performance compared with DINO! + +* [2022/6] We release a unified detection and segmentation model [Mask DINO](https://arxiv.org/pdf/2206.02777.pdf) that achieves the best results on all the three segmentation tasks (**54.7** AP on [COCO instance leaderboard](https://paperswithcode.com/sota/instance-segmentation-on-coco), **59.5** PQ on [COCO panoptic leaderboard](https://paperswithcode.com/sota/panoptic-segmentation-on-coco-test-dev), and **60.8** mIoU on [ADE20K semantic leaderboard](https://paperswithcode.com/sota/semantic-segmentation-on-ade20k))!. + + +![MaskDINO](assets/framework.jpg) + + +## Installation + +See [installation instructions](INSTALL.md). + +## Getting Started + +See [Results](#results). + +See [Preparing Datasets for MaskDINO](datasets/README.md). + +See [More Usage](#more-usage). + +# Results +## Released Models +### COCO Instance Segmentation and Object Detection +In this part, we follow DINO to use hidden dimension `2048` in the encoder by default. We also use the mask-enhanced +box initialization proposed in our paper by default. To better present our model, we also list the models trained with +hidden dimension `1024` (`hid 1024`) and not using mask-enhance initialization (`no mask enhance`) in this table. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Name | Backbone | Epochs | Mask AP | Box AP | Params | GFlops | download |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| MaskDINO (hid 1024) | R50 | 50 | 46.1 | 51.5 | 47M | 226 | model |
+| MaskDINO \| config | R50 | 50 | 46.3 | 51.7 | 52M | 286 | model |
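+
+The defaults above correspond to the lazy config added in this PR (`projects/maskdino/configs/models/maskdino_r50.py`). The snippet below is a minimal sketch, assuming the attribute paths from that config, of how the `hid 1024` and `no mask enhance` rows could be reproduced by overriding the base model inside a training config (the panoptic config in this PR overrides `initialize_box_type` the same way):
+
+```python
+# Illustrative sketch only, not an official config shipped with this PR.
+# Place inside a projects/maskdino/configs/*.py training config.
+from .models.maskdino_r50 import model
+
+# `hid 1024`: shrink the encoder feed-forward hidden dimension from 2048 to 1024
+model.sem_seg_head.pixel_decoder.transformer_dim_feedforward = 1024
+# (the decoder side, transformer_predictor.dim_feedforward, may need the same
+#  change; verify against the original MaskDINO configs)
+
+# `no mask enhance`: disable mask-enhanced box initialization
+model.sem_seg_head.transformer_predictor.initialize_box_type = "no"
+```
+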
+
+### COCO Panoptic Segmentation
+
+| Name | Backbone | epochs | PQ | Mask AP | Box AP | mIoU | download |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| MaskDINO \| config | R50 | 50 | 53.0 | 48.8 | 44.3 | 60.6 | model |
+
+## To Release
+These models can be found and tested in [MaskDINO Source Code](https://github.com/IDEA-Research/MaskDINO) and will be available soon in detrex.
+
+### COCO Instance Segmentation and Object Detection
+In this part, we follow DINO to use hidden dimension `2048` in the encoder by default. We also use the mask-enhanced
+box initialization proposed in our paper by default. To better present our model, we also list the models trained with
+hidden dimension `1024` (`hid 1024`) and not using mask-enhance initialization (`no mask enhance`) in this table.
+
+| Name | Backbone | Epochs | Mask AP | Box AP | Params | GFlops | download |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| MaskDINO (no mask enhance) \| config | Swin-L (IN21k) | 50 | 52.1 | 58.3 | 223 | 1326 | model |
+| MaskDINO \| config | Swin-L (IN21k) | 50 | 52.3 | 59.0 | 223 | 1326 | model |
+
+### COCO Panoptic Segmentation
+
+| Name | Backbone | epochs | PQ | Mask AP | Box AP | mIoU | download |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| MaskDINO \| config | Swin-L (IN21k) | 50 | 58.3 | 50.6 | 56.2 | 67.5 | model |
+
+### ADE20K Semantic Segmentation
+
+| Name | Dataset | Backbone | iterations | mIoU | download |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| MaskDINO \| config | ADE20K | R50 | 160k | 48.7 | model |
+| MaskDINO \| config | Cityscapes | R50 | 90k | 79.8 | model |
+ +All models were trained with **4** NVIDIA A100 GPUs (ResNet-50 based models) or **8** NVIDIA A100 GPUs (Swin-L based models). + + + +# More Usage + +### Mask-enhanced box initialization + +We provide 2 ways to convert predicted masks to boxes to initialize decoder boxes. You can set as follows +* `MODEL.MaskDINO.INITIALIZE_BOX_TYPE: no` not using mask enhanced box initialization +* `MODEL.MaskDINO.INITIALIZE_BOX_TYPE: mask2box` a fast conversion way +* `MODEL.MaskDINO.INITIALIZE_BOX_TYPE: bitmask` provided conversion from detectron2, slower but more accurate conversion. + +These two conversion ways do not affect the final performance much, you can choose either way. + +In addition, if you already +train a model for 50 epochs without mask-enhance box initialization, you can plug in this method and simply +finetune the model in the last few epochs (i.e., load from 32K iteration trained model and finetune it). This way can +also achieve similar performance compared with training from scratch, but more flexible. + +### Model components +MaskDINO consists of three components: a backbone, a pixel decoder and a Transformer decoder. +You can easily replace each of these three components with your own implementation. + +* **backbone**: Define and register your backbone under `maskdino/modeling/backbone`. You can follow the Swin Transformer as an example. + +* **pixel decoder**: pixel decoder is actually the multi-scale encoder in DINO and Deformable DETR, we follow mask2former to call + it pixel decoder. It is in `maskdino/modeling/pixel_decoder`, you can change your multi-scale encoder. The returned values + include + 1. `mask_features` is the per-pixel embeddings with resolution 1/4 of the original image, obtained by fusing backbone 1/4 features and multi-scale encoder encoded 1/8 features. This is used to produce binary masks. + 2. `multi_scale_features`, which is the multi-scale inputs to the Transformer decoder. + For ResNet-50 models with 4 scales, we use resolution 1/32, 1/16, and 1/8 but you can use arbitrary resolutions here, and follow DINO to additionally downsample + 1/32 to get a 4th scale with 1/64 resolution. For 5-scale models with SwinL, we additional use 1/4 resolution features as in DINO. + +* **transformer decoder**: it mainly follows DINO decoder to do detection and segmentation tasks. It is defined in `maskdino/modeling/transformer_decoder`. + + +## LICNESE +Mask DINO is released under the Apache 2.0 license. Please see the [LICENSE](LICNESE) file for more information. + +Copyright (c) IDEA. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use these files except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +## Citing Mask DINO + +If you find our work helpful for your research, please consider citing the following BibTeX entry. + +```BibTeX +@misc{li2022mask, + title={Mask DINO: Towards A Unified Transformer-based Framework for Object Detection and Segmentation}, + author={Feng Li and Hao Zhang and Huaizhe xu and Shilong Liu and Lei Zhang and Lionel M. 
Ni and Heung-Yeung Shum}, + year={2022}, + eprint={2206.02777}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +If you find the code useful, please also consider the following BibTeX entry. + +```BibTeX +@misc{zhang2022dino, + title={DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection}, + author={Hao Zhang and Feng Li and Shilong Liu and Lei Zhang and Hang Su and Jun Zhu and Lionel M. Ni and Heung-Yeung Shum}, + year={2022}, + eprint={2203.03605}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@inproceedings{li2022dn, + title={Dn-detr: Accelerate detr training by introducing query denoising}, + author={Li, Feng and Zhang, Hao and Liu, Shilong and Guo, Jian and Ni, Lionel M and Zhang, Lei}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={13619--13627}, + year={2022} +} + +@inproceedings{ + liu2022dabdetr, + title={{DAB}-{DETR}: Dynamic Anchor Boxes are Better Queries for {DETR}}, + author={Shilong Liu and Feng Li and Hao Zhang and Xiao Yang and Xianbiao Qi and Hang Su and Jun Zhu and Lei Zhang}, + booktitle={International Conference on Learning Representations}, + year={2022}, + url={https://openreview.net/forum?id=oMI9PjOb9Jl} +} +``` + +## Acknowledgement + +Many thanks to these excellent opensource projects +* [Mask2Former](https://github.com/facebookresearch/Mask2Former) +* [DINO](https://github.com/IDEA-Research/DINO) + diff --git a/projects/maskdino/assets/dinosaur.png b/projects/maskdino/assets/dinosaur.png new file mode 100644 index 00000000..b1a1e3d1 Binary files /dev/null and b/projects/maskdino/assets/dinosaur.png differ diff --git a/projects/maskdino/assets/framework.jpg b/projects/maskdino/assets/framework.jpg new file mode 100644 index 00000000..e4d5872c Binary files /dev/null and b/projects/maskdino/assets/framework.jpg differ diff --git a/projects/maskdino/assets/instance.png b/projects/maskdino/assets/instance.png new file mode 100644 index 00000000..7e764d8c Binary files /dev/null and b/projects/maskdino/assets/instance.png differ diff --git a/projects/maskdino/assets/panoptic.png b/projects/maskdino/assets/panoptic.png new file mode 100644 index 00000000..86395c32 Binary files /dev/null and b/projects/maskdino/assets/panoptic.png differ diff --git a/projects/maskdino/assets/semantic.png b/projects/maskdino/assets/semantic.png new file mode 100644 index 00000000..461965d2 Binary files /dev/null and b/projects/maskdino/assets/semantic.png differ diff --git a/projects/maskdino/assets/sota.png b/projects/maskdino/assets/sota.png new file mode 100644 index 00000000..2ff9b368 Binary files /dev/null and b/projects/maskdino/assets/sota.png differ diff --git a/projects/maskdino/configs/data/coco_instance_seg.py b/projects/maskdino/configs/data/coco_instance_seg.py new file mode 100644 index 00000000..38306808 --- /dev/null +++ b/projects/maskdino/configs/data/coco_instance_seg.py @@ -0,0 +1,50 @@ +from omegaconf import OmegaConf + +import detectron2.data.transforms as T +from detectron2.config import LazyCall as L +from detectron2.data import ( + build_detection_test_loader, + build_detection_train_loader, + get_detection_dataset_dicts, +) +from detectron2.evaluation import COCOEvaluator + +# from detrex.data import DetrDatasetMapper +# from projects.maskDINO.data.dataset_mappers.coco_instance_lsj_aug_dataset_mapper import COCOInstanceLSJDatasetMapper, build_transform_gen +from detrex.data.dataset_mappers import COCOInstanceNewBaselineDatasetMapper,coco_instance_transform_gen 
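+# NOTE: the train-time mapper below follows the LSJ (large scale jittering) recipe
+# built by `coco_instance_transform_gen`: optional horizontal flip, ResizeScale
+# into [0.1, 2.0] of a 1024x1024 target, then FixedSizeCrop to 1024x1024.
+# The test-time mapper only resizes the shorter edge to 800 (max size 1333).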
+dataloader = OmegaConf.create() + +dataloader.train = L(build_detection_train_loader)( + dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"), + mapper=L(COCOInstanceNewBaselineDatasetMapper)( + augmentation=L(coco_instance_transform_gen)( + image_size=1024, + min_scale=0.1, + max_scale=2.0, + random_flip="horizontal" + ), + is_train=True, + image_format="RGB", + ), + total_batch_size=16, + num_workers=4, +) + +dataloader.test = L(build_detection_test_loader)( + dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False), + mapper=L(COCOInstanceNewBaselineDatasetMapper)( + augmentation=[ + L(T.ResizeShortestEdge)( + short_edge_length=800, + max_size=1333, + ), + ], + is_train=False, + image_format="RGB", + ), + num_workers=4, +) + +dataloader.evaluator = L(COCOEvaluator)( + dataset_name="${..test.dataset.names}", +) diff --git a/projects/maskdino/configs/data/coco_panoptic_seg.py b/projects/maskdino/configs/data/coco_panoptic_seg.py new file mode 100644 index 00000000..049d612d --- /dev/null +++ b/projects/maskdino/configs/data/coco_panoptic_seg.py @@ -0,0 +1,61 @@ +from omegaconf import OmegaConf + +import detectron2.data.transforms as T +from detectron2.config import LazyCall as L +from detectron2.data import ( + build_detection_test_loader, + build_detection_train_loader, + get_detection_dataset_dicts, +) +from detectron2.evaluation import COCOPanopticEvaluator,COCOEvaluator,SemSegEvaluator,DatasetEvaluators + +from detrex.data.dataset_mappers import COCOPanopticNewBaselineDatasetMapper, coco_panoptic_transform_gen +dataloader = OmegaConf.create() + +dataloader.train = L(build_detection_train_loader)( + dataset=L(get_detection_dataset_dicts)(names="coco_2017_train_panoptic"), + mapper=L(COCOPanopticNewBaselineDatasetMapper)( + augmentation=L(coco_panoptic_transform_gen)( + image_size=1024, + min_scale=0.1, + max_scale=2.0, + random_flip="horizontal" + ), + is_train=True, + image_format="RGB", + ), + total_batch_size=16, + num_workers=4, +) + +dataloader.test = L(build_detection_test_loader)( + dataset=L(get_detection_dataset_dicts)(names="coco_2017_val_panoptic_with_sem_seg", filter_empty=False), + mapper=L(COCOPanopticNewBaselineDatasetMapper)( + augmentation=[ + L(T.ResizeShortestEdge)( + short_edge_length=800, + max_size=1333, + ), + ], + is_train=False, + image_format="RGB", + ), + num_workers=4, +) + +# dataloader.evaluator = L(COCOPanopticEvaluator)( +# dataset_name="${..test.dataset.names}", +# ) +dataloader.evaluator = L(DatasetEvaluators)( + evaluators=[ + L(COCOPanopticEvaluator)( + dataset_name="coco_2017_val_panoptic_with_sem_seg", + ), + L(COCOEvaluator)( + dataset_name="coco_2017_val_panoptic_with_sem_seg", + ), + L(SemSegEvaluator)( + dataset_name="coco_2017_val_panoptic_with_sem_seg", + ), + ], +) \ No newline at end of file diff --git a/projects/maskdino/configs/maskdino_r50_coco_instance_seg_50ep.py b/projects/maskdino/configs/maskdino_r50_coco_instance_seg_50ep.py new file mode 100644 index 00000000..bf4b9522 --- /dev/null +++ b/projects/maskdino/configs/maskdino_r50_coco_instance_seg_50ep.py @@ -0,0 +1,63 @@ +from detrex.config import get_config +from .models.maskdino_r50 import model +from .data.coco_instance_seg import dataloader + +from fvcore.common.param_scheduler import MultiStepParamScheduler +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler + +train = get_config("common/train.py").train +# max training iterations +train.max_iter = 368750 +# warmup lr scheduler +lr_multiplier = 
L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[327778, 355092], + ), + warmup_length=10 / train.max_iter, + warmup_factor=1.0, +) + +optimizer = get_config("common/optim.py").AdamW +# lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_50ep + +# initialize checkpoint to be loaded +train.init_checkpoint = "detectron2://ImageNetPretrained/torchvision/R-50.pkl" +train.output_dir = "./output/dab_detr_r50_50ep" + + +# run evaluation every 5000 iters +train.eval_period = 5000 + +# log training infomation every 20 iters +train.log_period = 20 + +# save checkpoint every 5000 iters +train.checkpointer.period = 5000 + +# gradient clipping for training +train.clip_grad.enabled = True +train.clip_grad.params.max_norm = 0.01 +train.clip_grad.params.norm_type = 2 + +# set training devices +train.device = "cuda" + + +# modify optimizer config +optimizer.lr = 1e-4 +optimizer.betas = (0.9, 0.999) +optimizer.weight_decay = 1e-4 +optimizer.params.lr_factor_func = lambda module_name: 0.1 if "backbone" in module_name else 1 + +# # modify dataloader config +dataloader.train.num_workers = 16 +# +# # please notice that this is total batch size. +# # surpose you're using 4 gpus for training and the batch size for +# # each gpu is 16/4 = 4 +dataloader.train.total_batch_size = 16 + +# dump the testing results into output_dir for visualization +dataloader.evaluator.output_dir = train.output_dir diff --git a/projects/maskdino/configs/maskdino_r50_coco_panoptic_seg_50ep.py b/projects/maskdino/configs/maskdino_r50_coco_panoptic_seg_50ep.py new file mode 100644 index 00000000..66a05691 --- /dev/null +++ b/projects/maskdino/configs/maskdino_r50_coco_panoptic_seg_50ep.py @@ -0,0 +1,63 @@ +from detrex.config import get_config +from .models.maskdino_r50 import model +from .data.coco_panoptic_seg import dataloader + +from fvcore.common.param_scheduler import MultiStepParamScheduler +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler + +train = get_config("common/train.py").train +# max training iterations +train.max_iter = 368750 +# warmup lr scheduler +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[327778, 355092], + ), + warmup_length=10 / train.max_iter, + warmup_factor=1.0, +) +model.panoptic_on=True +model.semantic_on=True +model.sem_seg_head.transformer_predictor.initialize_box_type="no" +model.sem_seg_head.num_classes=133 +optimizer = get_config("common/optim.py").AdamW + +# initialize checkpoint to be loaded +train.init_checkpoint = "detectron2://ImageNetPretrained/torchvision/R-50.pkl" +train.output_dir = "./output/dab_detr_r50_50ep" + + +# run evaluation every 5000 iters +train.eval_period = 5000 + +# log training infomation every 20 iters +train.log_period = 20 + +# save checkpoint every 5000 iters +train.checkpointer.period = 5000 + +# gradient clipping for training +train.clip_grad.enabled = True +train.clip_grad.params.max_norm = 0.01 +train.clip_grad.params.norm_type = 2 + +# set training devices +train.device = "cuda" + + +# modify optimizer config +optimizer.lr = 1e-4 +optimizer.betas = (0.9, 0.999) +optimizer.weight_decay = 1e-4 +optimizer.params.lr_factor_func = lambda module_name: 0.1 if "backbone" in module_name else 1 + +# # modify dataloader config +dataloader.train.num_workers = 16 +# +# # please notice that this is total batch size. 
+# # surpose you're using 4 gpus for training and the batch size for +# # each gpu is 16/4 = 4 +dataloader.train.total_batch_size = 16 + diff --git a/projects/maskdino/configs/maskdino_r50_instance_seg_50ep.py b/projects/maskdino/configs/maskdino_r50_instance_seg_50ep.py new file mode 100644 index 00000000..bf4b9522 --- /dev/null +++ b/projects/maskdino/configs/maskdino_r50_instance_seg_50ep.py @@ -0,0 +1,63 @@ +from detrex.config import get_config +from .models.maskdino_r50 import model +from .data.coco_instance_seg import dataloader + +from fvcore.common.param_scheduler import MultiStepParamScheduler +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler + +train = get_config("common/train.py").train +# max training iterations +train.max_iter = 368750 +# warmup lr scheduler +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[327778, 355092], + ), + warmup_length=10 / train.max_iter, + warmup_factor=1.0, +) + +optimizer = get_config("common/optim.py").AdamW +# lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_50ep + +# initialize checkpoint to be loaded +train.init_checkpoint = "detectron2://ImageNetPretrained/torchvision/R-50.pkl" +train.output_dir = "./output/dab_detr_r50_50ep" + + +# run evaluation every 5000 iters +train.eval_period = 5000 + +# log training infomation every 20 iters +train.log_period = 20 + +# save checkpoint every 5000 iters +train.checkpointer.period = 5000 + +# gradient clipping for training +train.clip_grad.enabled = True +train.clip_grad.params.max_norm = 0.01 +train.clip_grad.params.norm_type = 2 + +# set training devices +train.device = "cuda" + + +# modify optimizer config +optimizer.lr = 1e-4 +optimizer.betas = (0.9, 0.999) +optimizer.weight_decay = 1e-4 +optimizer.params.lr_factor_func = lambda module_name: 0.1 if "backbone" in module_name else 1 + +# # modify dataloader config +dataloader.train.num_workers = 16 +# +# # please notice that this is total batch size. 
+# # surpose you're using 4 gpus for training and the batch size for +# # each gpu is 16/4 = 4 +dataloader.train.total_batch_size = 16 + +# dump the testing results into output_dir for visualization +dataloader.evaluator.output_dir = train.output_dir diff --git a/projects/maskdino/configs/models/maskdino_r50.py b/projects/maskdino/configs/models/maskdino_r50.py new file mode 100644 index 00000000..392b606d --- /dev/null +++ b/projects/maskdino/configs/models/maskdino_r50.py @@ -0,0 +1,153 @@ +import torch.nn as nn +from detrex.layers import PositionEmbeddingSine +from detrex.modeling.backbone import ResNet, BasicStem + +from detectron2.config import LazyCall as L + +from projects.maskdino.modeling.meta_arch.maskdino_head import MaskDINOHead +from projects.maskdino.modeling.pixel_decoder.maskdino_encoder import MaskDINOEncoder +from projects.maskdino.modeling.transformer_decoder.maskdino_decoder import MaskDINODecoder +from projects.maskdino.modeling.criterion import SetCriterion +from projects.maskdino.modeling.matcher import HungarianMatcher +from projects.maskdino.maskdino import MaskDINO +from detectron2.data import MetadataCatalog +from detectron2.layers import Conv2d, ShapeSpec, get_norm + + + +dim=256 +n_class=80 +dn="seg" +dec_layers = 9 +input_shape={'res2': ShapeSpec(channels=256, height=None, width=None, stride=4), 'res3': ShapeSpec(channels=512, height=None, width=None, stride=8), 'res4': ShapeSpec(channels=1024, height=None, width=None, stride=16), 'res5': ShapeSpec(channels=2048, height=None, width=None, stride=32)} +model = L(MaskDINO)( + backbone=L(ResNet)( + stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), + stages=L(ResNet.make_default_stages)( + depth=50, + stride_in_1x1=False, + norm="FrozenBN", + ), + out_features=["res2", "res3", "res4", "res5"], + freeze_at=1, + ), + sem_seg_head=L(MaskDINOHead)( + input_shape=input_shape, + num_classes=n_class, + pixel_decoder=L(MaskDINOEncoder)( + input_shape=input_shape, + transformer_dropout=0.0, + transformer_nheads=8, + transformer_dim_feedforward=2048, + transformer_enc_layers=6, + conv_dim=dim, + mask_dim=dim, + norm = 'GN', + transformer_in_features=['res3', 'res4', 'res5'], + common_stride=4, + num_feature_levels=3, + total_num_feature_levels=4, + feature_order='low2high', + ), + loss_weight= 1.0, + ignore_value= -1, + transformer_predictor=L(MaskDINODecoder)( + in_channels=dim, + mask_classification=True, + num_classes="${..num_classes}", + hidden_dim=dim, + num_queries=300, + nheads=8, + dim_feedforward=2048, + dec_layers=dec_layers, + mask_dim=dim, + enforce_input_project=False, + two_stage=True, + dn=dn, + noise_scale=0.4, + dn_num=100, + initialize_box_type='mask2box', + initial_pred=True, + learn_tgt=False, + total_num_feature_levels= 4, + dropout = 0.0, + activation= 'relu', + nhead= 8, + dec_n_points= 4, + return_intermediate_dec = True, + query_dim= 4, + dec_layer_share = False, + semantic_ce_loss = False, + ), + ), + criterion=L(SetCriterion)( + num_classes="${..sem_seg_head.num_classes}", + matcher=L(HungarianMatcher)( + cost_class = 4.0, + cost_mask = 5.0, + cost_dice = 5.0, + num_points = 12544, + cost_box=5.0, + cost_giou=2.0, + panoptic_on="${..panoptic_on}", + ), + weight_dict=dict(), + eos_coef=0.1, + losses=['labels', 'masks', 'boxes'], + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + dn=dn, + dn_losses=['labels', 'masks', 'boxes'], + panoptic_on="${..panoptic_on}", + semantic_ce_loss=False + ), + num_queries=300, + object_mask_threshold=0.25, + 
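+    # object_mask_threshold (above) and overlap_threshold (below) are only
+    # consulted during panoptic inference; see MaskDINO.panoptic_inference.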
overlap_threshold=0.8, + metadata=MetadataCatalog.get('coco_2017_train'), + size_divisibility=32, + sem_seg_postprocess_before_inference=True, + pixel_mean=[123.675, 116.28, 103.53], + pixel_std=[58.395, 57.12, 57.375], + # inference + semantic_on=False, + panoptic_on=False, + instance_on=True, + test_topk_per_image=100, + pano_temp=0.06, + focus_on_box = False, + transform_eval = True, +) + +# set aux loss weight dict +class_weight=4.0 +mask_weight=5.0 +dice_weight=5.0 +box_weight=5.0 +giou_weight=2.0 +weight_dict = {"loss_ce": class_weight} +weight_dict.update({"loss_mask": mask_weight, "loss_dice": dice_weight}) +weight_dict.update({"loss_bbox": box_weight, "loss_giou": giou_weight}) +# two stage is the query selection scheme + +interm_weight_dict = {} +interm_weight_dict.update({k + f'_interm': v for k, v in weight_dict.items()}) +weight_dict.update(interm_weight_dict) +# denoising training + +if dn == "standard": + weight_dict.update({k + f"_dn": v for k, v in weight_dict.items() if k != "loss_mask" and k != "loss_dice"}) + dn_losses = ["labels", "boxes"] +elif dn == "seg": + weight_dict.update({k + f"_dn": v for k, v in weight_dict.items()}) + dn_losses = ["labels", "masks", "boxes"] +else: + dn_losses = [] +# if deep_supervision: + +aux_weight_dict = {} +for i in range(dec_layers): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) +weight_dict.update(aux_weight_dict) +model.criterion.weight_dict=weight_dict \ No newline at end of file diff --git a/projects/maskdino/data/__init__.py b/projects/maskdino/data/__init__.py new file mode 100644 index 00000000..7f209a83 --- /dev/null +++ b/projects/maskdino/data/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from . import datasets +# from . import datasets_detr diff --git a/projects/maskdino/data/dataset_mappers/__init__.py b/projects/maskdino/data/dataset_mappers/__init__.py new file mode 100644 index 00000000..9020c2df --- /dev/null +++ b/projects/maskdino/data/dataset_mappers/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. diff --git a/projects/maskdino/data/dataset_mappers/coco_instance_lsj_aug_dataset_mapper.py b/projects/maskdino/data/dataset_mappers/coco_instance_lsj_aug_dataset_mapper.py new file mode 100644 index 00000000..d7d10f70 --- /dev/null +++ b/projects/maskdino/data/dataset_mappers/coco_instance_lsj_aug_dataset_mapper.py @@ -0,0 +1,182 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ------------------------------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +# ------------------------------------------------------------------------------------------------ +# COCO Instance Segmentation with LSJ Augmentation +# Modified from: +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py +# ------------------------------------------------------------------------------------------------ + +import copy +import logging +import numpy as np +import torch + +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T + +from pycocotools import mask as coco_mask + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +def build_transform_gen( + image_size, + min_scale, + max_scale, + random_flip: str = "horizontal", + is_train: bool = True, +): + """ + Create a list of default :class:`Augmentation`. + Now it includes resizing and flipping. + + Returns: + list[Augmentation] + """ + assert is_train, "Only support training augmentation." + assert random_flip in ["none", "horizontal", + "vertical"], f"Only support none/horizontal/vertical flip, but got {random_flip}" + + augmentation = [] + + if random_flip != "none": + augmentation.append( + T.RandomFlip( + horizontal=random_flip == "horizontal", + vertical=random_flip == "vertical", + ) + ) + + augmentation.extend([ + T.ResizeScale( + min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size, + ), + T.FixedSizeCrop(crop_size=(image_size, image_size)) + ]) + + return augmentation + + +class COCOInstanceLSJDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by MaskFormer. + + This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. + + The callable currently does the following: + + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + def __init__( + self, + is_train=True, + *, + augmentation, + image_format, + ): + self.augmentation = augmentation + logging.getLogger(__name__).info( + "[COCO_Instance_LSJ_Augment_Dataset_Mapper] Full TransformGens used in training: {}".format( + str(self.augmentation)) + ) + + self.img_format = image_format + self.is_train = is_train + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
+ + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + padding_mask = np.ones(image.shape[:2]) + image, transforms = T.apply_transform_gens(self.augmentation, image) + + padding_mask = transforms.apply_segmentation(padding_mask) + padding_mask = ~ padding_mask.astype(bool) + + image_shape = image.shape[:2] + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) + + if not self.is_train: + # USER: Modify this if you want to keep them for some reason. + dataset_dict.pop("annotations", None) + return dataset_dict + + if "annotations" in dataset_dict: + for anno in dataset_dict["annotations"]: + anno.pop("keypoints", None) + + annos = [ + utils.transform_instance_annotations(obj, transforms, image_shape) + for obj in dataset_dict.pop("annotations") + if obj.get("iscrowd", 0) == 0 + ] + # NOTE: does not support BitMask due to augmentation + # Current BitMask cannot handle empty objects + instances = utils.annotations_to_instances(annos, image_shape) + # After transforms such as cropping are applied, the bounding box may no longer + # tightly bound the object. As an example, imagine a triangle object + # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight + # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to + # the intersection of original bounding box and the cropping box. + instances.gt_boxes = instances.gt_masks.get_bounding_boxes() + # Need to filter empty instances first (due to augmentation) + instances = utils.filter_empty_instances(instances) + # Generate masks from polygon + h, w = instances.image_size + # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float) + if hasattr(instances, 'gt_masks'): + gt_masks = instances.gt_masks + gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) + instances.gt_masks = gt_masks + # import ipdb; ipdb.set_trace() + dataset_dict["instances"] = instances + + return dataset_dict + diff --git a/projects/maskdino/evaluation/__init__.py b/projects/maskdino/evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/maskdino/evaluation/instance_evaluation.py b/projects/maskdino/evaluation/instance_evaluation.py new file mode 100644 index 00000000..bc2facec --- /dev/null +++ b/projects/maskdino/evaluation/instance_evaluation.py @@ -0,0 +1,107 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
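+#
+# InstanceSegEvaluator below subclasses detectron2's COCOEvaluator but relaxes the
+# assumption that contiguous category ids exactly cover [0, num_classes): a predicted
+# class only needs an entry in thing_dataset_id_to_contiguous_id to be mapped back to
+# its dataset id (the stricter asserts are kept commented out).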
+import contextlib +import copy +import io +import itertools +import json +import logging +import numpy as np +import os +import pickle +from collections import OrderedDict +import pycocotools.mask as mask_util +import torch +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from tabulate import tabulate + +import detectron2.utils.comm as comm +from detectron2.config import CfgNode +from detectron2.data import MetadataCatalog +from detectron2.data.datasets.coco import convert_to_coco_json +from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco +from detectron2.evaluation.fast_eval_api import COCOeval_opt +from detectron2.structures import Boxes, BoxMode, pairwise_iou +from detectron2.utils.file_io import PathManager +from detectron2.utils.logger import create_small_table + + +# modified from COCOEvaluator for instance segmetnat +class InstanceSegEvaluator(COCOEvaluator): + """ + Evaluate AR for object proposals, AP for instance detection/segmentation, AP + for keypoint detection outputs using COCO's metrics. + See http://cocodataset.org/#detection-eval and + http://cocodataset.org/#keypoints-eval to understand its metrics. + The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means + the metric cannot be computed (e.g. due to no predictions made). + + In addition to COCO, this evaluator is able to support any bounding box detection, + instance segmentation, or keypoint detection dataset. + """ + + def _eval_predictions(self, predictions, img_ids=None): + """ + Evaluate predictions. Fill self._results with the metrics of the tasks. + """ + self._logger.info("Preparing results for COCO format ...") + coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) + tasks = self._tasks or self._tasks_from_predictions(coco_results) + + # unmap the category ids for COCO + if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): + dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id + # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) + # num_classes = len(all_contiguous_ids) + # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 + + reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} + for result in coco_results: + category_id = result["category_id"] + # assert category_id < num_classes, ( + # f"A prediction has class={category_id}, " + # f"but the dataset only has {num_classes} classes and " + # f"predicted class id should be in [0, {num_classes - 1}]." + # ) + assert category_id in reverse_id_mapping, ( + f"A prediction has class={category_id}, " + f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." + ) + result["category_id"] = reverse_id_mapping[category_id] + + if self._output_dir: + file_path = os.path.join(self._output_dir, "coco_instances_results.json") + self._logger.info("Saving results to {}".format(file_path)) + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(coco_results)) + f.flush() + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info( + "Evaluating predictions with {} COCO API...".format( + "unofficial" if self._use_fast_impl else "official" + ) + ) + for task in sorted(tasks): + assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 
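+            # run the per-task COCO evaluation; with use_fast_impl this goes through
+            # the C++-accelerated COCOeval_opt imported above, otherwise the official
+            # pycocotools COCOeval.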
+ coco_eval = ( + _evaluate_predictions_on_coco( + self._coco_api, + coco_results, + task, + kpt_oks_sigmas=self._kpt_oks_sigmas, + use_fast_impl=self._use_fast_impl, + img_ids=img_ids, + max_dets_per_image=self._max_dets_per_image, + ) + if len(coco_results) > 0 + else None # cocoapi does not handle empty results very well + ) + + res = self._derive_coco_results( + coco_eval, task, class_names=self._metadata.get("thing_classes") + ) + self._results[task] = res diff --git a/projects/maskdino/maskdino.py b/projects/maskdino/maskdino.py new file mode 100644 index 00000000..1e174df4 --- /dev/null +++ b/projects/maskdino/maskdino.py @@ -0,0 +1,398 @@ +# Copyright (c) IDEA, Inc. and its affiliates. +# Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. +from typing import Tuple + +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.data import MetadataCatalog +from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head +from detectron2.modeling.backbone import Backbone +from detectron2.modeling.postprocessing import sem_seg_postprocess +from detectron2.structures import Boxes, ImageList, Instances, BitMasks +from detectron2.utils.memory import retry_if_cuda_oom + +from .modeling.criterion import SetCriterion +from .modeling.matcher import HungarianMatcher +from .utils import box_ops + + +@META_ARCH_REGISTRY.register() +class MaskDINO(nn.Module): + """ + Main class for mask classification semantic segmentation architectures. + """ + + # @configurable + def __init__( + self, + *, + backbone: Backbone, + sem_seg_head: nn.Module, + criterion: nn.Module, + num_queries: int, + object_mask_threshold: float, + overlap_threshold: float, + metadata, + size_divisibility: int, + sem_seg_postprocess_before_inference: bool, + pixel_mean: Tuple[float], + pixel_std: Tuple[float], + # inference + semantic_on: bool, + panoptic_on: bool, + instance_on: bool, + test_topk_per_image: int, + # data_loader: str, + pano_temp: float, + focus_on_box: bool = False, + transform_eval: bool = False, + semantic_ce_loss: bool = False, + ): + """ + Args: + backbone: a backbone module, must follow detectron2's backbone interface + sem_seg_head: a module that predicts semantic segmentation from backbone features + criterion: a module that defines the loss + num_queries: int, number of queries + object_mask_threshold: float, threshold to filter query based on classification score + for panoptic segmentation inference + overlap_threshold: overlap threshold used in general inference for panoptic segmentation + metadata: dataset meta, get `thing` and `stuff` category names for panoptic + segmentation inference + size_divisibility: Some backbones require the input height and width to be divisible by a + specific integer. We can use this to override such requirement. + sem_seg_postprocess_before_inference: whether to resize the prediction back + to original input size before semantic segmentation inference or after. + For high-resolution dataset like Mapillary, resizing predictions before + inference will cause OOM error. 
+ pixel_mean, pixel_std: list or tuple with #channels element, representing + the per-channel mean and std to be used to normalize the input image + semantic_on: bool, whether to output semantic segmentation prediction + instance_on: bool, whether to output instance segmentation prediction + panoptic_on: bool, whether to output panoptic segmentation prediction + test_topk_per_image: int, instance segmentation parameter, keep topk instances per image + """ + super().__init__() + self.backbone = backbone + self.pano_temp = pano_temp + self.sem_seg_head = sem_seg_head + self.criterion = criterion + self.num_queries = num_queries + self.overlap_threshold = overlap_threshold + self.object_mask_threshold = object_mask_threshold + self.metadata = metadata + if size_divisibility < 0: + # use backbone size_divisibility if not set + size_divisibility = self.backbone.size_divisibility + self.size_divisibility = size_divisibility + self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference + self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) + + # additional args + self.semantic_on = semantic_on + self.instance_on = instance_on + self.panoptic_on = panoptic_on + self.test_topk_per_image = test_topk_per_image + + # self.data_loader = data_loader + # if 'detr' in data_loader: + # self.flag = eval_flag + self.focus_on_box = focus_on_box + self.transform_eval = transform_eval + self.semantic_ce_loss = semantic_ce_loss + + if not self.semantic_on: + assert self.sem_seg_postprocess_before_inference + + print('criterion.weight_dict ', self.criterion.weight_dict) + + @property + def device(self): + return self.pixel_mean.device + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper`. + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + * "image": Tensor, image in (C, H, W) format. + * "instances": per-region ground truth + * Other information that's included in the original dicts, such as: + "height", "width" (int): the output resolution of the model (may be different + from input resolution), used in inference. + Returns: + list[dict]: + each dict has the results for one image. The dict contains the following keys: + + * "sem_seg": + A Tensor that represents the + per-pixel segmentation prediced by the head. + The prediction has shape KxHxW that represents the logits of + each class for each pixel. + * "panoptic_seg": + A tuple that represent panoptic output + panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. + segments_info (list[dict]): Describe each segment in `panoptic_seg`. + Each dict contains keys "id", "category_id", "isthing". 
+ """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(images, self.size_divisibility) + + features = self.backbone(images.tensor) + + if self.training: + # dn_args={"scalar":30,"noise_scale":0.4} + # mask classification target + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + # if 'detr' in self.data_loader: + # targets = self.prepare_targets_detr(gt_instances, images) + # else: + targets = self.prepare_targets(gt_instances, images) + else: + targets = None + outputs,mask_dict = self.sem_seg_head(features,targets=targets) + # bipartite matching-based loss + losses = self.criterion(outputs, targets,mask_dict) + + for k in list(losses.keys()): + if k in self.criterion.weight_dict: + losses[k] *= self.criterion.weight_dict[k] + else: + # remove this loss if not specified in `weight_dict` + losses.pop(k) + return losses + else: + outputs, _ = self.sem_seg_head(features) + mask_cls_results = outputs["pred_logits"] + mask_pred_results = outputs["pred_masks"] + mask_box_results = outputs["pred_boxes"] + # upsample masks + mask_pred_results = F.interpolate( + mask_pred_results, + size=(images.tensor.shape[-2], images.tensor.shape[-1]), + mode="bilinear", + align_corners=False, + ) + + del outputs + # import ipdb; ipdb.set_trace() + processed_results = [] + for mask_cls_result, mask_pred_result, mask_box_result, input_per_image, image_size in zip( + mask_cls_results, mask_pred_results, mask_box_results, batched_inputs, images.image_sizes + ): # image_size is augmented size, not divisible to 32 + height = input_per_image.get("height", image_size[0]) # real size + width = input_per_image.get("width", image_size[1]) + processed_results.append({}) + new_size = mask_pred_result.shape[-2:] # padded size (divisible to 32) + + # import ipdb; ipdb.set_trace() + if self.sem_seg_postprocess_before_inference: + mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)( + mask_pred_result, image_size, height, width + ) + mask_cls_result = mask_cls_result.to(mask_pred_result) + # mask_box_result = mask_box_result.to(mask_pred_result) + # mask_box_result = self.box_postprocess(mask_box_result, height, width) + + # semantic segmentation inference + if self.semantic_on: + r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result) + if not self.sem_seg_postprocess_before_inference: + r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width) + processed_results[-1]["sem_seg"] = r + + # panoptic segmentation inference + if self.panoptic_on: + panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result) + processed_results[-1]["panoptic_seg"] = panoptic_r + # import ipdb; ipdb.set_trace() + # instance segmentation inference + # import ipdb; ipdb.set_trace() + if self.instance_on: + mask_box_result = mask_box_result.to(mask_pred_result) + height = new_size[0]/image_size[0]*height + width = new_size[1]/image_size[1]*width + mask_box_result = self.box_postprocess(mask_box_result, height, width) + + instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result, mask_box_result) + processed_results[-1]["instances"] = instance_r + + return processed_results + + def prepare_targets(self, targets, images): + h_pad, w_pad = images.tensor.shape[-2:] + new_targets = [] + for targets_per_image in targets: + # pad gt + h, w = targets_per_image.image_size + 
image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float, device=self.device) + # print(images.tensor.shape[-2:], image_size_xyxy) + # import ipdb; ipdb.set_trace() + gt_masks = targets_per_image.gt_masks + padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device) + padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks + new_targets.append( + { + "labels": targets_per_image.gt_classes, + "masks": padded_masks, + "boxes":box_ops.box_xyxy_to_cxcywh(targets_per_image.gt_boxes.tensor)/image_size_xyxy + } + ) + return new_targets + + def prepare_targets_detr(self, targets, images): + h_pad, w_pad = images.tensor.shape[-2:] + new_targets = [] + for targets_per_image in targets: + # pad gt + h, w = targets_per_image.image_size + image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float, device=self.device) + # print(images.tensor.shape[-2:], image_size_xyxy) + # import ipdb; ipdb.set_trace() + gt_masks = targets_per_image.gt_masks + padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device) + padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks + new_targets.append( + { + "labels": targets_per_image.gt_classes, + "masks": padded_masks, + "boxes": box_ops.box_xyxy_to_cxcywh(targets_per_image.gt_boxes.tensor) / image_size_xyxy + } + ) + return new_targets + + def semantic_inference(self, mask_cls, mask_pred): + # if use cross-entropy loss in training, evaluate with softmax + if self.semantic_ce_loss: + mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) + return semseg + # if use focal loss in training, evaluate with sigmoid. As sigmoid is mainly for detection and not sharp + # enough for semantic and panoptic segmentation, we additionally use use softmax with a temperature to + # make the score sharper. + else: + T = self.pano_temp + mask_cls = mask_cls.sigmoid() + + if self.transform_eval: + # mask_cls = (mask_cls * 2.5 + 1.0).sigmoid() + mask_cls = F.softmax(mask_cls / T, dim=-1) # already sigmoid + mask_pred = mask_pred.sigmoid() + semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) + return semseg + + def panoptic_inference(self, mask_cls, mask_pred): + # As we use focal loss in training, evaluate with sigmoid. As sigmoid is mainly for detection and not sharp + # enough for semantic and panoptic segmentation, we additionally use use softmax with a temperature to + # make the score sharper. 
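+        # prob binarizes the sigmoid mask predictions; T is the temperature used to
+        # sharpen the sigmoid class scores before the softmax below.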
+ prob = 0.5 + T = self.pano_temp + scores, labels = mask_cls.sigmoid().max(-1) + mask_pred = mask_pred.sigmoid() + keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold) + # added process + if self.transform_eval: + scores, labels = F.softmax(mask_cls.sigmoid() / T, dim=-1).max(-1) + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_masks = mask_pred[keep] + cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks + + h, w = cur_masks.shape[-2:] + panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device) + segments_info = [] + + current_segment_id = 0 + + if cur_masks.shape[0] == 0: + # We didn't detect any mask :( + return panoptic_seg, segments_info + else: + # take argmax + cur_mask_ids = cur_prob_masks.argmax(0) + stuff_memory_list = {} + for k in range(cur_classes.shape[0]): + pred_class = cur_classes[k].item() + isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values() + mask_area = (cur_mask_ids == k).sum().item() + original_area = (cur_masks[k] >= prob).sum().item() + mask = (cur_mask_ids == k) & (cur_masks[k] >= prob) + + if mask_area > 0 and original_area > 0 and mask.sum().item() > 0: + if mask_area / original_area < self.overlap_threshold: + continue + + # merge stuff regions + if not isthing: + if int(pred_class) in stuff_memory_list.keys(): + panoptic_seg[mask] = stuff_memory_list[int(pred_class)] + continue + else: + stuff_memory_list[int(pred_class)] = current_segment_id + 1 + + current_segment_id += 1 + panoptic_seg[mask] = current_segment_id + + segments_info.append( + { + "id": current_segment_id, + "isthing": bool(isthing), + "category_id": int(pred_class), + } + ) + + return panoptic_seg, segments_info + + def instance_inference(self, mask_cls, mask_pred, mask_box_result): + # mask_pred is already processed to have the same shape as original input + image_size = mask_pred.shape[-2:] + scores = mask_cls.sigmoid() # [100, 80] + labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1) + scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False) # select 100 + labels_per_image = labels[topk_indices] + topk_indices = torch.div(topk_indices, self.sem_seg_head.num_classes,rounding_mode='floor') + mask_pred = mask_pred[topk_indices] + # if this is panoptic segmentation, we only keep the "thing" classes + if self.panoptic_on: + keep = torch.zeros_like(scores_per_image).bool() + for i, lab in enumerate(labels_per_image): + keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values() + scores_per_image = scores_per_image[keep] + labels_per_image = labels_per_image[keep] + mask_pred = mask_pred[keep] + result = Instances(image_size) + # mask (before sigmoid) + result.pred_masks = (mask_pred > 0).float() + # half mask box half pred box + mask_box_result = mask_box_result[topk_indices] + if self.panoptic_on: + mask_box_result = mask_box_result[keep] + result.pred_boxes = Boxes(mask_box_result) + # Uncomment the following to get boxes from masks (this is slow) + # result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes() + + # calculate average mask prob + mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6) + if self.focus_on_box: + mask_scores_per_image = 1.0 + result.scores = scores_per_image * mask_scores_per_image + result.pred_classes = labels_per_image + return result + + def 
box_postprocess(self, out_bbox, img_h, img_w): + # postprocess box height and width + boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) + scale_fct = torch.tensor([img_w, img_h, img_w, img_h]) + scale_fct = scale_fct.to(out_bbox) + boxes = boxes * scale_fct + return boxes + + diff --git a/projects/maskdino/modeling/__init__.py b/projects/maskdino/modeling/__init__.py new file mode 100644 index 00000000..16d7e5c3 --- /dev/null +++ b/projects/maskdino/modeling/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) IDEA, Inc. and its affiliates. +from .backbone.swin import D2SwinTransformer +from .pixel_decoder.maskdino_encoder import MaskDINOEncoder +from .meta_arch.maskdino_head import MaskDINOHead + diff --git a/projects/maskdino/modeling/backbone/__init__.py b/projects/maskdino/modeling/backbone/__init__.py new file mode 100644 index 00000000..9020c2df --- /dev/null +++ b/projects/maskdino/modeling/backbone/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. diff --git a/projects/maskdino/modeling/backbone/focal.py b/projects/maskdino/modeling/backbone/focal.py new file mode 100644 index 00000000..4b0b0f67 --- /dev/null +++ b/projects/maskdino/modeling/backbone/focal.py @@ -0,0 +1,685 @@ +# -------------------------------------------------------- +# FocalNet for Semantic Segmentation +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Jianwei Yang +# -------------------------------------------------------- +import math +import time +import numpy as np +import json +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +# from util.misc import NestedTensor +from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec + +class Mlp(nn.Module): + """ Multilayer perceptron.""" + + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class FocalModulation(nn.Module): + """ Focal Modulation + + Args: + dim (int): Number of input channels. + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + focal_level (int): Number of focal levels + focal_window (int): Focal window size at focal level 1 + focal_factor (int, default=2): Step to increase the focal window + use_postln (bool, default=False): Whether use post-modulation layernorm + """ + + def __init__(self, dim, proj_drop=0., focal_level=2, focal_window=7, focal_factor=2, use_postln=False, + use_postln_in_modulation=False, normalize_modulator=False): + + super().__init__() + self.dim = dim + + # specific args for focalv3 + self.focal_level = focal_level + self.focal_window = focal_window + self.focal_factor = focal_factor + self.use_postln_in_modulation = use_postln_in_modulation + self.normalize_modulator = normalize_modulator + + self.f = nn.Linear(dim, 2*dim+(self.focal_level+1), bias=True) + self.h = nn.Conv2d(dim, dim, kernel_size=1, stride=1, padding=0, groups=1, bias=True) + + self.act = nn.GELU() + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.focal_layers = nn.ModuleList() + + if self.use_postln_in_modulation: + self.ln = nn.LayerNorm(dim) + + for k in range(self.focal_level): + kernel_size = self.focal_factor*k + self.focal_window + self.focal_layers.append( + nn.Sequential( + nn.Conv2d(dim, dim, kernel_size=kernel_size, stride=1, groups=dim, + padding=kernel_size//2, bias=False), + nn.GELU(), + ) + ) + + def forward(self, x): + """ Forward function. + + Args: + x: input features with shape of (B, H, W, C) + """ + B, nH, nW, C = x.shape + x = self.f(x) + x = x.permute(0, 3, 1, 2).contiguous() + q, ctx, gates = torch.split(x, (C, C, self.focal_level+1), 1) + + ctx_all = 0 + for l in range(self.focal_level): + ctx = self.focal_layers[l](ctx) + ctx_all = ctx_all + ctx*gates[:, l:l+1] + ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True)) + ctx_all = ctx_all + ctx_global*gates[:,self.focal_level:] + if self.normalize_modulator: + ctx_all = ctx_all / (self.focal_level+1) + + x_out = q * self.h(ctx_all) + x_out = x_out.permute(0, 2, 3, 1).contiguous() + if self.use_postln_in_modulation: + x_out = self.ln(x_out) + x_out = self.proj(x_out) + x_out = self.proj_drop(x_out) + return x_out + +class FocalModulationBlock(nn.Module): + """ Focal Modulation Block. + + Args: + dim (int): Number of input channels. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + drop (float, optional): Dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + focal_level (int): number of focal levels + focal_window (int): focal kernel size at level 1 + """ + + def __init__(self, dim, mlp_ratio=4., drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, + focal_level=2, focal_window=9, + use_postln=False, use_postln_in_modulation=False, + normalize_modulator=False, + use_layerscale=False, + layerscale_value=1e-4): + super().__init__() + self.dim = dim + self.mlp_ratio = mlp_ratio + self.focal_window = focal_window + self.focal_level = focal_level + self.use_postln = use_postln + self.use_layerscale = use_layerscale + + self.norm1 = norm_layer(dim) + self.modulation = FocalModulation( + dim, focal_window=self.focal_window, focal_level=self.focal_level, proj_drop=drop, + use_postln_in_modulation=use_postln_in_modulation, + normalize_modulator=normalize_modulator, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.H = None + self.W = None + + self.gamma_1 = 1.0 + self.gamma_2 = 1.0 + if self.use_layerscale: + self.gamma_1 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True) + self.gamma_2 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + if not self.use_postln: + x = self.norm1(x) + x = x.view(B, H, W, C) + + # FM + x = self.modulation(x).view(B, H * W, C) + if self.use_postln: + x = self.norm1(x) + + # FFN + x = shortcut + self.drop_path(self.gamma_1 * x) + + if self.use_postln: + x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) + else: + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + + return x + +class BasicLayer(nn.Module): + """ A basic focal modulation layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + drop (float, optional): Dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + focal_level (int): Number of focal levels + focal_window (int): Focal window size at focal level 1 + use_conv_embed (bool): Use overlapped convolution for patch embedding or now. Default: False + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, + dim, + depth, + mlp_ratio=4., + drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + focal_window=9, + focal_level=2, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + normalize_modulator=False, + use_layerscale=False, + use_checkpoint=False + ): + super().__init__() + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + FocalModulationBlock( + dim=dim, + mlp_ratio=mlp_ratio, + drop=drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + focal_window=focal_window, + focal_level=focal_level, + use_postln=use_postln, + use_postln_in_modulation=use_postln_in_modulation, + normalize_modulator=normalize_modulator, + use_layerscale=use_layerscale, + norm_layer=norm_layer) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + patch_size=2, + in_chans=dim, embed_dim=2*dim, + use_conv_embed=use_conv_embed, + norm_layer=norm_layer, + is_stem=False + ) + + else: + self.downsample = None + + def forward(self, x, H, W): + """ Forward function. + + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x_reshaped = x.transpose(1, 2).view(x.shape[0], x.shape[-1], H, W) + x_down = self.downsample(x_reshaped) + x_down = x_down.flatten(2).transpose(1, 2) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False + is_stem (bool): Is the stem block or not. + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None, use_conv_embed=False, is_stem=False): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + if use_conv_embed: + # if we choose to use conv embedding, then we treat the stem and non-stem differently + if is_stem: + kernel_size = 7; padding = 2; stride = 4 + else: + kernel_size = 3; padding = 1; stride = 2 + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) + else: + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class FocalNet(nn.Module): + """ FocalNet backbone. + + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + drop_rate (float): Dropout rate. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + focal_levels (Sequence[int]): Number of focal levels at four stages + focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages + use_conv_embed (bool): Whether use overlapped convolution for patch embedding + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__(self, + pretrain_img_size=1600, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + mlp_ratio=4., + drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + focal_levels=[2,2,2,2], + focal_windows=[9,9,9,9], + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + use_checkpoint=False, + ): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + use_conv_embed=use_conv_embed, is_stem=True) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2 ** i_layer), + depth=depths[i_layer], + mlp_ratio=mlp_ratio, + drop=drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchEmbed if (i_layer < self.num_layers - 1) else None, + focal_window=focal_windows[i_layer], + focal_level=focal_levels[i_layer], + use_conv_embed=use_conv_embed, + use_postln=use_postln, + use_postln_in_modulation=use_postln_in_modulation, + normalize_modulator=normalize_modulator, + use_layerscale=use_layerscale, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f'norm{i_layer}' + self.add_module(layer_name, layer) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + # if isinstance(pretrained, str): + # self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + # elif pretrained is None: + # self.apply(_init_weights) + # else: + # raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + # x = tensor_list.tensors + tic = time.time() + x = self.patch_embed(x) + Wh, Ww = x.size(2), x.size(3) + + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + # outs = [] + outs = {} + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + # outs.append(out) + outs["res{}".format(i + 2)] = out + toc = time.time() + + # # collect for nesttensors + # outs_dict = {} + # for idx, out_i in enumerate(outs): + # m = tensor_list.mask + # assert m is not None + # mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0] + # outs_dict[idx] = NestedTensor(out_i, mask) + + return outs + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(FocalNet, self).train(mode) + self._freeze_stages() + + +@BACKBONE_REGISTRY.register() +class D2FocalNet(FocalNet, Backbone): + def __init__(self, cfg, input_shape): + kw = cfg.MODEL.FOCAL + assert kw.modelname in ['focalnet_L_384_22k', 'focalnet_L_384_22k_fl4', 'focalnet_XL_384_22k'] + kw = cfg.MODEL.FOCAL + if 'focal_levels' in kw: + kw['focal_levels'] = [kw['focal_levels']] * 4 + + if 'focal_windows' in kw: + kw['focal_windows'] = [kw['focal_windows']] * 4 + + model_para_dict = { + 'focalnet_L_384_22k': dict( + embed_dim=192, + depths=[2, 2, 18, 2], + focal_levels=kw.get('focal_levels', [3, 3, 3, 3]), + focal_windows=kw.get('focal_windows', [5, 5, 5, 5]), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + ), + 'focalnet_L_384_22k_fl4': dict( + embed_dim=192, + depths=[2, 2, 18, 2], + focal_levels=kw.get('focal_levels', [4, 4, 4, 4]), + focal_windows=kw.get('focal_windows', [3, 3, 3, 3]), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=True, + ), + 'focalnet_XL_384_22k': dict( + embed_dim=256, + depths=[2, 2, 18, 2], + focal_levels=kw.get('focal_levels', [3, 3, 3, 3]), + focal_windows=kw.get('focal_windows', [5, 5, 5, 5]), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + ), + 'focalnet_huge_224_22k': dict( + embed_dim=352, + depths=[2, 2, 18, 2], + focal_levels=kw.get('focal_levels', [3, 3, 3, 3]), + focal_windows=kw.get('focal_windows', [5, 5, 5, 5]), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + ), + } + + kw_cgf = model_para_dict[kw.modelname] + kw1 = {k:v for k, v in kw.items() if 'modelname' not in k and 'out_features' not in k} + kw_cgf.update(kw1) + + super().__init__(**kw_cgf) + + + self._out_features = 
kw.out_features + + self._out_feature_strides = { + "res2": 4, + "res3": 8, + "res4": 16, + "res5": 32, + } + self._out_feature_channels = { + "res2": self.num_features[0], + "res3": self.num_features[1], + "res4": self.num_features[2], + "res5": self.num_features[3], + } + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert ( + x.dim() == 4 + ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" + outputs = {} + y = super().forward(x) + for k in y.keys(): + if k in self._out_features: + outputs[k] = y[k] + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + @property + def size_divisibility(self): + return 32 + +def build_focalnet(modelname, **kw): + assert modelname in ['focalnet_L_384_22k', 'focalnet_L_384_22k_fl4', 'focalnet_XL_384_22k'] + + if 'focal_levels' in kw: + kw['focal_levels'] = [kw['focal_levels']] * 4 + + if 'focal_windows' in kw: + kw['focal_windows'] = [kw['focal_windows']] * 4 + + model_para_dict = { + 'focalnet_L_384_22k': dict( + embed_dim=192, + depths=[ 2, 2, 18, 2 ], + focal_levels=kw.get('focal_levels', [3, 3, 3, 3]), + focal_windows=kw.get('focal_windows', [5, 5, 5, 5]), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + ), + 'focalnet_L_384_22k_fl4': dict( + embed_dim=192, + depths=[ 2, 2, 18, 2 ], + focal_levels=kw.get('focal_levels', [4, 4, 4, 4]), + focal_windows=kw.get('focal_windows', [3, 3, 3, 3]), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=True, + ), + 'focalnet_XL_384_22k': dict( + embed_dim=256, + depths=[ 2, 2, 18, 2 ], + focal_levels=kw.get('focal_levels', [3, 3, 3, 3]), + focal_windows=kw.get('focal_windows', [5, 5, 5, 5]), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + ), + 'focalnet_huge_224_22k': dict( + embed_dim=352, + depths=[ 2, 2, 18, 2 ], + focal_levels=kw.get('focal_levels', [3, 3, 3, 3]), + focal_windows=kw.get('focal_windows', [5, 5, 5, 5]), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + ), + } + + kw_cgf = model_para_dict[modelname] + kw_cgf.update(kw) + model = FocalNet(**kw_cgf) + return model \ No newline at end of file diff --git a/projects/maskdino/modeling/backbone/swin.py b/projects/maskdino/modeling/backbone/swin.py new file mode 100644 index 00000000..3b099d84 --- /dev/null +++ b/projects/maskdino/modeling/backbone/swin.py @@ -0,0 +1,770 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu, Yutong Lin, Yixuan Wei +# -------------------------------------------------------- + +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + +from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec + + +class Mlp(nn.Module): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B_, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. 
Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
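+
+        Returns:
+            Tensor of shape (B, ceil(H/2)*ceil(W/2), 2*C): the four interleaved
+                sub-grids are concatenated to 4*C channels, normalized, and then
+                linearly reduced to 2*C.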
+ """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. 
+ attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + use_checkpoint=False, + ): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2 ** i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + ) + self.layers.append(layer) + + num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f"norm{i_layer}" + self.add_module(layer_name, layer) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. 
+ Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = {} + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs["res{}".format(i + 2)] = out + + return outs + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + +@BACKBONE_REGISTRY.register() +class D2SwinTransformer(SwinTransformer, Backbone): + def __init__(self, cfg, input_shape): + + pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE + patch_size = cfg.MODEL.SWIN.PATCH_SIZE + in_chans = 3 + embed_dim = cfg.MODEL.SWIN.EMBED_DIM + depths = cfg.MODEL.SWIN.DEPTHS + num_heads = cfg.MODEL.SWIN.NUM_HEADS + window_size = cfg.MODEL.SWIN.WINDOW_SIZE + mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO + qkv_bias = cfg.MODEL.SWIN.QKV_BIAS + qk_scale = cfg.MODEL.SWIN.QK_SCALE + drop_rate = cfg.MODEL.SWIN.DROP_RATE + attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE + drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE + norm_layer = nn.LayerNorm + ape = cfg.MODEL.SWIN.APE + patch_norm = cfg.MODEL.SWIN.PATCH_NORM + use_checkpoint = cfg.MODEL.SWIN.USE_CHECKPOINT + + super().__init__( + pretrain_img_size, + patch_size, + in_chans, + embed_dim, + depths, + num_heads, + window_size, + mlp_ratio, + qkv_bias, + qk_scale, + drop_rate, + attn_drop_rate, + drop_path_rate, + norm_layer, + ape, + patch_norm, + use_checkpoint=use_checkpoint, + ) + + self._out_features = cfg.MODEL.SWIN.OUT_FEATURES + + self._out_feature_strides = { + "res2": 4, + "res3": 8, + "res4": 16, + "res5": 32, + } + self._out_feature_channels = { + "res2": self.num_features[0], + "res3": self.num_features[1], + "res4": self.num_features[2], + "res5": self.num_features[3], + } + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert ( + x.dim() == 4 + ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
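+        # Shape reference (illustrative; assumes a Swin-T style config with
+        # embed_dim=96, depths=[2, 2, 6, 2]): a (1, 3, 224, 224) input gives
+        #   "res2": (1,  96, 56, 56)  stride 4
+        #   "res3": (1, 192, 28, 28)  stride 8
+        #   "res4": (1, 384, 14, 14)  stride 16
+        #   "res5": (1, 768,  7,  7)  stride 32
+        # and only the names listed in cfg.MODEL.SWIN.OUT_FEATURES are kept below.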
+ outputs = {} + y = super().forward(x) + for k in y.keys(): + if k in self._out_features: + outputs[k] = y[k] + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + @property + def size_divisibility(self): + return 32 diff --git a/projects/maskdino/modeling/criterion.py b/projects/maskdino/modeling/criterion.py new file mode 100644 index 00000000..bcd51fc9 --- /dev/null +++ b/projects/maskdino/modeling/criterion.py @@ -0,0 +1,442 @@ +# ------------------------------------------------------------------------ +# Copyright (c) IDEA, Inc. and its affiliates. +# Modified from DINO https://github.com/IDEA-Research/DINO by Feng Li and Hao Zhang. +# ------------------------------------------------------------------------ +""" +MaskFormer criterion. +""" +import logging + +import torch +import torch.nn.functional as F +from torch import nn + +from detectron2.utils.comm import get_world_size +from detectron2.projects.point_rend.point_features import ( + get_uncertain_point_coords_with_randomness, + point_sample, +) + +from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list +from projects.maskdino.utils import box_ops + +# from maskdino.maskformer_model import sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + + return loss.mean(1).sum() / num_boxes + +def dice_loss( + inputs: torch.Tensor, + targets: torch.Tensor, + num_masks: float, + ): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(-1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_masks + + +dice_loss_jit = torch.jit.script( + dice_loss +) # type: torch.jit.ScriptModule + + +def sigmoid_ce_loss( + inputs: torch.Tensor, + targets: torch.Tensor, + num_masks: float, + ): + """ + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). 
+ Returns: + Loss tensor + """ + loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + + return loss.mean(1).sum() / num_masks + + +sigmoid_ce_loss_jit = torch.jit.script( + sigmoid_ce_loss +) # type: torch.jit.ScriptModule + + +def calculate_uncertainty(logits): + """ + We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the + foreground class in `classes`. + Args: + logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or + class-agnostic, where R is the total number of predicted masks in all images and C is + the number of foreground classes. The values are logits. + Returns: + scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with + the most uncertain locations having the highest uncertainty score. + """ + assert logits.shape[1] == 1 + gt_class_logits = logits.clone() + return -(torch.abs(gt_class_logits)) + + +class SetCriterion(nn.Module): + """This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, + num_points, oversample_ratio, importance_sample_ratio,dn="no",dn_losses=[], panoptic_on=False, semantic_ce_loss=False): + """Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
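+
+        Example (illustrative sketch; the weights are placeholders rather than tuned
+        values, and `matcher` is assumed to be an already constructed HungarianMatcher):
+            weight_dict = {"loss_ce": 4.0, "loss_mask": 5.0, "loss_dice": 5.0,
+                           "loss_bbox": 5.0, "loss_giou": 2.0}
+            criterion = SetCriterion(num_classes=80, matcher=matcher, weight_dict=weight_dict,
+                                     eos_coef=0.1, losses=["labels", "masks", "boxes"],
+                                     num_points=12544, oversample_ratio=3.0,
+                                     importance_sample_ratio=0.75, dn="seg",
+                                     dn_losses=["labels", "masks", "boxes"])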
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + self.dn=dn + self.dn_losses=dn_losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + + # pointwise mask loss parameters + self.num_points = num_points + self.oversample_ratio = oversample_ratio + self.importance_sample_ratio = importance_sample_ratio + self.focal_alpha = 0.25 + + self.panoptic_on = panoptic_on + self.semantic_ce_loss = semantic_ce_loss + + def loss_labels_ce(self, outputs, targets, indices, num_masks): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert "pred_logits" in outputs + src_logits = outputs["pred_logits"].float() + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {"loss_ce": loss_ce} + return losses + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (Binary focal loss) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1], + dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:,:,:-1] + loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] + losses = {'loss_ce': loss_ce} + + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 
+ """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + losses = {} + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(box_ops.generalized_box_iou( + box_ops.box_cxcywh_to_xyxy(src_boxes), + box_ops.box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + + return losses + + def loss_boxes_panoptic(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + target_labels = torch.cat([t['labels'][i] for t, (_, i) in zip(targets, indices)], dim=0) + isthing=target_labels<80 + target_boxes=target_boxes[isthing] + src_boxes=src_boxes[isthing] + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + losses = {} + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(box_ops.generalized_box_iou( + box_ops.box_cxcywh_to_xyxy(src_boxes), + box_ops.box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + + return losses + + def loss_masks(self, outputs, targets, indices, num_masks): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # No need to upsample predictions as we are using normalized coordinates :) + # N x 1 x H x W + src_masks = src_masks[:, None] + target_masks = target_masks[:, None] + + with torch.no_grad(): + # sample point_coords + point_coords = get_uncertain_point_coords_with_randomness( + src_masks, + lambda logits: calculate_uncertainty(logits), + self.num_points, + self.oversample_ratio, + self.importance_sample_ratio, + ) + # get gt labels + point_labels = point_sample( + target_masks, + point_coords, + align_corners=False, + ).squeeze(1) + + point_logits = point_sample( + src_masks, + point_coords, + align_corners=False, + ).squeeze(1) + + losses = { + "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks), + "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks), + } + + del src_masks + del target_masks + return losses + + def prep_for_dn(self,mask_dict): + output_known_lbs_bboxes = mask_dict['output_known_lbs_bboxes'] + + known_indice = mask_dict['known_indice'] + scalar,pad_size=mask_dict['scalar'],mask_dict['pad_size'] + assert pad_size % scalar==0 + single_pad=pad_size//scalar + + num_tgt = known_indice.numel() + return output_known_lbs_bboxes,num_tgt,single_pad,scalar + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_masks): + loss_map = { + 'labels': self.loss_labels_ce if self.semantic_ce_loss else self.loss_labels, + 'masks': self.loss_masks, + 'boxes': self.loss_boxes_panoptic if self.panoptic_on else self.loss_boxes, + } + assert loss in loss_map, f"do you really want to compute {loss} loss?" + return loss_map[loss](outputs, targets, indices, num_masks) + + def forward(self, outputs, targets,mask_dict=None): + """This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. 
+                      The expected keys in each dict depend on the losses applied, see each loss' doc
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        if self.dn != "no" and mask_dict is not None:
+            output_known_lbs_bboxes, num_tgt, single_pad, scalar = self.prep_for_dn(mask_dict)
+            exc_idx = []
+            for i in range(len(targets)):
+                if len(targets[i]['labels']) > 0:
+                    t = torch.arange(0, len(targets[i]['labels'])).long().cuda()
+                    t = t.unsqueeze(0).repeat(scalar, 1)
+                    tgt_idx = t.flatten()
+                    output_idx = (torch.tensor(range(scalar)) * single_pad).long().cuda().unsqueeze(1) + t
+                    output_idx = output_idx.flatten()
+                else:
+                    output_idx = tgt_idx = torch.tensor([]).long().cuda()
+                exc_idx.append((output_idx, tgt_idx))
+        indices = self.matcher(outputs_without_aux, targets)
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_masks = sum(len(t["labels"]) for t in targets)
+        num_masks = torch.as_tensor(
+            [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
+        )
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_masks)
+        num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            losses.update(self.get_loss(loss, outputs, targets, indices, num_masks))
+
+        if self.dn != "no" and mask_dict is not None:
+            l_dict = {}
+            for loss in self.dn_losses:
+                l_dict.update(self.get_loss(loss, output_known_lbs_bboxes, targets, exc_idx, num_masks * scalar))
+            l_dict = {k + '_dn': v for k, v in l_dict.items()}
+            losses.update(l_dict)
+        elif self.dn != "no":
+            l_dict = dict()
+            l_dict['loss_bbox_dn'] = torch.as_tensor(0.).to('cuda')
+            l_dict['loss_giou_dn'] = torch.as_tensor(0.).to('cuda')
+            l_dict['loss_ce_dn'] = torch.as_tensor(0.).to('cuda')
+            if self.dn == "seg":
+                l_dict['loss_mask_dn'] = torch.as_tensor(0.).to('cuda')
+                l_dict['loss_dice_dn'] = torch.as_tensor(0.).to('cuda')
+            losses.update(l_dict)
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
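+        # Naming convention for the loss dict (see below): auxiliary losses from decoder
+        # layer i are suffixed with "_{i}" (e.g. "loss_ce_0"), their denoising
+        # counterparts with "_dn_{i}", and the intermediate/encoder predictions with
+        # "_interm", mirroring the un-suffixed keys computed above.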
+ if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + if 'interm_outputs' in outputs: + start = 0 + else: + start = 1 + if i>=start: + # if i>=1: + if self.dn != "no" and mask_dict is not None: + out_=output_known_lbs_bboxes['aux_outputs'][i] + l_dict = {} + for loss in self.dn_losses: + l_dict.update( + self.get_loss(loss, out_, targets, exc_idx, num_masks * scalar)) + l_dict = {k + f'_dn_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + # import pdb;pdb.set_trace() + elif self.dn != "no": + l_dict = dict() + l_dict[f'loss_bbox_dn_{i}'] = torch.as_tensor(0.).to('cuda') + l_dict[f'loss_giou_dn_{i}'] = torch.as_tensor(0.).to('cuda') + l_dict[f'loss_ce_dn_{i}'] = torch.as_tensor(0.).to('cuda') + if self.dn == "seg": + l_dict[f'loss_mask_dn_{i}'] = torch.as_tensor(0.).to('cuda') + l_dict[f'loss_dice_dn_{i}'] = torch.as_tensor(0.).to('cuda') + losses.update(l_dict) + # interm_outputs loss + if 'interm_outputs' in outputs: + interm_outputs = outputs['interm_outputs'] + indices = self.matcher(interm_outputs, targets) # cost=["cls", "box"] + for loss in self.losses: + l_dict = self.get_loss(loss, interm_outputs, targets, indices, num_masks) + l_dict = {k + f'_interm': v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + def __repr__(self): + head = "Criterion " + self.__class__.__name__ + body = [ + "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)), + "losses: {}".format(self.losses), + "weight_dict: {}".format(self.weight_dict), + "num_classes: {}".format(self.num_classes), + "eos_coef: {}".format(self.eos_coef), + "num_points: {}".format(self.num_points), + "oversample_ratio: {}".format(self.oversample_ratio), + "importance_sample_ratio: {}".format(self.importance_sample_ratio), + ] + _repr_indent = 4 + lines = [head] + [" " * _repr_indent + line for line in body] + return "\n".join(lines) diff --git a/projects/maskdino/modeling/matcher.py b/projects/maskdino/modeling/matcher.py new file mode 100644 index 00000000..0aa69a45 --- /dev/null +++ b/projects/maskdino/modeling/matcher.py @@ -0,0 +1,233 @@ +# ------------------------------------------------------------------------ +# Copyright (c) IDEA, Inc. and its affiliates. +# Modified from DINO https://github.com/IDEA-Research/DINO by Feng Li and Hao Zhang. +# ------------------------------------------------------------------------ +""" +Modules to compute the matching cost and solve the corresponding LSAP. +""" +import torch +import torch.nn.functional as F +from scipy.optimize import linear_sum_assignment +from torch import nn +from torch.cuda.amp import autocast + +from detectron2.projects.point_rend.point_features import point_sample +from projects.maskdino.utils.box_ops import generalized_box_iou,box_cxcywh_to_xyxy +import random +def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). 
+ """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) + denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] + loss = 1 - (numerator + 1) / (denominator + 1) + return loss + + +batch_dice_loss_jit = torch.jit.script( + batch_dice_loss +) # type: torch.jit.ScriptModule + + +def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): + """ + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + Returns: + Loss tensor + """ + hw = inputs.shape[1] + + pos = F.binary_cross_entropy_with_logits( + inputs, torch.ones_like(inputs), reduction="none" + ) + neg = F.binary_cross_entropy_with_logits( + inputs, torch.zeros_like(inputs), reduction="none" + ) + + loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( + "nc,mc->nm", neg, (1 - targets) + ) + + return loss / hw + + +batch_sigmoid_ce_loss_jit = torch.jit.script( + batch_sigmoid_ce_loss +) # type: torch.jit.ScriptModule + + +class HungarianMatcher(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). + """ + + def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0,cost_box=0,cost_giou=0, panoptic_on=False): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost + cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost + """ + super().__init__() + self.cost_class = cost_class + self.cost_mask = cost_mask + self.cost_dice = cost_dice + self.cost_box = cost_box + self.cost_giou = cost_giou + + self.panoptic_on = panoptic_on + + assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" + + self.num_points = num_points + + @torch.no_grad() + def memory_efficient_forward(self, outputs, targets, cost=["cls", "box", "mask"]): + """More memory-friendly matching""" + bs, num_queries = outputs["pred_logits"].shape[:2] + + indices = [] + # if random.randint(1,1000)<10: + # print("cost match", cost) + + # Iterate through batch size + for b in range(bs): + out_bbox = outputs["pred_boxes"][b] + if 'box' in cost: + # out_bbox=outputs["pred_boxes"][b] + tgt_bbox=targets[b]["boxes"] + cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) + cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + else: + cost_bbox = torch.tensor(0).to(out_bbox) + cost_giou = torch.tensor(0).to(out_bbox) + + # out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] + out_prob = outputs["pred_logits"][b].sigmoid() # [num_queries, num_classes] + tgt_ids = targets[b]["labels"] + # focal loss + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + cost_class = 
pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. + # cost_class = -out_prob[:, tgt_ids] + if 'mask' in cost: + out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] + # gt masks are already padded when preparing target + tgt_mask = targets[b]["masks"].to(out_mask) + + out_mask = out_mask[:, None] + tgt_mask = tgt_mask[:, None] + # all masks share the same set of points for efficient matching! + point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device) + # get gt labels + tgt_mask = point_sample( + tgt_mask, + point_coords.repeat(tgt_mask.shape[0], 1, 1), + align_corners=False, + ).squeeze(1) + + out_mask = point_sample( + out_mask, + point_coords.repeat(out_mask.shape[0], 1, 1), + align_corners=False, + ).squeeze(1) + + with autocast(enabled=False): + out_mask = out_mask.float() + tgt_mask = tgt_mask.float() + # Compute the focal loss between masks + cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) + + # Compute the dice loss betwen masks + cost_dice = batch_dice_loss_jit(out_mask, tgt_mask) + else: + cost_mask = torch.tensor(0).to(out_bbox) + cost_dice = torch.tensor(0).to(out_bbox) + + # Final cost matrix + if self.panoptic_on: + isthing=tgt_ids<80 + cost_bbox[:,~isthing]=cost_bbox[:,isthing].mean() + cost_giou[:,~isthing]=cost_giou[:,isthing].mean() + cost_bbox[cost_bbox.isnan()]=0.0 + cost_giou[cost_giou.isnan()]=0.0 + # cost_class[:,~isthing]=cost_class[:,~isthing]/2.0 + C = ( + self.cost_mask * cost_mask + + self.cost_class * cost_class + + self.cost_dice * cost_dice + + self.cost_box*cost_bbox + +self.cost_giou*cost_giou + ) + C = C.reshape(num_queries, -1).cpu() + # C = ( + # self.cost_mask * cost_mask + # + self.cost_class * cost_class + # + self.cost_dice * cost_dice + # + self.cost_box*cost_bbox + # +self.cost_giou*cost_giou + # ) + # C = C.reshape(num_queries, -1).cpu() + + indices.append(linear_sum_assignment(C)) + + return [ + (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) + for i, j in indices + ] + + @torch.no_grad() + def forward(self, outputs, targets, cost=["cls", "box", "mask"]): + """Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + return self.memory_efficient_forward(outputs, targets, cost) + + def __repr__(self, _repr_indent=4): + head = "Matcher " + self.__class__.__name__ + body = [ + "cost_class: {}".format(self.cost_class), + "cost_mask: {}".format(self.cost_mask), 
+ "cost_dice: {}".format(self.cost_dice), + ] + lines = [head] + [" " * _repr_indent + line for line in body] + return "\n".join(lines) diff --git a/projects/maskdino/modeling/meta_arch/__init__.py b/projects/maskdino/modeling/meta_arch/__init__.py new file mode 100644 index 00000000..c05f75a0 --- /dev/null +++ b/projects/maskdino/modeling/meta_arch/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) IDEA, Inc. and its affiliates. + diff --git a/projects/maskdino/modeling/meta_arch/maskdino_head.py b/projects/maskdino/modeling/meta_arch/maskdino_head.py new file mode 100644 index 00000000..8dc3291b --- /dev/null +++ b/projects/maskdino/modeling/meta_arch/maskdino_head.py @@ -0,0 +1,65 @@ +# Copyright (c) IDEA, Inc. and its affiliates. +# Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. +# ------------------------------------------------------------------------------ +import logging +from typing import Callable, Dict, List, Optional, Tuple, Union + +from torch import nn + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer_decoder.maskdino_decoder import build_transformer_decoder +from ..pixel_decoder.maskdino_encoder import build_pixel_decoder + + +@SEM_SEG_HEADS_REGISTRY.register() +class MaskDINOHead(nn.Module): + + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + num_classes: int, + pixel_decoder: nn.Module, + loss_weight: float = 1.0, + ignore_value: int = -1, + # extra parameters + transformer_predictor: nn.Module, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + num_classes: number of classes to predict + pixel_decoder: the pixel decoder module + loss_weight: loss weight + ignore_value: category id to be ignored during training. + transformer_predictor: the transformer decoder that makes prediction + transformer_in_feature: input feature name to the transformer_predictor + """ + super().__init__() + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] + feature_strides = [v.stride for k, v in input_shape] + feature_channels = [v.channels for k, v in input_shape] + + self.ignore_value = ignore_value + self.common_stride = 4 + self.loss_weight = loss_weight + + self.pixel_decoder = pixel_decoder + self.predictor = transformer_predictor + + self.num_classes = num_classes + + def forward(self, features, mask=None,targets=None): + return self.layers(features, mask,targets=targets) + + def layers(self, features, mask=None,targets=None): + mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features, mask) + + predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets) + + return predictions diff --git a/projects/maskdino/modeling/pixel_decoder/__init__.py b/projects/maskdino/modeling/pixel_decoder/__init__.py new file mode 100644 index 00000000..0ff5bdb5 --- /dev/null +++ b/projects/maskdino/modeling/pixel_decoder/__init__.py @@ -0,0 +1 @@ +# Copyright (c) IDEA, Inc. and its affiliates. 
diff --git a/projects/maskdino/modeling/pixel_decoder/maskdino_encoder.py b/projects/maskdino/modeling/pixel_decoder/maskdino_encoder.py new file mode 100644 index 00000000..e25d0001 --- /dev/null +++ b/projects/maskdino/modeling/pixel_decoder/maskdino_encoder.py @@ -0,0 +1,403 @@ +# Copyright (c) IDEA, Inc. and its affiliates. +# Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. +import logging +import numpy as np +from typing import Callable, Dict, List, Optional, Tuple, Union +import fvcore.nn.weight_init as weight_init + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ +from torch.cuda.amp import autocast + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from .position_encoding import PositionEmbeddingSine +from ...utils.utils import _get_clones, _get_activation_fn +# from .ops.modules import MSDeformAttn +from detrex.layers import MultiScaleDeformableAttention +def build_pixel_decoder(cfg, input_shape): + """ + Build a pixel decoder from `cfg.MODEL.MaskDINO.PIXEL_DECODER_NAME`. + """ + name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME + model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) + forward_features = getattr(model, "forward_features", None) + if not callable(forward_features): + raise ValueError( + "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. " + f"Please implement forward_features for {name} to only return mask features." + ) + return model + +# MSDeformAttn Transformer encoder in deformable detr +class MSDeformAttnTransformerEncoderOnly(nn.Module): + def __init__(self, d_model=256, nhead=8, + num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, + activation="relu", + num_feature_levels=4, enc_n_points=4,): + super().__init__() + + self.d_model = d_model + self.nhead = nhead + + encoder_layer = MSDeformAttnTransformerEncoderLayer(d_model, dim_feedforward, + dropout, activation, + num_feature_levels, nhead, enc_n_points) + self.encoder = MSDeformAttnTransformerEncoder(encoder_layer, num_encoder_layers) + + self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def forward(self, srcs, masks, pos_embeds): + # import ipdb; ipdb.set_trace() + enable_mask=0 + if masks is not None: + for src in srcs: + if src.size(2)%32 or src.size(3)%32: + enable_mask = 1 + if enable_mask==0: + masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs] + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + src = src.flatten(2).transpose(1, 2) 
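+ # flatten (bs, c, h, w) -> (bs, h*w, c): each level becomes a token sequence for deformable attention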
+ mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + src_flatten = torch.cat(src_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # encoder + memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten) + + return memory, spatial_shapes, level_start_index + + +class MSDeformAttnTransformerEncoderLayer(nn.Module): + def __init__(self, + d_model=256, d_ffn=1024, + dropout=0.1, activation="relu", + n_levels=4, n_heads=8, n_points=4): + super().__init__() + + # self attention + self.self_attn = MultiScaleDeformableAttention( + embed_dim=d_model, num_levels=n_levels, + num_heads=n_heads, num_points=n_points, + batch_first = True,dropout=dropout, + ) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): + # self attention + src2 = self.self_attn(query=src,query_pos=pos, reference_points=reference_points, value=src,spatial_shapes= spatial_shapes, level_start_index=level_start_index, key_padding_mask=padding_mask) + src =src2 + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class MSDeformAttnTransformerEncoder(nn.Module): + def __init__(self, encoder_layer, num_layers): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None): + output = src + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) + for _, layer in enumerate(self.layers): + output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) + + return output + + 
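To make the flattened multi-level bookkeeping concrete, a small standalone sketch (feature sizes invented for illustration) of how level_start_index is derived from spatial_shapes in the encoder forward above:

import torch

# Three feature levels, e.g. 8x8, 16x16 and 32x32 maps.
spatial_shapes = torch.as_tensor([[8, 8], [16, 16], [32, 32]], dtype=torch.long)

# Same expression as in MSDeformAttnTransformerEncoderOnly.forward: each level contributes
# H*W tokens, and level_start_index marks where each level begins in the flattened
# (bs, sum{H*W}, C) sequence.
level_start_index = torch.cat(
    (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])
)
print(level_start_index)  # tensor([  0,  64, 320])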
+@SEM_SEG_HEADS_REGISTRY.register() +class MaskDINOEncoder(nn.Module): + """ + This is the multi-scale encoder in detection models, also named as pixel decoder in segmentation models. + """ + # @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + transformer_dropout: float, + transformer_nheads: int, + transformer_dim_feedforward: int, + transformer_enc_layers: int, + conv_dim: int, + mask_dim: int, + norm: Optional[Union[str, Callable]] = None, + # deformable transformer encoder args + transformer_in_features: List[str], + common_stride: int, + num_feature_levels: int, + total_num_feature_levels: int, + feature_order: str, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + transformer_dropout: dropout probability in transformer + transformer_nheads: number of heads in transformer + transformer_dim_feedforward: dimension of feedforward network + transformer_enc_layers: number of transformer encoder layers + conv_dims: number of output channels for the intermediate conv layers. + mask_dim: number of output channels for the final conv layer. + norm (str or callable): normalization for all conv layers + """ + super().__init__() + transformer_input_shape = { + k: v for k, v in input_shape.items() if k in transformer_in_features + } + # this is the input shape of pixel decoder + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" + self.feature_strides = [v.stride for k, v in input_shape] + self.feature_channels = [v.channels for k, v in input_shape] + self.feature_order = feature_order + + # this is the input shape of transformer encoder (could use less features than pixel decoder + + if feature_order == "low2high": + transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: -x[1].stride) + else: + transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride) + self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5" + transformer_in_channels = [v.channels for k, v in transformer_input_shape] + self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape] # to decide extra FPN layers + + self.maskdino_num_feature_levels = num_feature_levels # always use 3 scales + self.total_num_feature_levels = total_num_feature_levels + self.common_stride = common_stride + + self.transformer_num_feature_levels = len(self.transformer_in_features) + self.low_resolution_index = transformer_in_channels.index(max(transformer_in_channels)) + self.high_resolution_index = 0 if self.feature_order == 'low2high' else -1 + if self.transformer_num_feature_levels > 1: + input_proj_list = [] + # from low resolution to high resolution (res5 -> res2) + for in_channels in transformer_in_channels[::-1]: + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, conv_dim, kernel_size=1), + nn.GroupNorm(32, conv_dim), + )) + # downsample + in_channels = max(transformer_in_channels) + for _ in range(self.total_num_feature_levels - self.transformer_num_feature_levels): # exclude the res2 + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, conv_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, conv_dim), + )) + in_channels = conv_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(transformer_in_channels[-1], 
conv_dim, kernel_size=1), + nn.GroupNorm(32, conv_dim), + )]) + + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + self.transformer = MSDeformAttnTransformerEncoderOnly( + d_model=conv_dim, + dropout=transformer_dropout, + nhead=transformer_nheads, + dim_feedforward=transformer_dim_feedforward, + num_encoder_layers=transformer_enc_layers, + num_feature_levels=self.total_num_feature_levels, + ) + N_steps = conv_dim // 2 + self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) + + self.mask_dim = mask_dim + # use 1x1 conv instead + self.mask_features = Conv2d( + conv_dim, + mask_dim, + kernel_size=1, + stride=1, + padding=0, + ) + weight_init.c2_xavier_fill(self.mask_features) + + # extra fpn levels + stride = min(self.transformer_feature_strides) + self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride)) + + lateral_convs = [] + output_convs = [] + + use_bias = norm == "" + for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]): + lateral_norm = get_norm(norm, conv_dim) + output_norm = get_norm(norm, conv_dim) + + lateral_conv = Conv2d( + in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm + ) + output_conv = Conv2d( + conv_dim, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(lateral_conv) + weight_init.c2_xavier_fill(output_conv) + self.add_module("adapter_{}".format(idx + 1), lateral_conv) + self.add_module("layer_{}".format(idx + 1), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. 
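+ # note: add_module registers these as adapter_{idx+1} / layer_{idx+1}, which is how they appear in the state_dict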
+ self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + + @autocast(enabled=False) + def forward_features(self, features, masks): + srcsl = [] + srcs = [] + posl = [] + pos = [] + if self.total_num_feature_levels > self.transformer_num_feature_levels: + smallest_feat = features[self.transformer_in_features[self.low_resolution_index]].float() + _len_srcs = self.transformer_num_feature_levels + for l in range(_len_srcs, self.total_num_feature_levels): + if l == _len_srcs: + src = self.input_proj[l](smallest_feat) + else: + src = self.input_proj[l](srcsl[-1]) + srcsl.append(src) + posl.append(self.pe_layer(src)) + srcsl = srcsl[::-1] + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.transformer_in_features[::-1]): + x = features[f].float() # deformable detr does not support half precision + srcs.append(self.input_proj[idx](x)) + pos.append(self.pe_layer(x)) + # import ipdb; ipdb.set_trace() + srcs.extend(srcsl) if self.feature_order == 'low2high' else srcsl.extend(srcs) + pos.extend(posl) if self.feature_order == 'low2high' else posl.extend(pos) + if self.feature_order != 'low2high': + srcs = srcsl + pos = posl + # import ipdb; ipdb.set_trace() + y, spatial_shapes, level_start_index = self.transformer(srcs, masks, pos) + bs = y.shape[0] + + split_size_or_sections = [None] * self.total_num_feature_levels + # import ipdb; ipdb.set_trace() + for i in range(self.total_num_feature_levels): + if i < self.total_num_feature_levels - 1: + split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i] + else: + split_size_or_sections[i] = y.shape[1] - level_start_index[i] + y = torch.split(y, split_size_or_sections, dim=1) + + out = [] + multi_scale_features = [] + num_cur_levels = 0 + for i, z in enumerate(y): + out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1])) + + # append `out` with extra FPN levels + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): + x = features[f].float() + lateral_conv = self.lateral_convs[idx] + output_conv = self.output_convs[idx] + cur_fpn = lateral_conv(x) + # Following FPN implementation, we use nearest upsampling here + y = cur_fpn + F.interpolate(out[self.high_resolution_index], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False) + y = output_conv(y) + out.append(y) + # import ipdb; ipdb.set_trace() + for o in out: + if num_cur_levels < self.total_num_feature_levels: + multi_scale_features.append(o) + num_cur_levels += 1 + # import ipdb; ipdb.set_trace() + return self.mask_features(out[-1]), out[0], multi_scale_features + diff --git a/projects/maskdino/modeling/pixel_decoder/position_encoding.py b/projects/maskdino/modeling/pixel_decoder/position_encoding.py new file mode 100644 index 00000000..95f8abe0 --- /dev/null +++ b/projects/maskdino/modeling/pixel_decoder/position_encoding.py @@ -0,0 +1,64 @@ +# Copyright (c) IDEA, Inc. and its affiliates. +# Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. +""" +Various positional encodings for the transformer. +""" +import math + +import torch +from torch import nn + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, x, mask=None): + if mask is None: + mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (torch.div(dim_t, 2,rounding_mode='trunc')) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self, _repr_indent=4): + head = "Positional encoding " + self.__class__.__name__ + body = [ + "num_pos_feats: {}".format(self.num_pos_feats), + "temperature: {}".format(self.temperature), + "normalize: {}".format(self.normalize), + "scale: {}".format(self.scale), + ] + # _repr_indent = 4 + lines = [head] + [" " * _repr_indent + line for line in body] + return "\n".join(lines) diff --git a/projects/maskdino/modeling/transformer_decoder/__init__.py b/projects/maskdino/modeling/transformer_decoder/__init__.py new file mode 100644 index 00000000..a49f91a6 --- /dev/null +++ b/projects/maskdino/modeling/transformer_decoder/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) IDEA, Inc. and its affiliates. +from .maskdino_decoder import MaskDINODecoder + diff --git a/projects/maskdino/modeling/transformer_decoder/dino_decoder.py b/projects/maskdino/modeling/transformer_decoder/dino_decoder.py new file mode 100644 index 00000000..8eca9bed --- /dev/null +++ b/projects/maskdino/modeling/transformer_decoder/dino_decoder.py @@ -0,0 +1,289 @@ +# ------------------------------------------------------------------------ +# Copyright (c) IDEA, Inc. and its affiliates. +# Modified from DINO https://github.com/IDEA-Research/DINO by Feng Li and Hao Zhang. 
+# ------------------------------------------------------------------------ + +from typing import Optional, List, Union +import torch +from torch import nn, Tensor +from torch.cuda.amp import autocast + +from ...utils.utils import MLP, _get_clones, _get_activation_fn, gen_sineembed_for_position, inverse_sigmoid +from detrex.layers import MultiScaleDeformableAttention + + +class TransformerDecoder(nn.Module): + + def __init__(self, decoder_layer, num_layers, norm=None, + return_intermediate=False, + d_model=256, query_dim=4, + modulate_hw_attn=True, + num_feature_levels=1, + deformable_decoder=True, + decoder_query_perturber=None, + dec_layer_number=None, # number of queries each layer in decoder + rm_dec_query_scale=True, + dec_layer_share=False, + dec_layer_dropout_prob=None, + ): + super().__init__() + if num_layers > 0: + self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share) + else: + self.layers = [] + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + assert return_intermediate, "support return_intermediate only" + self.query_dim = query_dim + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) + self.num_feature_levels = num_feature_levels + + self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) + if not deformable_decoder: + self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2) + else: + self.query_pos_sine_scale = None + + if rm_dec_query_scale: + self.query_scale = None + else: + raise NotImplementedError + self.query_scale = MLP(d_model, d_model, d_model, 2) + self.bbox_embed = None + self.class_embed = None + + self.d_model = d_model + self.modulate_hw_attn = modulate_hw_attn + self.deformable_decoder = deformable_decoder + + if not deformable_decoder and modulate_hw_attn: + self.ref_anchor_head = MLP(d_model, d_model, 2, 2) + else: + self.ref_anchor_head = None + + self.decoder_query_perturber = decoder_query_perturber + self.box_pred_damping = None + + self.dec_layer_number = dec_layer_number + if dec_layer_number is not None: + assert isinstance(dec_layer_number, list) + assert len(dec_layer_number) == num_layers + # assert dec_layer_number[0] == + + self.dec_layer_dropout_prob = dec_layer_dropout_prob + if dec_layer_dropout_prob is not None: + assert isinstance(dec_layer_dropout_prob, list) + assert len(dec_layer_dropout_prob) == num_layers + for i in dec_layer_dropout_prob: + assert 0.0 <= i <= 1.0 + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2 + # for memory + level_start_index: Optional[Tensor] = None, # num_levels + spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[Tensor] = None, + + ): + """ + Input: + - tgt: nq, bs, d_model + - memory: hw, bs, d_model + - pos: hw, bs, d_model + - refpoints_unsigmoid: nq, bs, 2/4 + - valid_ratios/spatial_shapes: bs, nlevel, 2 + """ + output = tgt + + intermediate = [] + reference_points = refpoints_unsigmoid.sigmoid() + ref_points = [reference_points] + + for layer_id, layer in 
enumerate(self.layers): + # preprocess ref points + if self.training and self.decoder_query_perturber is not None and layer_id != 0: + reference_points = self.decoder_query_perturber(reference_points) + + reference_points_input = reference_points[:, :, None] \ + * torch.cat([valid_ratios, valid_ratios], -1)[None, :] # nq, bs, nlevel, 4 + query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # nq, bs, 256*2 + + raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 + query_pos = pos_scale * raw_query_pos + + output = layer( + tgt=output, + tgt_query_pos=query_pos, + tgt_query_sine_embed=query_sine_embed, + tgt_key_padding_mask=tgt_key_padding_mask, + tgt_reference_points=reference_points_input, + + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + memory_level_start_index=level_start_index, + memory_spatial_shapes=spatial_shapes, + memory_pos=pos, + + self_attn_mask=tgt_mask, + cross_attn_mask=memory_mask + ) + + # iter update + if self.bbox_embed is not None: + reference_before_sigmoid = inverse_sigmoid(reference_points) + delta_unsig = self.bbox_embed[layer_id](output) + outputs_unsig = delta_unsig + reference_before_sigmoid + new_reference_points = outputs_unsig.sigmoid() + + reference_points = new_reference_points.detach() + # if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + + intermediate.append(self.norm(output)) + + return [ + [itm_out.transpose(0, 1) for itm_out in intermediate], + [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points] + ] + + +class DeformableTransformerDecoderLayer(nn.Module): + + def __init__(self, d_model=256, d_ffn=1024, + dropout=0.1, activation="relu", + n_levels=4, n_heads=8, n_points=4, + use_deformable_box_attn=False, + key_aware_type=None, + ): + super().__init__() + + # cross attention + if use_deformable_box_attn: + raise NotImplementedError + else: + self.cross_attn = MultiScaleDeformableAttention( + embed_dim=d_model, num_levels=n_levels, + num_heads=n_heads, num_points=n_points, + batch_first=True,dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + self.key_aware_type = key_aware_type + self.key_aware_proj = None + + def rm_self_attn_modules(self): + self.self_attn = None + self.dropout2 = None + self.norm2 = None + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + @autocast(enabled=False) + def forward(self, + # for tgt + tgt: Optional[Tensor], # nq, bs, d_model + tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. 
Sine(pos) + tgt_key_padding_mask: Optional[Tensor] = None, + tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4 + + # for memory + memory: Optional[Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[Tensor] = None, + memory_level_start_index: Optional[Tensor] = None, # num_levels + memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[Tensor] = None, # pos for memory + + # sa + self_attn_mask: Optional[Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention + ): + """ + Input: + - tgt/tgt_query_pos: nq, bs, d_model + - + """ + # self attention + if self.self_attn is not None: + q = k = self.with_pos_embed(tgt, tgt_query_pos) + tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # cross attention + if self.key_aware_type is not None: + if self.key_aware_type == 'mean': + tgt = tgt + memory.mean(0, keepdim=True) + elif self.key_aware_type == 'proj_mean': + tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True) + else: + raise NotImplementedError("Unknown key_aware_type: {}".format(self.key_aware_type)) + tgt2 = self.cross_attn(query=tgt.transpose(0, 1), query_pos=tgt_query_pos.transpose(0, 1), + reference_points=tgt_reference_points.transpose(0, 1).contiguous(), + value=memory.transpose(0, 1), spatial_shapes=memory_spatial_shapes, level_start_index=memory_level_start_index, + key_padding_mask=memory_key_padding_mask).transpose(0, 1) + tgt = tgt2 + tgt = self.norm1(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +# def _get_clones(module, N, layer_share=False): +# # import ipdb; ipdb.set_trace() +# if layer_share: +# return nn.ModuleList([module for i in range(N)]) +# else: +# return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) +# +# +# def _get_activation_fn_(activation): +# """Return an activation function given a string""" +# if activation == "relu": +# return F.relu +# if activation == "gelu": +# return F.gelu +# if activation == "glu": +# return F.glu +# raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + diff --git a/projects/maskdino/modeling/transformer_decoder/maskdino_decoder.py b/projects/maskdino/modeling/transformer_decoder/maskdino_decoder.py new file mode 100644 index 00000000..78017a28 --- /dev/null +++ b/projects/maskdino/modeling/transformer_decoder/maskdino_decoder.py @@ -0,0 +1,480 @@ +# Copyright (c) IDEA, Inc. and its affiliates. +# Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. +import logging +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d +from detectron2.utils.registry import Registry +from detectron2.structures import BitMasks + +from .dino_decoder import TransformerDecoder, DeformableTransformerDecoderLayer +from ...utils.utils import MLP, gen_encoder_output_proposals, inverse_sigmoid +from ...utils import box_ops + + +TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") +TRANSFORMER_DECODER_REGISTRY.__doc__ = """ +Registry for transformer module in MaskDINO. +""" + + +def build_transformer_decoder(cfg, in_channels, mask_classification=True): + """ + Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 
+ """ + name = cfg.MODEL.MaskDINO.TRANSFORMER_DECODER_NAME + return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) + + +@TRANSFORMER_DECODER_REGISTRY.register() +class MaskDINODecoder(nn.Module): + + def __init__( + self, + in_channels, + mask_classification=True, + *, + num_classes: int, + hidden_dim: int, + num_queries: int, + nheads: int, + dim_feedforward: int, + dec_layers: int, + mask_dim: int, + enforce_input_project: bool, + two_stage: bool, + dn: str, + noise_scale:float, + dn_num:int, + initialize_box_type:bool, + initial_pred:bool, + learn_tgt: bool, + total_num_feature_levels: int = 4, + dropout: float = 0.0, + activation: str = 'relu', + nhead: int = 8, + dec_n_points: int = 4, + return_intermediate_dec: bool = True, + query_dim: int = 4, + dec_layer_share: bool = False, + semantic_ce_loss: bool = False, + ): + """ + NOTE: this interface is experimental. + Args: + in_channels: channels of the input features + mask_classification: whether to add mask classifier or not + num_classes: number of classes + hidden_dim: Transformer feature dimension + num_queries: number of queries + nheads: number of heads + dim_feedforward: feature dimension in feedforward network + enc_layers: number of Transformer encoder layers + dec_layers: number of Transformer decoder layers + pre_norm: whether to use pre-LayerNorm or not + mask_dim: mask feature dimension + enforce_input_project: add input project 1x1 conv even if input + channels and hidden dim is identical + d_model: transformer dimension + dim_feedforward: feed forward hidden dimension + dropout: dropout rate + activation: activation function + nhead: num heads in multi-head attention + dec_n_points: number of sampling points in decoder + return_intermediate_dec: return the intermediate results of decoder + query_dim: 4 -> (x, y, w, h) + dec_layer_share: whether to share each decoder layer + """ + super().__init__() + + assert mask_classification, "Only support mask classification model" + self.mask_classification = mask_classification + self.num_feature_levels = total_num_feature_levels + self.initial_pred = initial_pred + + # define Transformer decoder here + self.dn=dn + self.learn_tgt = learn_tgt + self.noise_scale=noise_scale + self.dn_num=dn_num + self.num_heads = nheads + self.num_layers = dec_layers + self.two_stage=two_stage + self.initialize_box_type = initialize_box_type + self.total_num_feature_levels = total_num_feature_levels + + self.num_queries = num_queries + self.semantic_ce_loss = semantic_ce_loss + # learnable query features + if not two_stage or self.learn_tgt: + self.query_feat = nn.Embedding(num_queries, hidden_dim) + if not two_stage and initialize_box_type == 'no': + self.query_embed = nn.Embedding(num_queries, 4) + if two_stage: + self.enc_output = nn.Linear(hidden_dim, hidden_dim) + self.enc_output_norm = nn.LayerNorm(hidden_dim) + + self.input_proj = nn.ModuleList() + for _ in range(self.num_feature_levels): + if in_channels != hidden_dim or enforce_input_project: + self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) + weight_init.c2_xavier_fill(self.input_proj[-1]) + else: + self.input_proj.append(nn.Sequential()) + self.num_classes=num_classes + # output FFNs + assert self.mask_classification, "why not class embedding?" 
+ if self.mask_classification: + if self.semantic_ce_loss: + self.class_embed = nn.Linear(hidden_dim, num_classes+1) + else: + self.class_embed = nn.Linear(hidden_dim, num_classes) + self.label_enc=nn.Embedding(num_classes,hidden_dim) + self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) + + # init decoder + self.decoder_norm = decoder_norm = nn.LayerNorm(hidden_dim) + decoder_layer = DeformableTransformerDecoderLayer(hidden_dim, dim_feedforward, + dropout, activation, + self.num_feature_levels, nhead, dec_n_points) + self.decoder = TransformerDecoder(decoder_layer, self.num_layers, decoder_norm, + return_intermediate=return_intermediate_dec, + d_model=hidden_dim, query_dim=query_dim, + num_feature_levels=self.num_feature_levels, + dec_layer_share=dec_layer_share, + ) + + self.hidden_dim = hidden_dim + _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) + box_embed_layerlist = [_bbox_embed for i in range(self.num_layers)] # share box prediction each layer + bbox_embed = nn.ModuleList(box_embed_layerlist) + self.decoder.bbox_embed = bbox_embed + + def prepare_for_dn(self, targets, tgt, refpoint_emb, batch_size): + if self.training: + scalar, noise_scale = self.dn_num,self.noise_scale + + known = [(torch.ones_like(t['labels'])).cuda() for t in targets] + know_idx = [torch.nonzero(t) for t in known] + known_num = [sum(k) for k in known] + + if max(known_num)>0: + scalar=scalar//(int(max(known_num))) + else: + scalar=0 + if scalar==0: + input_query_label = None + input_query_bbox = None + attn_mask = None + mask_dict = None + + return input_query_label, input_query_bbox, attn_mask, mask_dict + + unmask_bbox = unmask_label = torch.cat(known) + labels = torch.cat([t['labels'] for t in targets]) + boxes = torch.cat([t['boxes'] for t in targets]) + batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)]) + + # 知道label + known_label_indice = torch.nonzero(unmask_label) + known_label_indice = known_label_indice.view(-1) + + # 知道bbox + known_bbox_indice = torch.nonzero(unmask_bbox) + known_bbox_indice = known_bbox_indice.view(-1) + + # 知道其中一个 + known_indice = torch.nonzero(unmask_label + unmask_bbox) + known_indice = known_indice.view(-1) + + # 多加noise + known_indice = known_indice.repeat(scalar, 1).view(-1) + known_labels = labels.repeat(scalar, 1).view(-1) + known_bid = batch_idx.repeat(scalar, 1).view(-1) + known_bboxs = boxes.repeat(scalar, 1) + known_labels_expaned = known_labels.clone() + known_bbox_expand = known_bboxs.clone() + + ############ noise on the label + if noise_scale > 0: + p = torch.rand_like(known_labels_expaned.float()) + chosen_indice = torch.nonzero(p < (noise_scale * 0.5)).view(-1) # half of bbox prob + new_label = torch.randint_like(chosen_indice, 0, self.num_classes) # randomly put a new one here + known_labels_expaned.scatter_(0, chosen_indice, new_label) + if noise_scale > 0: + diff = torch.zeros_like(known_bbox_expand) + diff[:, :2] = known_bbox_expand[:, 2:] / 2 + diff[:, 2:] = known_bbox_expand[:, 2:] + known_bbox_expand += torch.mul((torch.rand_like(known_bbox_expand) * 2 - 1.0), + diff).cuda() * noise_scale + # known_bbox_expand+=(torch.rand_like(known_bbox_expand)*2-1.0)*torch.tensor([[1,1,0.1,0.1]]).cuda()*noise_scale + known_bbox_expand = known_bbox_expand.clamp(min=0.0, max=1.0) + + m = known_labels_expaned.long().to('cuda') + input_label_embed = self.label_enc(m) + input_bbox_embed = 
inverse_sigmoid(known_bbox_expand) + + single_pad = int(max(known_num)) + + pad_size = int(single_pad * scalar) + + padding_label = torch.zeros(pad_size, self.hidden_dim).cuda() + padding_bbox = torch.zeros(pad_size, 4).cuda() + + if not refpoint_emb is None: + input_query_label = torch.cat([padding_label, tgt], dim=0).repeat(batch_size, 1, 1) + input_query_bbox = torch.cat([padding_bbox, refpoint_emb], dim=0).repeat(batch_size, 1, 1) + else: + input_query_label=padding_label.repeat(batch_size, 1, 1) + input_query_bbox = padding_bbox.repeat(batch_size, 1, 1) + + # 按顺序map + map_known_indice = torch.tensor([]).to('cuda') + if len(known_num): + map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3] + map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(scalar)]).long() + # map_known_indice.append(list(range)) + if len(known_bid): + # known_bid: [1,1,2,2,2, 1,1,2,2,2] + input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed + # map to [1,2,-,4,5,-;,1,2,3,4,5,6;] + input_query_bbox[(known_bid.long(), map_known_indice)] = input_bbox_embed + + tgt_size = pad_size + self.num_queries + attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0 + # match query cannot see the reconstruct + attn_mask[pad_size:, :pad_size] = True + # reconstruct cannot see each other + for i in range(scalar): + if i == 0: + attn_mask[single_pad * i:single_pad * (i + 1), single_pad * (i + 1):pad_size] = True + if i == scalar - 1: + attn_mask[single_pad * i:single_pad * (i + 1), :single_pad * i] = True + else: + attn_mask[single_pad * i:single_pad * (i + 1), single_pad * (i + 1):pad_size] = True + attn_mask[single_pad * i:single_pad * (i + 1), :single_pad * i] = True + mask_dict = { + 'known_indice': torch.as_tensor(known_indice).long(), + 'batch_idx': torch.as_tensor(batch_idx).long(), + 'map_known_indice': torch.as_tensor(map_known_indice).long(), + 'known_lbs_bboxes': (known_labels, known_bboxs), + 'know_idx': know_idx, + 'pad_size': pad_size, + 'scalar': scalar, + } + else: + if not refpoint_emb is None: + input_query_label = tgt.repeat(batch_size, 1, 1) + input_query_bbox = refpoint_emb.repeat(batch_size, 1, 1) + else: + input_query_label=None + input_query_bbox=None + attn_mask = None + mask_dict=None + + # 100*batch*256 + if not input_query_bbox is None: + input_query_label = input_query_label + input_query_bbox = input_query_bbox + + return input_query_label,input_query_bbox,attn_mask,mask_dict + + def dn_post_process(self,outputs_class,outputs_coord,mask_dict,outputs_mask): + assert mask_dict['pad_size'] > 0 + output_known_class = outputs_class[:, :, :mask_dict['pad_size'], :] + outputs_class = outputs_class[:, :, mask_dict['pad_size']:, :] + output_known_coord = outputs_coord[:, :, :mask_dict['pad_size'], :] + outputs_coord = outputs_coord[:, :, mask_dict['pad_size']:, :] + if outputs_mask is not None: + output_known_mask = outputs_mask[:, :, :mask_dict['pad_size'], :] + outputs_mask = outputs_mask[:, :, mask_dict['pad_size']:, :] + out = {'pred_logits': output_known_class[-1], 'pred_boxes': output_known_coord[-1],'pred_masks': output_known_mask[-1]} + + out['aux_outputs'] = self._set_aux_loss(output_known_class, output_known_mask,output_known_coord) + mask_dict['output_known_lbs_bboxes']=out + return outputs_class, outputs_coord, outputs_mask + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + 
valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def pred_box(self,reference, hs, ref0=None): + if ref0 is None: + outputs_coord_list = [] + else: + outputs_coord_list = [ref0] + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate(zip(reference[:-1], self.decoder.bbox_embed, hs)): + layer_delta_unsig = layer_bbox_embed(layer_hs) + layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig) + layer_outputs_unsig = layer_outputs_unsig.sigmoid() + outputs_coord_list.append(layer_outputs_unsig) + outputs_coord_list = torch.stack(outputs_coord_list) + return outputs_coord_list + + def forward(self, x, mask_features, masks, targets=None): + # x is a list of multi-scale feature + assert len(x) == self.num_feature_levels + size_list = [] + + # disable mask, it does not affect performance + enable_mask = 0 + if masks is not None: + for src in x: + if src.size(2) % 32 or src.size(3) % 32: + enable_mask = 1 + if enable_mask == 0: + masks = [torch.zeros((src.size(0), src.size(2), src.size(3)), device=src.device, dtype=torch.bool) for src in x] + src_flatten = [] + mask_flatten = [] + spatial_shapes = [] + for i in range(self.num_feature_levels): + idx=self.num_feature_levels-1-i + bs, c , h, w=x[idx].shape + size_list.append(x[i].shape[-2:]) + spatial_shapes.append(x[idx].shape[-2:]) + src_flatten.append(self.input_proj[idx](x[idx]).flatten(2).transpose(1, 2)) + mask_flatten.append(masks[i].flatten(1)) + # flatten NxCxHxW to HWxNxC + src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c + mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw} + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + predictions_class = [] + predictions_mask = [] + if self.two_stage: + output_memory, output_proposals = gen_encoder_output_proposals(src_flatten, mask_flatten, spatial_shapes) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + enc_outputs_class_unselected = self.class_embed(output_memory) + enc_outputs_coord_unselected = self.decoder.bbox_embed[0]( + output_memory) + output_proposals # (bs, \sum{hw}, 4) unsigmoid + topk = self.num_queries + topk_proposals = torch.topk(enc_outputs_class_unselected.max(-1)[0], topk, dim=1)[1] + refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) # unsigmoid + refpoint_embed = refpoint_embed_undetach.detach() + + tgt_undetach = torch.gather(output_memory, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, self.hidden_dim)) # unsigmoid + + outputs_class, outputs_mask = self.forward_prediction_heads(tgt_undetach.transpose(0, 1), mask_features) + tgt = tgt_undetach.detach() + if self.learn_tgt: + tgt = self.query_feat.weight[None].repeat(bs, 1, 1) + interm_outputs=dict() + interm_outputs['pred_logits'] = outputs_class + interm_outputs['pred_boxes'] = refpoint_embed_undetach.sigmoid() + interm_outputs['pred_masks'] = outputs_mask + + if self.initialize_box_type != 'no': + # convert masks into boxes to better initialize box in the decoder + assert self.initial_pred + flaten_mask = outputs_mask.detach().flatten(0, 1) + h, w = outputs_mask.shape[-2:] + if self.initialize_box_type == 'bitmask': # slower, but more accurate + refpoint_embed = BitMasks(flaten_mask > 
0).get_bounding_boxes().tensor.cuda() + elif self.initialize_box_type == 'mask2box': # faster conversion + refpoint_embed = box_ops.masks_to_boxes(flaten_mask > 0).cuda() + else: + assert NotImplementedError + refpoint_embed = box_ops.box_xyxy_to_cxcywh(refpoint_embed) / torch.as_tensor([w, h, w, h], + dtype=torch.float).cuda() + refpoint_embed = refpoint_embed.reshape(outputs_mask.shape[0], outputs_mask.shape[1], 4) + refpoint_embed = inverse_sigmoid(refpoint_embed) + elif not self.two_stage: + tgt = self.query_feat.weight[None].repeat(bs, 1, 1) + refpoint_embed = self.query_embed.weight[None].repeat(bs, 1, 1) + + tgt_mask = None + mask_dict = None + if self.dn != "no" and self.training: + assert targets is not None + input_query_label, input_query_bbox, tgt_mask, mask_dict = \ + self.prepare_for_dn(targets, None, None, x[0].shape[0]) + if mask_dict is not None: + tgt=torch.cat([input_query_label, tgt],dim=1) + + if self.initial_pred: + outputs_class, outputs_mask = self.forward_prediction_heads(tgt.transpose(0, 1), mask_features) + predictions_class.append(outputs_class) + predictions_mask.append(outputs_mask) + if self.dn != "no" and self.training and mask_dict is not None: + refpoint_embed=torch.cat([input_query_bbox,refpoint_embed],dim=1) + hs, references = self.decoder( + tgt=tgt.transpose(0, 1), + memory=src_flatten.transpose(0, 1), + memory_key_padding_mask=mask_flatten, + pos=None, + refpoints_unsigmoid=refpoint_embed.transpose(0, 1), + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + tgt_mask=tgt_mask + ) + for i, output in enumerate(hs): + outputs_class, outputs_mask = self.forward_prediction_heads(output.transpose(0, 1), mask_features) + predictions_class.append(outputs_class) + predictions_mask.append(outputs_mask) + + if self.initial_pred: + out_boxes=self.pred_box(references, hs, refpoint_embed.sigmoid()) + assert len(predictions_class) == self.num_layers + 1 + else: + out_boxes = self.pred_box(references, hs) + if mask_dict is not None: + predictions_mask=torch.stack(predictions_mask) + predictions_class=torch.stack(predictions_class) + predictions_class, out_boxes,predictions_mask=\ + self.dn_post_process(predictions_class,out_boxes,mask_dict,predictions_mask) + predictions_class,predictions_mask=list(predictions_class),list(predictions_mask) + elif self.training: + predictions_class[-1] += 0.0*self.label_enc.weight.sum() + out = { + 'pred_logits': predictions_class[-1], + 'pred_masks': predictions_mask[-1], + 'pred_boxes':out_boxes[-1], + 'aux_outputs': self._set_aux_loss( + predictions_class if self.mask_classification else None, predictions_mask,out_boxes + ) + } + if self.two_stage: + out['interm_outputs'] = interm_outputs + return out, mask_dict + + def forward_prediction_heads(self, output, mask_features): + decoder_output = self.decoder_norm(output) + decoder_output = decoder_output.transpose(0, 1) + outputs_class = self.class_embed(decoder_output) + mask_embed = self.mask_embed(decoder_output) + outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) + + return outputs_class, outputs_mask + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_seg_masks,out_boxes=None): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
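+ # each aux entry holds the logits/masks (and boxes, when provided) of one intermediate decoder layer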
+ # if self.mask_classification: + if out_boxes is None: + return [ + {"pred_logits": a, "pred_masks": b} + for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) + ] + else: + return [ + {"pred_logits": a, "pred_masks": b, "pred_boxes":c} + for a, b, c in zip(outputs_class[:-1], outputs_seg_masks[:-1],out_boxes[:-1]) + ] \ No newline at end of file diff --git a/projects/maskdino/modeling/transformer_decoder/utils.py b/projects/maskdino/modeling/transformer_decoder/utils.py new file mode 100644 index 00000000..302c049b --- /dev/null +++ b/projects/maskdino/modeling/transformer_decoder/utils.py @@ -0,0 +1,123 @@ +# import torch +# import copy +# from torch import nn, Tensor +# import os +# +# import math +# import torch.nn.functional as F +# from torch import nn +# +# +# class MLP(nn.Module): +# """ Very simple multi-layer perceptron (also called FFN)""" +# +# def __init__(self, input_dim, hidden_dim, output_dim, num_layers): +# super().__init__() +# self.num_layers = num_layers +# h = [hidden_dim] * (num_layers - 1) +# self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) +# +# def forward(self, x): +# for i, layer in enumerate(self.layers): +# x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) +# return x +# +# +# def inverse_sigmoid(x, eps=1e-5): +# x = x.clamp(min=0, max=1) +# x1 = x.clamp(min=eps) +# x2 = (1 - x).clamp(min=eps) +# return torch.log(x1/x2) +# +# +# def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor): +# """ +# Input: +# - memory: bs, \sum{hw}, d_model +# - memory_padding_mask: bs, \sum{hw} +# - spatial_shapes: nlevel, 2 +# Output: +# - output_memory: bs, \sum{hw}, d_model +# - output_proposals: bs, \sum{hw}, 4 +# """ +# N_, S_, C_ = memory.shape +# base_scale = 4.0 +# proposals = [] +# _cur = 0 +# for lvl, (H_, W_) in enumerate(spatial_shapes): +# mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) +# valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) +# valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) +# +# grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), +# torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) +# grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) +# +# scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) +# grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale +# wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) +# proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) +# proposals.append(proposal) +# _cur += (H_ * W_) +# output_proposals = torch.cat(proposals, 1) +# output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) +# output_proposals = torch.log(output_proposals / (1 - output_proposals)) +# output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) +# output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) +# +# output_memory = memory +# output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) +# output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) +# return output_memory, output_proposals +# +# +# def gen_sineembed_for_position(pos_tensor): +# # n_query, bs, _ = pos_tensor.size() +# # sineembed_tensor = torch.zeros(n_query, bs, 256) +# scale = 2 * math.pi +# dim_t = torch.arange(128, dtype=torch.float32, 
device=pos_tensor.device) +# dim_t = 10000 ** (2 * (dim_t // 2) / 128) +# x_embed = pos_tensor[:, :, 0] * scale +# y_embed = pos_tensor[:, :, 1] * scale +# pos_x = x_embed[:, :, None] / dim_t +# pos_y = y_embed[:, :, None] / dim_t +# pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) +# pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) +# if pos_tensor.size(-1) == 2: +# pos = torch.cat((pos_y, pos_x), dim=2) +# elif pos_tensor.size(-1) == 4: +# w_embed = pos_tensor[:, :, 2] * scale +# pos_w = w_embed[:, :, None] / dim_t +# pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) +# +# h_embed = pos_tensor[:, :, 3] * scale +# pos_h = h_embed[:, :, None] / dim_t +# pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) +# +# pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) +# else: +# raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) +# return pos +# +# +# def _get_activation_fn(activation): +# """Return an activation function given a string""" +# if activation == "relu": +# return F.relu +# if activation == "gelu": +# return F.gelu +# if activation == "glu": +# return F.glu +# if activation == "prelu": +# return nn.PReLU() +# if activation == "selu": +# return F.selu +# raise RuntimeError(F"activation should be relu/gelu, not {activation}.") +# +# +# def _get_clones(module, N, layer_share=False): +# # import ipdb; ipdb.set_trace() +# if layer_share: +# return nn.ModuleList([module for i in range(N)]) +# else: +# return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) \ No newline at end of file diff --git a/projects/maskdino/utils/__init__.py b/projects/maskdino/utils/__init__.py new file mode 100644 index 00000000..3a668d94 --- /dev/null +++ b/projects/maskdino/utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# import misc \ No newline at end of file diff --git a/projects/maskdino/utils/box_ops.py b/projects/maskdino/utils/box_ops.py new file mode 100644 index 00000000..3330d5ce --- /dev/null +++ b/projects/maskdino/utils/box_ops.py @@ -0,0 +1,137 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Utilities for bounding box manipulation and GIoU. 
+""" +import torch +from torchvision.ops.boxes import box_area + + +def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), + (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def box_xyxy_to_cxcywh(x): + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, + (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + # import ipdb; ipdb.set_trace() + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / (union + 1e-6) + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/ + + The boxes should be in [x0, y0, x1, y1] format + + Returns a [N, M] pairwise matrix, where N = len(boxes1) + and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / (area + 1e-6) + + + +# modified from torchvision to also return the union +def box_iou_pairwise(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] + rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] + + wh = (rb - lt).clamp(min=0) # [N,2] + inter = wh[:, 0] * wh[:, 1] # [N] + + union = area1 + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou_pairwise(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/ + + Input: + - boxes1, boxes2: N,4 + Output: + - giou: N, 4 + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + assert boxes1.shape == boxes2.shape + iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 + + lt = torch.min(boxes1[:, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,2] + area = wh[:, 0] * wh[:, 1] + + return iou - (area - union) / area + +def masks_to_boxes(masks): + """Compute the bounding boxes around the provided masks + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
+ + Returns a [N, 4] tensors, with the boxes in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + + y = torch.arange(0, h, dtype=torch.float, device=masks.device) + x = torch.arange(0, w, dtype=torch.float, device=masks.device) + y, x = torch.meshgrid(y, x) + + x_mask = (masks * x.unsqueeze(0)) + x_max = x_mask.flatten(1).max(-1)[0] + x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + y_mask = (masks * y.unsqueeze(0)) + y_max = y_mask.flatten(1).max(-1)[0] + y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + return torch.stack([x_min, y_min, x_max, y_max], 1) + +if __name__ == '__main__': + x = torch.rand(5, 4) + y = torch.rand(3, 4) + iou, union = box_iou(x, y) + import ipdb; ipdb.set_trace() \ No newline at end of file diff --git a/projects/maskdino/utils/misc.py b/projects/maskdino/utils/misc.py new file mode 100644 index 00000000..dae3c9eb --- /dev/null +++ b/projects/maskdino/utils/misc.py @@ -0,0 +1,135 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py +""" +Misc functions, including distributed helpers. + +Mostly copy-paste from torchvision references. +""" +from typing import List, Optional + +import torch +import torch.distributed as dist +import torchvision +from torch import Tensor + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("not supported") + return NestedTensor(tensor, mask) + + +# _onnx_nested_tensor_from_tensor_list() is an implementation of +# nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
+@torch.jit.unused +def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: + max_size = [] + for i in range(tensor_list[0].dim()): + max_size_i = torch.max( + torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) + ).to(torch.int64) + max_size.append(max_size_i) + max_size = tuple(max_size) + + # work around for + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # m[: img.shape[1], :img.shape[2]] = False + # which is not yet supported in onnx + padded_imgs = [] + padded_masks = [] + for img in tensor_list: + padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] + padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) + padded_imgs.append(padded_img) + + m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) + padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) + padded_masks.append(padded_mask.to(torch.bool)) + + tensor = torch.stack(padded_imgs) + mask = torch.stack(padded_masks) + + return NestedTensor(tensor, mask=mask) + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + +def masks_to_boxes(masks): + """Compute the bounding boxes around the provided masks + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + Returns a [N, 4] tensors, with the boxes in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + + y = torch.arange(0, h, dtype=torch.float) + x = torch.arange(0, w, dtype=torch.float) + y, x = torch.meshgrid(y, x) + + x_mask = masks * x.unsqueeze(0) + x_max = x_mask.flatten(1).max(-1)[0] + x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + y_mask = masks * y.unsqueeze(0) + y_max = y_mask.flatten(1).max(-1)[0] + y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + return torch.stack([x_min, y_min, x_max, y_max], 1) \ No newline at end of file diff --git a/projects/maskdino/utils/utils.py b/projects/maskdino/utils/utils.py new file mode 100644 index 00000000..4d2c4170 --- /dev/null +++ b/projects/maskdino/utils/utils.py @@ -0,0 +1,123 @@ +import torch +import copy +from torch import nn, Tensor +import os + +import math +import torch.nn.functional as F +from torch import nn + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1/x2) + + +def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor): + """ + Input: + - memory: bs, \sum{hw}, d_model + - memory_padding_mask: bs, \sum{hw} + - spatial_shapes: nlevel, 2 + Output: + - output_memory: bs, \sum{hw}, d_model + - output_proposals: bs, \sum{hw}, 4 + """ + N_, S_, C_ = memory.shape + base_scale = 4.0 + proposals = [] + _cur = 0 + for lvl, (H_, W_) in enumerate(spatial_shapes): + mask_flatten_ = 
memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), + torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) + proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) + proposals.append(proposal) + _cur += (H_ * W_) + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) + + output_memory = memory + output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) + return output_memory, output_proposals + + +def gen_sineembed_for_position(pos_tensor): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * torch.div(dim_t, 2,rounding_mode='trunc') / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) + return pos + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + if activation == "prelu": + return nn.PReLU() + if activation == "selu": + return F.selu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") + + +def _get_clones(module, N, layer_share=False): + # import ipdb; ipdb.set_trace() + if layer_share: + return nn.ModuleList([module for i in range(N)]) + else: + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) \ No newline at end of file diff --git a/setup.py b/setup.py index 2f2251ee..1674b359 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension # detrex version info -version = "0.2.0" +version = "0.2.1" package_name = 
"detrex" cwd = os.path.dirname(os.path.abspath(__file__))