From 12256afd2f11add91943d5eb81c0e00c73345b06 Mon Sep 17 00:00:00 2001 From: ntianhe ren Date: Sat, 3 Dec 2022 11:08:03 +0800 Subject: [PATCH 1/6] refine dino model zoo --- projects/dino/README.md | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/projects/dino/README.md b/projects/dino/README.md index ada2a1b3..6b2a7122 100644 --- a/projects/dino/README.md +++ b/projects/dino/README.md @@ -119,10 +119,48 @@ Here we provide the pretrained `DINO` weights based on detrex. 100 58.1 model + + DINO-Focal-Large-4scale +FocalNet-384-LRF-3Level +IN22k +12 +100 +57.5 + model + + DINO-Focal-Large-4scale +FocalNet-384-LRF-3Level +IN22k +36 +100 + + model + + DINO-Focal-Large-4scale +FocalNet-384-LRF-4Level +IN22k +12 +100 + + model + + DINO-Focal-Large-4scale +FocalNet-384-LRF-4Level +IN22k +36 +100 + + model -**Note**: `Swin-X-384` means the backbone pretrained resolution is `384 x 384` and `IN22k to In1k` means the model is pretrained on `ImageNet-22k` and finetuned on `ImageNet-1k`. +**Note**: +- `Swin-X-384` means the backbone pretrained resolution is `384 x 384` and `IN22k to In1k` means the model is pretrained on `ImageNet-22k` and finetuned on `ImageNet-1k`. +- `Focal-LRF-3Level`: means using `Large-Receptive-Field` and `Focal-Level` is setted to `3`, please refer to [FocalNet](https://github.com/microsoft/FocalNet) for more details about the backbone settings. **Notable facts and caveats**: The position embedding of DINO in detrex is different from the original repo. We set the tempureture and offsets in `PositionEmbeddingSine` to `10000` and `-0.5` which may make the model converge a little bit faster in the early stage and get a slightly better results (about 0.1mAP) in 12 epochs settings. From 1c3f43da3ee54a9886a3db0313c2d6129f8929da Mon Sep 17 00:00:00 2001 From: ntianhe ren Date: Sat, 3 Dec 2022 13:03:36 +0800 Subject: [PATCH 2/6] add dino-focal-36ep configs --- .../dino_focalnet_large_lrf_384_4scale_36ep.py | 16 ++++++++++++++++ ...ino_focalnet_large_lrf_384_fl4_4scale_36ep.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 projects/dino/configs/dino_focalnet_large_lrf_384_4scale_36ep.py create mode 100644 projects/dino/configs/dino_focalnet_large_lrf_384_fl4_4scale_36ep.py diff --git a/projects/dino/configs/dino_focalnet_large_lrf_384_4scale_36ep.py b/projects/dino/configs/dino_focalnet_large_lrf_384_4scale_36ep.py new file mode 100644 index 00000000..9cbd9cc5 --- /dev/null +++ b/projects/dino/configs/dino_focalnet_large_lrf_384_4scale_36ep.py @@ -0,0 +1,16 @@ +from detrex.config import get_config + +from .dino_focalnet_large_lrf_384_4scale_12ep import ( + train, + dataloader, + optimizer, + model, +) + +# using 36ep scheduler +lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_36ep + +# modify training config +train.max_iter = 270000 +train.init_checkpoint = "/path/to/focalnet_large_lrf_384.pth" +train.output_dir = "./output/dino_focalnet_large_4scale_36ep" diff --git a/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_4scale_36ep.py b/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_4scale_36ep.py new file mode 100644 index 00000000..e1c6a902 --- /dev/null +++ b/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_4scale_36ep.py @@ -0,0 +1,16 @@ +from detrex.config import get_config + +from .dino_focalnet_large_lrf_384_fl4_4scale_12ep import ( + train, + dataloader, + optimizer, + model, +) + +# using 36ep scheduler +lr_multiplier = 
get_config("common/coco_schedule.py").lr_multiplier_36ep + +# modify training config +train.max_iter = 270000 +train.init_checkpoint = "/path/to/focalnet_large_lrf_384_fl4.pth" +train.output_dir = "./output/dino_focalnet_large_fl4_4scale_36ep" From b05c9add6074e9878e8917481da87304639c1af5 Mon Sep 17 00:00:00 2001 From: ntianhe ren Date: Sun, 4 Dec 2022 10:27:41 +0800 Subject: [PATCH 3/6] refine modelzoo --- projects/dino/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/projects/dino/README.md b/projects/dino/README.md index cbaaa0cc..578c6528 100644 --- a/projects/dino/README.md +++ b/projects/dino/README.md @@ -150,6 +150,15 @@ Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M. Ni, Heun 100 58.1 model + + DINO-Swin-L-384-5scale +Swin-Large-384 +IN22k to IN1k +36 +100 +58.5 + model From 277f1f053fa0e81e3efc4a9c4d3c289de2be5706 Mon Sep 17 00:00:00 2001 From: ntianhe ren Date: Sun, 4 Dec 2022 12:30:25 +0800 Subject: [PATCH 4/6] refine download links --- projects/dino/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/dino/README.md b/projects/dino/README.md index 578c6528..d1dbbf7c 100644 --- a/projects/dino/README.md +++ b/projects/dino/README.md @@ -178,7 +178,7 @@ Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M. Ni, Heun 12 100 57.5 - model + model DINO-Focal-Large-4scale @@ -196,7 +196,7 @@ Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M. Ni, Heun 12 100 58.0 - model + model DINO-Focal-Large-4scale @@ -258,8 +258,8 @@ Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M. Ni, Heun **Note**: - `Swin-X-384` means the backbone pretrained resolution is `384 x 384` and `IN22k to In1k` means the model is pretrained on `ImageNet-22k` and finetuned on `ImageNet-1k`. -- ViT backbone using MAE pretraining weights following [ViTDet](https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet) which can be downloaded in [MAE](https://github.com/facebookresearch/mae). -- `Focal-LRF-3Level`: means using `Large-Receptive-Field` and `Focal-Level` is setted to `3`, please refer to [FocalNet](https://github.com/microsoft/FocalNet) for more details about the backbone settings. +- ViT backbone using MAE pretraining weights following [ViTDet](https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet) which can be downloaded in [MAE](https://github.com/facebookresearch/mae). And it's not stable to train ViTDet-DINO without warmup lr-scheduler. +- `Focal-LRF-3Level`: means using `Large-Receptive-Field (LRF)` and `Focal-Level` is setted to `3`, please refer to [FocalNet](https://github.com/microsoft/FocalNet) for more details about the backbone settings. **Notable facts and caveats**: The position embedding of DINO in detrex is different from the original repo. We set the tempureture and offsets in `PositionEmbeddingSine` to `10000` and `-0.5` which may make the model converge a little bit faster in the early stage and get a slightly better results (about 0.1mAP) in 12 epochs settings. 
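Since the note above only quotes the two values, here is a minimal sketch of what that position-embedding setting looks like as a detrex LazyConfig override. The argument names (`num_pos_feats`, `temperature`, `normalize`, `offset`) and the import path are assumptions based on detrex's `PositionEmbeddingSine`; verify them against the version you are running.

```python
# Hedged sketch of the position-embedding setting described in the note above.
# Assumes this lives in a detrex LazyConfig that already imports `model` from a
# base DINO config, exactly as the configs in this patch series do.
from detectron2.config import LazyCall as L
from detrex.layers import PositionEmbeddingSine  # import path assumed

model.position_embedding = L(PositionEmbeddingSine)(
    num_pos_feats=128,   # half of the 256-d transformer hidden dim
    temperature=10000,   # temperature quoted in the note
    normalize=True,
    offset=-0.5,         # offset quoted in the note
)
```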
From f68fe285a680db28ec24ed05bd0a92df64de7202 Mon Sep 17 00:00:00 2001 From: ntianhe ren Date: Sun, 4 Dec 2022 12:49:33 +0800 Subject: [PATCH 5/6] add 5-scale training configs --- ..._focalnet_large_lrf_384_fl4_5scale_12ep.py | 30 +++++++++++++++++++ ..._focalnet_large_lrf_384_fl4_5scale_36ep.py | 16 ++++++++++ 2 files changed, 46 insertions(+) create mode 100644 projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_12ep.py create mode 100644 projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_36ep.py diff --git a/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_12ep.py b/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_12ep.py new file mode 100644 index 00000000..bcf7bfdf --- /dev/null +++ b/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_12ep.py @@ -0,0 +1,30 @@ +from .dino_focalnet_large_lrf_384_4scale_12ep import ( + train, + dataloader, + optimizer, + lr_multiplier, + model, +) + +from detectron2.layers import ShapeSpec + + +# modify training config +train.init_checkpoint = "/path/to/focalnet_large_lrf_384_fl4.pth" +train.output_dir = "./output/dino_focalnet_large_fl4_5scale_12ep" + +# convert to 4 focal-level +model.backbone.focal_levels = (4, 4, 4, 4) +model.backbone.focal_windows = (3, 3, 3, 3) + +# convert to 5 scale output features +model.backbone.out_indices = (0, 1, 2, 3) +model.neck.input_shapes = { + "p0": ShapeSpec(channels=192), + "p1": ShapeSpec(channels=384), + "p2": ShapeSpec(channels=768), + "p3": ShapeSpec(channels=1536), +} +model.neck.in_features = ["p0", "p1", "p2", "p3"] +model.neck.num_outs = 5 +model.transformer.num_feature_levels = 5 diff --git a/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_36ep.py b/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_36ep.py new file mode 100644 index 00000000..55475955 --- /dev/null +++ b/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_36ep.py @@ -0,0 +1,16 @@ +from detrex.config import get_config + +from .dino_focalnet_large_lrf_384_fl4_5scale_12ep import ( + train, + dataloader, + optimizer, + model, +) + +# using 36ep scheduler +lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_36ep + +# modify training config +train.max_iter = 270000 +train.init_checkpoint = "/path/to/focalnet_large_lrf_384_fl4.pth" +train.output_dir = "./output/dino_focalnet_large_fl4_5scale_36ep" \ No newline at end of file From 12c3487e2309c0eb92b9c37b8d426ebd1cf3974f Mon Sep 17 00:00:00 2001 From: ntianhe ren Date: Mon, 5 Dec 2022 23:26:02 +0800 Subject: [PATCH 6/6] update download links --- .../tutorials/Download_Pretrained_Weights.md | 71 +++++++++++++++++++ docs/source/tutorials/Model_Zoo.md | 16 +++++ projects/dino/README.md | 20 +----- ...dino_focalnet_large_lrf_384_4scale_36ep.py | 16 ----- ..._focalnet_large_lrf_384_fl4_4scale_36ep.py | 16 ----- 5 files changed, 88 insertions(+), 51 deletions(-) delete mode 100644 projects/dino/configs/dino_focalnet_large_lrf_384_4scale_36ep.py delete mode 100644 projects/dino/configs/dino_focalnet_large_lrf_384_fl4_4scale_36ep.py diff --git a/docs/source/tutorials/Download_Pretrained_Weights.md b/docs/source/tutorials/Download_Pretrained_Weights.md index 1f2e7873..02ab0d46 100644 --- a/docs/source/tutorials/Download_Pretrained_Weights.md +++ b/docs/source/tutorials/Download_Pretrained_Weights.md @@ -361,3 +361,74 @@ train.init_checkpoint = "/path/to/mae_pretrain_vit_base.pth" Please refer to [DINO](https://github.com/IDEA-Research/detrex/tree/main/projects/dino) project for more details about the usage 
of the ViT backbone.
+
+## FocalNet
+Here we borrow the download links from the [official implementation](https://github.com/microsoft/FocalNet#imagenet-22k-pretrained) of FocalNet.
+
+| Model | Depth | Dim | Kernels | #Params. (M) | Download |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| FocalNet-L | [2, 2, 18, 2] | 192 | [5, 7, 9] | 207 | download |
+| FocalNet-L | [2, 2, 18, 2] | 192 | [3, 5, 7, 9] | 207 | download |
+| FocalNet-XL | [2, 2, 18, 2] | 256 | [5, 7, 9] | 366 | download |
+| FocalNet-XL | [2, 2, 18, 2] | 256 | [3, 5, 7, 9] | 207 | download |
+| FocalNet-H | [2, 2, 18, 2] | 352 | [5, 7, 9] | 687 | download |
+| FocalNet-H | [2, 2, 18, 2] | 352 | [3, 5, 7, 9] | 687 | download |
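The DINO-FocalNet configs added in this series consume these checkpoints by pointing `train.init_checkpoint` at the downloaded file, as the 12-epoch configs above already do. A minimal sketch, with a placeholder path:

```python
# Sketch: reuse a locally downloaded FocalNet checkpoint from the table above.
# Assumes a detrex LazyConfig where `train` is already imported from a base
# config; the path is a placeholder, substitute wherever you saved the file.
train.init_checkpoint = "/path/to/focalnet_large_lrf_384.pth"
```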
+ Using FocalNet Backbone in Config + +```python +# focalnet-large-4scale baseline +model.backbone = L(FocalNet)( + embed_dim=192, + depths=(2, 2, 18, 2), + focal_levels=(3, 3, 3, 3), + focal_windows=(5, 5, 5, 5), + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + out_indices=(1, 2, 3), +) +``` +
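The block above is the 3-level `[5, 7, 9]` baseline. For the 4-level `[3, 5, 7, 9]` checkpoints, the `*_fl4_*` configs in this series reuse the same backbone block and adjust the focal levels and windows (the 5-scale variant additionally changes `out_indices` and the neck, as shown in patch 5). The override below mirrors the `dino_focalnet_large_lrf_384_fl4_5scale_12ep.py` config added earlier in this series.

```python
# Switch the FocalNet backbone above to the 4 focal-level (fl4) variant,
# matching the *_fl4_* configs in this patch series.
model.backbone.focal_levels = (4, 4, 4, 4)
model.backbone.focal_windows = (3, 3, 3, 3)
```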
diff --git a/docs/source/tutorials/Model_Zoo.md b/docs/source/tutorials/Model_Zoo.md index 1e176ece..399ecc87 100644 --- a/docs/source/tutorials/Model_Zoo.md +++ b/docs/source/tutorials/Model_Zoo.md @@ -249,6 +249,22 @@ Here we provides our pretrained baselines with **detrex**. And more pretrained w 100 58.5 model + DINO-ViTDet-Large-4scale +ViT +IN1k, MAE +50 +100 +57.5 + model + + DINO-ViTDet-Large-4scale +ViT +IN1k, MAE +50 +100 +58.0 + model + DINO-ViTDet-Base-4scale ViT diff --git a/projects/dino/README.md b/projects/dino/README.md index d1dbbf7c..8749c629 100644 --- a/projects/dino/README.md +++ b/projects/dino/README.md @@ -133,7 +133,7 @@ Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M. Ni, Heun 56.9 model - DINO-Swin-L-384-4scale + DINO-Swin-L-384-5scale Swin-Large-384 IN22k to IN1k 12 @@ -179,15 +179,6 @@ Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M. Ni, Heun 100 57.5 model - - DINO-Focal-Large-4scale -FocalNet-384-LRF-3Level -IN22k -36 -100 - - model DINO-Focal-Large-4scale @@ -197,15 +188,6 @@ Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M. Ni, Heun 100 58.0 model - - DINO-Focal-Large-4scale -FocalNet-384-LRF-4Level -IN22k -36 -100 - - model diff --git a/projects/dino/configs/dino_focalnet_large_lrf_384_4scale_36ep.py b/projects/dino/configs/dino_focalnet_large_lrf_384_4scale_36ep.py deleted file mode 100644 index 9cbd9cc5..00000000 --- a/projects/dino/configs/dino_focalnet_large_lrf_384_4scale_36ep.py +++ /dev/null @@ -1,16 +0,0 @@ -from detrex.config import get_config - -from .dino_focalnet_large_lrf_384_4scale_12ep import ( - train, - dataloader, - optimizer, - model, -) - -# using 36ep scheduler -lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_36ep - -# modify training config -train.max_iter = 270000 -train.init_checkpoint = "/path/to/focalnet_large_lrf_384.pth" -train.output_dir = "./output/dino_focalnet_large_4scale_36ep" diff --git a/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_4scale_36ep.py b/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_4scale_36ep.py deleted file mode 100644 index e1c6a902..00000000 --- a/projects/dino/configs/dino_focalnet_large_lrf_384_fl4_4scale_36ep.py +++ /dev/null @@ -1,16 +0,0 @@ -from detrex.config import get_config - -from .dino_focalnet_large_lrf_384_fl4_4scale_12ep import ( - train, - dataloader, - optimizer, - model, -) - -# using 36ep scheduler -lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_36ep - -# modify training config -train.max_iter = 270000 -train.init_checkpoint = "/path/to/focalnet_large_lrf_384_fl4.pth" -train.output_dir = "./output/dino_focalnet_large_fl4_4scale_36ep"
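As a closing pointer, the configs that remain after this series can be built and initialized with detectron2's LazyConfig utilities, which detrex builds on. The sketch below assumes the standard `LazyConfig.load` / `instantiate` / `DetectionCheckpointer` workflow; the checkpoint path is a placeholder, not a released file name.

```python
# Hedged sketch: build DINO with the FocalNet fl4 5-scale config added in this
# series and load a downloaded checkpoint for evaluation. Paths are placeholders.
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import LazyConfig, instantiate

cfg = LazyConfig.load(
    "projects/dino/configs/dino_focalnet_large_lrf_384_fl4_5scale_12ep.py"
)
model = instantiate(cfg.model)  # build the detector described by the LazyConfig
DetectionCheckpointer(model).load("/path/to/dino_focalnet_checkpoint.pth")
model.eval()
```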