-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsalience_detr_convnext_l_800_1333.py
110 lines (100 loc) · 3.55 KB
/
salience_detr_convnext_l_800_1333.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from torch import nn
from models.backbones.convnext import ConvNeXtBackbone
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(embed_dim // 2, temperature=10000, normalize=True, offset=-0.5)
backbone = ConvNeXtBackbone("conv_l", return_indices=(1, 2, 3), freeze_indices=(0,))
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)