-
Notifications
You must be signed in to change notification settings - Fork 10
/
model_architecture.py
417 lines (371 loc) · 17 KB
/
model_architecture.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2022-2023 Apple Inc. All Rights Reserved.
#
import torch
import torch.nn as nn
import torch.nn.functional as F
from easydict import EasyDict
from layers import PointConv, PointConvStridePE, PointConvTransposePE, PointTransformerLayer, PCFLayer, Linear_BN
def get_default_configs(cfg, num_level=5, base_dim=64):
    '''
    Get the default configurations for the model, returns the config EasyDict populated with the default values.

    `num_level` and `base_dim` are always written into `cfg`. Every other option is only
    filled in when `cfg` does not already contain that key, so values set by the caller
    before this call are preserved.
    Input:
        cfg: dict-like config object that also supports attribute assignment (the model
             builders in this file pass an EasyDict); it is modified in place
        num_level: number of downsampling stages
        base_dim: the dimensionality of the first stage
    Output:
        cfg: the same object that was passed in
    '''
    # Number of downsampling stages
    cfg.num_level = num_level
    # The dimensionality of the first stage
    cfg.base_dim = base_dim

    # Feature dimensionality for each stage. Handled outside the table below so the
    # list is only computed when the caller has not supplied one.
    if 'feat_dim' not in cfg:
        cfg.feat_dim = [base_dim * (i + 1) for i in range(cfg.num_level + 1)]

    # (key, default) pairs, applied in this order so the key order of a freshly
    # populated cfg stays stable.
    defaults = (
        # Whether to use the viewpoint-invariant coordinate transforms
        # (Xingyi Li et al. Improving the Robustness of Point Convolution on K-Nearest
        # Neighbor Neighborhoods with a Viewpoint-Invariant Coordinate Transform. WACV 2023)
        ('USE_VI', True),
        # Whether to concatenate positional encoding into features for VI_PointConv,
        # improves performance at the cost of more GPU memory
        ('USE_PE', False),
        # Transformer type: this can be either PCF or PointTransformer;
        # for now, if it's not set to 'PCF' then PointTransformer is used
        ('transformer_type', 'PCF'),
        # Attention type: 'subtraction': subtractive attention, anything else:
        # QK-style attention
        ('attention_type', 'subtraction'),
        # Whether to use a layer norm in the guidance computation
        ('layer_norm_guidance', False),
        # Drop path rate
        ('drop_path_rate', 0.),
        # Whether to use batch normalization
        ('BATCH_NORM', True),
        # Dropout rate
        ('dropout_rate', 0.),
        # Whether to time the individual components of the PointConv layers
        ('TIME', False),
        # Whether to use coordinates as features too
        ('USE_XYZ', True),
        # The dimensionality of the point cloud coordinates
        ('point_dim', 3),
        # mid_dim_back: c_mid in the decoder (see the paper)
        ('mid_dim_back', 1),
        # Whether to use PointConvs at the original input resolution, they are
        # helpful for performance esp. on 10cm and 5cm, but use more memory esp.
        # on 2cm
        ('use_level_1', True),
        # Whether to use the CUDA kernels for PointConv and PointConvFormer which fuse
        # gather operations and save a few milliseconds and some memory.
        # Right now, using CUDA kernels to train mysteriously reduces training
        # accuracy, hence it is only recommended to be used during testing time
        ('USE_CUDA_KERNEL', False),
    )
    for key, value in defaults:
        if key not in cfg:
            setattr(cfg, key, value)
    return cfg
class PCF_Backbone(nn.Module):
    '''
    The backbone-only part of PCF
    input is from the data loader (features, point cloud coordinates, edges_self, edges_forward, norms)
    Input:
        features: 1 x num_points x num_channels: features of all point clouds in the packed format
        pointclouds: a list of [1 x num_points[i] x 3]: coordinates of all point clouds at different subsampling levels in the packed format
        edges_self: a list of [1 x num_points[i] x K_self]: Neighbor indices between points at the same subsampling level
        edges_forward: a list of [1 x num_points[i+1] x K_forward]: Neighbor indices between points at one subsampling level and points at the next subsampling level (more sparse)
        norms: a list indexed by subsampling level, handed to the PointConv / PCF layers together with the
               coordinates of that level (presumably per-point surface normals - TODO confirm against the data loader)
    Output:
        feat_list: a list of [1 x num_points[i] x out_channels[i]], output
                   features at each subsampling level
    '''

    def __init__(self, cfg, input_feat_dim=3):
        super(PCF_Backbone, self).__init__()

        self.cfg = cfg
        self.total_level = cfg.num_level
        self.guided_level = cfg.guided_level
        # When coordinates are used as features, forward() concatenates them onto the input
        self.input_feat_dim = input_feat_dim + 3 if cfg.USE_XYZ else input_feat_dim
        # Not referenced by the code in this class; kept so the module's attributes are unchanged
        self.relu = torch.nn.ReLU(inplace=True)

        # The weightnet input is 9 channels wider when the VI transform is enabled
        if cfg.USE_VI is True:
            weightnet_input_dim = cfg.point_dim + 9
        else:
            weightnet_input_dim = cfg.point_dim
        weightnet_start = [weightnet_input_dim, cfg.mid_dim[0]]

        if cfg.use_level_1:
            # Convolutions at the original input resolution
            self.selfpointconv = PointConv(
                self.input_feat_dim, cfg.base_dim, cfg, weightnet_start)
            self.selfpointconv_res1 = PointConvStridePE(
                cfg.base_dim, cfg.base_dim, cfg, weightnet_start)
            self.selfpointconv_res2 = PointConvStridePE(
                cfg.base_dim, cfg.base_dim, cfg, weightnet_start)
        else:
            # Otherwise only a linear layer lifts the input to base_dim
            self.selfmlp = Linear_BN(
                self.input_feat_dim, cfg.base_dim, bn_ver='1d')

        # pointconv[k] maps level k to level k + 1; pointconv_res[k] holds the
        # residual blocks applied afterwards at level k + 1
        self.pointconv = nn.ModuleList()
        self.pointconv_res = nn.ModuleList()

        for i in range(1, self.total_level):
            in_ch = cfg.feat_dim[i - 1]
            out_ch = cfg.feat_dim[i]
            weightnet = [weightnet_input_dim, cfg.mid_dim[i]]

            # Downsampling layer first, then the residual blocks, so submodules are
            # constructed in the same order as before
            self.pointconv.append(
                self._build_layer(i, in_ch, out_ch, weightnet))
            self.pointconv_res.append(nn.ModuleList(
                [self._build_layer(i, out_ch, out_ch, weightnet)
                 for _ in range(cfg.resblocks[i])]))

    def _build_layer(self, level, in_ch, out_ch, weightnet):
        '''
        Create one layer for the given level: a PointConvStridePE up to and including
        guided_level, otherwise a PCFLayer or PointTransformerLayer depending on
        cfg.transformer_type.
        '''
        cfg = self.cfg
        if level <= self.guided_level:
            return PointConvStridePE(in_ch, out_ch, cfg, weightnet)
        if cfg.transformer_type == 'PCF':
            return PCFLayer(in_ch, out_ch, cfg, weightnet, cfg.num_heads)
        return PointTransformerLayer(in_ch, out_ch, cfg.num_heads)

    def forward(self, features, pointclouds, edges_self, edges_forward, norms):
        '''
        Run the encoder and return the list of features at every subsampling level
        (index 0 is the input resolution).
        '''
        cfg = self.cfg
        use_pcf = cfg.transformer_type == 'PCF'

        # encode pointwise info
        if cfg.USE_XYZ:
            pointwise_feat = torch.cat([features, pointclouds[0]], -1)
        else:
            pointwise_feat = features

        # level 1 conv, this helps performance significantly on 5cm/10cm inputs
        # but have relatively small use on 2cm
        if cfg.use_level_1:
            pointwise_feat, vi_features = self.selfpointconv(
                pointclouds[0], pointwise_feat, edges_self[0], norms[0])
            # Both residual blocks reuse the VI features returned by the first conv
            pointwise_feat, _ = self.selfpointconv_res1(
                pointclouds[0], pointwise_feat, edges_self[0], norms[0],
                vi_features=vi_features)
            pointwise_feat, _ = self.selfpointconv_res2(
                pointclouds[0], pointwise_feat, edges_self[0], norms[0],
                vi_features=vi_features)
        else:
            # if don't use level 1 convs, then just simply do a linear layer to
            # increase the feature dimensionality
            pointwise_feat = F.relu(self.selfmlp(pointwise_feat), inplace=True)

        feat_list = [pointwise_feat]

        # NOTE(review): the call signature is selected from cfg.transformer_type only, while
        # __init__ builds PointConvStridePE layers for levels <= guided_level regardless of the
        # transformer type. With guided_level > 0 and a non-'PCF' transformer_type those layers
        # would be called with the PointTransformer-style arguments - confirm against layers.py.
        for i, pointconv in enumerate(self.pointconv):
            if use_pcf:
                sparse_feat, _ = pointconv(
                    pointclouds[i], feat_list[-1], edges_forward[i], norms[i],
                    pointclouds[i + 1], norms[i + 1])
            else:
                sparse_feat = pointconv(
                    pointclouds[i], feat_list[-1], edges_forward[i], pointclouds[i + 1])

            # There is the need to recompute VI features from the neighbors at this level
            # rather than from the previous level, hence need to recompute VI features in
            # the first residual block
            vi_features = None
            for res_block in self.pointconv_res[i]:
                if not use_pcf:
                    sparse_feat = res_block(
                        pointclouds[i + 1], sparse_feat, edges_self[i + 1])
                elif vi_features is None:
                    sparse_feat, vi_features = res_block(
                        pointclouds[i + 1], sparse_feat, edges_self[i + 1], norms[i + 1])
                else:
                    sparse_feat, _ = res_block(
                        pointclouds[i + 1], sparse_feat, edges_self[i + 1], norms[i + 1],
                        vi_features=vi_features)

            feat_list.append(sparse_feat)

        return feat_list
def PCF_Tiny(input_grid_size, base_dim=64):
    ''' Build the tiny PCF variant: 5 levels, a single attention head and one residual block per level after the first.
    Input:
        input_grid_size: the voxel size at the finest resolution, e.g. 0.02 for 2cm, 0.05 for 5cm, 0.1 for 10cm
        base_dim: the feature dimensionality of the first stage
    Output:
        PCF_Backbone: A PCF backbone model
        cfg: The configuration options initialized for the PCF backbone model
    '''
    cfg = get_default_configs(EasyDict(), num_level=5, base_dim=base_dim)
    cfg.guided_level = 0
    cfg.num_heads = 1
    # Number of residual blocks per level
    cfg.resblocks = [0, 1, 1, 1, 1]
    # Per-level second entry of the weightnet spec built in PCF_Backbone
    cfg.mid_dim = [4] * 5
    # Voxel size per level: doubles at every subsampling level
    coarser = [input_grid_size * factor for factor in (2, 4, 8, 16)]
    cfg.grid_size = [input_grid_size] + coarser
    backbone = PCF_Backbone(cfg)
    return backbone, cfg
# A small model
def PCF_Small(input_grid_size, base_dim=64):
    ''' Build the small PCF variant: 5 levels, 8 attention heads and two residual blocks per level after the first.
    Input:
        input_grid_size: the voxel size at the finest resolution, e.g. 0.02 for 2cm, 0.05 for 5cm, 0.1 for 10cm
        base_dim: the feature dimensionality of the first stage
    Output:
        PCF_Backbone: A PCF backbone model
        cfg: The configuration options initialized for the PCF backbone model
    '''
    cfg = get_default_configs(EasyDict(), num_level=5, base_dim=base_dim)
    cfg.guided_level = 0
    cfg.num_heads = 8
    # Number of residual blocks per level
    cfg.resblocks = [0, 2, 2, 2, 2]
    # Per-level second entry of the weightnet spec built in PCF_Backbone
    cfg.mid_dim = [4] * 5
    # Voxel size per level: doubles at every subsampling level
    coarser = [input_grid_size * factor for factor in (2, 4, 8, 16)]
    cfg.grid_size = [input_grid_size] + coarser
    backbone = PCF_Backbone(cfg)
    return backbone, cfg
# A normal sized model
def PCF_Normal(input_grid_size, base_dim=64):
    ''' Build the normal-sized PCF variant: 5 levels, 8 attention heads and a deeper stack of residual blocks at the coarser levels.
    Input:
        input_grid_size: the voxel size at the finest resolution, e.g. 0.02 for 2cm, 0.05 for 5cm, 0.1 for 10cm
        base_dim: the feature dimensionality of the first stage
    Output:
        PCF_Backbone: A PCF backbone model
        cfg: The configuration options initialized for the PCF backbone model
    '''
    cfg = get_default_configs(EasyDict(), num_level=5, base_dim=base_dim)
    cfg.guided_level = 0
    cfg.num_heads = 8
    # Number of residual blocks per level
    cfg.resblocks = [0, 2, 4, 6, 6]
    # Voxel size per level: doubles at every subsampling level
    coarser = [input_grid_size * factor for factor in (2, 4, 8, 16)]
    cfg.grid_size = [input_grid_size] + coarser
    # Per-level second entry of the weightnet spec built in PCF_Backbone
    cfg.mid_dim = [16] * 5
    backbone = PCF_Backbone(cfg)
    return backbone, cfg
def PCF_Large(input_grid_size, base_dim=64):
    ''' Build the large PCF variant: 6 levels, 8 attention heads, with a 2.5x step to the second level and 2x steps after that.
    Input:
        input_grid_size: the voxel size at the finest resolution, e.g. 0.02 for 2cm, 0.05 for 5cm, 0.1 for 10cm
        base_dim: the feature dimensionality of the first stage
    Output:
        PCF_Backbone: A PCF backbone model
        cfg: The configuration options initialized for the PCF backbone model
    '''
    cfg = get_default_configs(EasyDict(), num_level=6, base_dim=base_dim)
    cfg.guided_level = 0
    cfg.num_heads = 8
    # Number of residual blocks per level
    cfg.resblocks = [0, 2, 4, 6, 6, 2]
    # Voxel size per level
    coarser = [input_grid_size * factor for factor in (2.5, 5, 10, 20, 40)]
    cfg.grid_size = [input_grid_size] + coarser
    # Per-level second entry of the weightnet spec built in PCF_Backbone
    cfg.mid_dim = [16] * 6
    backbone = PCF_Backbone(cfg)
    return backbone, cfg
class PointConvFormer_Segmentation(nn.Module):
    '''
    The full segmentation network: a PCF_Backbone encoder followed by a decoder that
    propagates features back to the input resolution and a pointwise classifier.
    input is from the data loader (features, point cloud coordinates, edges_self, edges_forward, edges_propagate, norms)
    Input:
        features: 1 x num_points x num_channels: features of all point clouds in the packed format
        pointclouds: a list of [1 x num_points[i] x 3]: coordinates of all point clouds at different
                     subsampling levels in the packed format
        edges_self: a list of [1 x num_points[i] x K_self]: Neighbor indices between points at the
                    same subsampling level
        edges_forward: a list of [1 x num_points[i+1] x K_forward]: Neighbor indices between points at
                       one subsampling level and points at the next subsampling level (more sparse)
        edges_propagate: a list of [1 x num_points[i] x K_propagate]: Neighbor indices between points
                         at one subsampling level and points at the previous subsampling level (more dense)
        norms: a list indexed by subsampling level, handed to the layers together with the coordinates
               of that level (presumably per-point surface normals - TODO confirm against the data loader)
    Output:
        fc: 1 x num_points x c, output class label for each point
    '''

    def __init__(self, cfg):
        super(PointConvFormer_Segmentation, self).__init__()

        self.cfg = cfg
        self.total_level = cfg.num_level
        self.pcf_backbone = PCF_Backbone(cfg)

        # The weightnet input is 9 channels wider when the VI transform is enabled
        weightnet_in = cfg.point_dim + 9 if cfg.USE_VI is True else cfg.point_dim
        weightnet = [weightnet_in, cfg.mid_dim_back]

        # Decoder stages are stored coarsest-first: entry k upsamples from level
        # total_level - 1 - k to level total_level - 2 - k
        self.pointdeconv = nn.ModuleList()
        self.pointdeconv_res = nn.ModuleList()

        for level in reversed(range(self.total_level - 1)):
            in_ch = cfg.feat_dim[level + 1]
            out_ch = cfg.base_dim if level == 0 else cfg.feat_dim[level]

            self.pointdeconv.append(
                PointConvTransposePE(
                    in_ch, out_ch, cfg, weightnet, [out_ch, out_ch]))

            # NOTE(review): the gate reads cfg.resblocks (the encoder's list) while the
            # count comes from cfg.resblocks_back - looks like the gate may have been
            # meant to use resblocks_back as well; kept as is, confirm before changing
            # since it affects which submodules exist.
            stage_blocks = nn.ModuleList()
            if cfg.resblocks[level] != 0:
                for _ in range(cfg.resblocks_back[level]):
                    stage_blocks.append(
                        PointConvStridePE(out_ch, out_ch, cfg, weightnet))
            self.pointdeconv_res.append(stage_blocks)

        # pointwise_decode
        self.fc1 = Linear_BN(cfg.base_dim, cfg.base_dim, bn_ver='1d')
        if cfg.dropout_fc > 0.:
            self.dropout_fc = torch.nn.Dropout(p=cfg.dropout_fc)
        else:
            self.dropout_fc = nn.Identity()
        self.fc2 = nn.Linear(cfg.base_dim, cfg.num_classes)

    def forward(
            self,
            features,
            pointclouds,
            edges_self,
            edges_forward,
            edges_propagate,
            norms):
        '''
        Encode with the backbone, decode back to the input resolution and return the
        output of the final linear layer.
        '''
        feat_list = self.pcf_backbone(
            features, pointclouds, edges_self, edges_forward, norms)

        # Start from the coarsest level and walk back towards level 0
        sparse_feat = feat_list[-1]
        for stage, pointdeconv in enumerate(self.pointdeconv):
            cur_level = self.total_level - 2 - stage
            coarse_level = cur_level + 1

            # The encoder features of the target level are passed in as the last argument
            sparse_feat, _ = pointdeconv(
                pointclouds[coarse_level],
                sparse_feat,
                edges_propagate[cur_level],
                norms[coarse_level],
                pointclouds[cur_level],
                norms[cur_level],
                feat_list[cur_level])

            # The first residual block at this level computes the VI features,
            # later blocks reuse them
            vi_features = None
            for res_block in self.pointdeconv_res[stage]:
                if vi_features is None:
                    sparse_feat, vi_features = res_block(
                        pointclouds[cur_level], sparse_feat,
                        edges_self[cur_level], norms[cur_level])
                else:
                    sparse_feat, _ = res_block(
                        pointclouds[cur_level], sparse_feat,
                        edges_self[cur_level], norms[cur_level],
                        vi_features=vi_features)

            # Overwrite the encoder features of this level with the decoded ones
            feat_list[cur_level] = sparse_feat

        hidden = self.dropout_fc(F.relu(self.fc1(sparse_feat)))
        return self.fc2(hidden)