# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) 2020, Zhiqiang Wang. All Rights Reserved.
import math
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F
import torchvision
from torch import nn, Tensor
from torchvision.ops import box_convert


class NestedTensor:
    """
    Structure that holds a list of images (of possibly
    varying sizes) as a single tensor.
    This works by padding the images to the same size and
    storing the original size of each image in a field.
    """

    def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]):
        """
        Args:
            tensors (Tensor): the padded and batched images
            image_sizes (list[tuple[int, int]]): the original (unpadded) size of each image
        """
        self.tensors = tensors
        self.image_sizes = image_sizes

    def to(self, device) -> "NestedTensor":
        cast_tensor = self.tensors.to(device)
        return NestedTensor(cast_tensor, self.image_sizes)

    def __repr__(self):
        return str(self.tensors)
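
# A small usage sketch: the batched tensor is typically produced by
# nested_tensor_from_tensor_list (defined below), while image_sizes keeps the
# original (unpadded) height / width of every image. The concrete shapes are
# illustrative only.
#
#   >>> batched = torch.zeros(2, 3, 512, 640)
#   >>> samples = NestedTensor(batched, [(480, 640), (512, 512)])
#   >>> samples = samples.to("cpu")
#   >>> samples.tensors.shape
#   torch.Size([2, 3, 512, 640])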

class YOLOTransform(nn.Module):
    """
    Performs input / target transformation before feeding the data to a YOLO model.

    The transformations it performs are:
        - input / target resizing to match min_size / max_size

    It returns a NestedTensor for the inputs, and a batched targets Tensor of shape
    (num_objects, 6), where each row is (image_index, label, cx, cy, w, h) with
    normalized box coordinates.
    """

    def __init__(
        self,
        min_size: int,
        max_size: int,
        fixed_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Note: When ``fixed_size`` is set, ``min_size`` and ``max_size`` won't take effect.
        """
        super().__init__()
        if not isinstance(min_size, (list, tuple)):
            min_size = (min_size,)
        self.min_size = min_size
        self.max_size = max_size
        self.fixed_size = fixed_size
    def forward(
        self,
        images: List[Tensor],
        targets: Optional[List[Dict[str, Tensor]]] = None,
    ) -> Tuple[NestedTensor, Optional[Tensor]]:
        device = images[0].device
        # make a shallow copy so the caller's list is not modified in-place
        images = [img for img in images]
        if targets is not None:
            # make a copy of targets to avoid modifying it in-place
            # once torchscript supports dict comprehension
            # this can be simplified as follows:
            # targets = [{k: v for k, v in t.items()} for t in targets]
            targets_copy: List[Dict[str, Tensor]] = []
            for t in targets:
                data: Dict[str, Tensor] = {}
                for k, v in t.items():
                    data[k] = v.to(device)
                targets_copy.append(data)
            targets = targets_copy

        for i in range(len(images)):
            image = images[i]
            target_index = targets[i] if targets is not None else None

            if image.dim() != 3:
                raise ValueError("images is expected to be a list of 3d tensors "
                                 "of shape [C, H, W], got {}".format(image.shape))
            image, target_index = self.resize(image, target_index)
            images[i] = image
            if targets is not None and target_index is not None:
                targets[i] = target_index

        # batch the resized images into a single padded tensor and keep the
        # per-image sizes so predictions can be mapped back in postprocess
        image_sizes = [img.shape[-2:] for img in images]
        images = nested_tensor_from_tensor_list(images)
        image_sizes_list: List[Tuple[int, int]] = []
        for image_size in image_sizes:
            assert len(image_size) == 2
            image_sizes_list.append((image_size[0], image_size[1]))

        image_list = NestedTensor(images, image_sizes_list)

        if targets is not None:
            # merge the per-image targets into one (num_objects, 6) tensor whose
            # rows are (image_index, label, cx, cy, w, h) with normalized boxes
            targets_batched = []
            for i, target in enumerate(targets):
                num_objects = len(target['labels'])
                if num_objects > 0:
                    targets_merged = torch.full((num_objects, 6), i, dtype=torch.float32, device=device)
                    targets_merged[:, 1] = target['labels']
                    targets_merged[:, 2:] = target['boxes']
                    targets_batched.append(targets_merged)
            if len(targets_batched) > 0:
                targets_batched = torch.cat(targets_batched, dim=0)
            else:
                # guard against a batch with no objects at all, where torch.cat
                # would fail on an empty list
                targets_batched = torch.zeros((0, 6), dtype=torch.float32, device=device)
        else:
            targets_batched = None

        return image_list, targets_batched
    def torch_choice(self, k: List[int]) -> int:
        """
        Implements `random.choice` via torch ops so it can be compiled with
        TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
        is fixed.
        """
        index = int(torch.empty(1).uniform_(0., float(len(k))).item())
        return k[index]
    def resize(
        self,
        image: Tensor,
        target: Optional[Dict[str, Tensor]] = None,
    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
        h, w = image.shape[-2:]
        if self.training:
            size = float(self.torch_choice(self.min_size))
        else:
            # FIXME assume for now that testing uses the largest scale
            size = float(self.min_size[-1])
        image, target = _resize_image_and_masks(image, size, float(self.max_size), self.fixed_size, target)

        if target is None:
            return image, target

        bbox = target["boxes"]
        # the boxes are still expressed in the original image coordinates, so
        # normalizing by (h, w) yields resolution-independent cxcywh boxes
        bbox = normalize_boxes(bbox, (h, w))
        target["boxes"] = bbox

        return image, target
    def postprocess(
        self,
        result: List[Dict[str, Tensor]],
        image_shapes: List[Tuple[int, int]],
        original_image_sizes: List[Tuple[int, int]],
    ) -> List[Dict[str, Tensor]]:
        # rescale the predicted boxes from the resized inputs back to the
        # original image resolutions
        for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
            boxes = pred["boxes"]
            boxes = resize_boxes(boxes, im_s, o_im_s)
            result[i]["boxes"] = boxes
        return result
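
# A minimal usage sketch of YOLOTransform; the min_size / max_size values and
# the sample boxes below are illustrative only, not prescribed by this module.
#
#   >>> transform = YOLOTransform(min_size=640, max_size=640)
#   >>> images = [torch.rand(3, 480, 640), torch.rand(3, 500, 400)]
#   >>> targets = [
#   ...     {"boxes": torch.tensor([[10., 20., 100., 200.]]), "labels": torch.tensor([1])},
#   ...     {"boxes": torch.zeros((0, 4)), "labels": torch.zeros((0,), dtype=torch.int64)},
#   ... ]
#   >>> samples, targets_batched = transform(images, targets)
#   >>> samples.tensors.shape       # padded, stride-aligned batch of images
#   >>> targets_batched.shape       # (1, 6): image_idx, label, cx, cy, w, h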

def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisible: int = 32):
    # TODO make this more general
    if tensor_list[0].ndim == 3:
        if torchvision._is_tracing():
            # nested_tensor_from_tensor_list() does not export well to ONNX
            # call _onnx_nested_tensor_from_tensor_list() instead
            return _onnx_nested_tensor_from_tensor_list(tensor_list, size_divisible)

        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
        stride = float(size_divisible)
        max_size = list(max_size)
        # round the padded height and width up to the nearest multiple of the stride
        max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
        max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)

        batch_shape = [len(tensor_list)] + max_size
        tensor_batched = tensor_list[0].new_full(batch_shape, 0)
        for img, pad_img in zip(tensor_list, tensor_batched):
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    else:
        raise ValueError("images are expected to be 3d tensors of shape [C, H, W], "
                         "got {}".format(tensor_list[0].shape))
    return tensor_batched
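
# A quick illustration of the padding above with the default stride of 32:
# the batch takes the per-axis maximum of the image shapes and rounds the
# spatial dims up to the next multiple of the stride, e.g. a width of 620
# becomes 640. The shapes below are illustrative.
#
#   >>> imgs = [torch.rand(3, 500, 620), torch.rand(3, 512, 600)]
#   >>> nested_tensor_from_tensor_list(imgs).shape
#   torch.Size([2, 3, 512, 640])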

def _max_by_axis(the_list: List[List[int]]) -> List[int]:
    # element-wise maximum over a list of shapes, e.g. the largest C / H / W
    maxes = the_list[0]
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes

# _onnx_nested_tensor_from_tensor_list() is an implementation of
# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisible: int = 32) -> Tensor:
    max_size = []
    for i in range(tensor_list[0].dim()):
        max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
        max_size.append(max_size_i)

    stride = size_divisible
    max_size[1] = (torch.ceil((max_size[1].to(torch.float32)) / stride) * stride).to(torch.int64)
    max_size[2] = (torch.ceil((max_size[2].to(torch.float32)) / stride) * stride).to(torch.int64)
    max_size = tuple(max_size)

    # workaround for
    #     pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    #     m[: img.shape[1], : img.shape[2]] = False
    # which are not yet supported in ONNX
    padded_imgs = []
    for img in tensor_list:
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        padded_img = F.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
        padded_imgs.append(padded_img)

    tensor = torch.stack(padded_imgs)
    return tensor

@torch.jit.unused
def _get_shape_onnx(image: Tensor) -> Tensor:
    from torch.onnx import operators
    return operators.shape_as_tensor(image)[-2:]


@torch.jit.unused
def _fake_cast_onnx(v: Tensor) -> float:
    # ONNX requires a tensor but here we fake its type for JIT.
    return v

def _resize_image_and_masks(
    image: Tensor,
    self_min_size: float,
    self_max_size: float,
    fixed_size: Optional[Tuple[int, int]] = None,
    target: Optional[Dict[str, Tensor]] = None,
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
    """
    Resize the image (and its masks, if present) either to ``fixed_size`` or so
    that its shorter side matches ``self_min_size`` without the longer side
    exceeding ``self_max_size``.
    """
    if torchvision._is_tracing():
        im_shape = _get_shape_onnx(image)
    else:
        im_shape = torch.tensor(image.shape[-2:])

    size: Optional[List[int]] = None
    scale_factor: Optional[float] = None
    recompute_scale_factor: Optional[bool] = None
    if fixed_size is not None:
        # reorder to (height, width) as expected by F.interpolate
        size = [fixed_size[1], fixed_size[0]]
    else:
        min_size = torch.min(im_shape).to(dtype=torch.float32)
        max_size = torch.max(im_shape).to(dtype=torch.float32)
        # scale the shorter side to self_min_size unless that would push the
        # longer side past self_max_size
        scale = torch.min(self_min_size / min_size, self_max_size / max_size)

        if torchvision._is_tracing():
            scale_factor = _fake_cast_onnx(scale)
        else:
            scale_factor = scale.item()
        recompute_scale_factor = True

    image = F.interpolate(image[None], size=size, scale_factor=scale_factor, mode='bilinear',
                          recompute_scale_factor=recompute_scale_factor, align_corners=False)[0]

    if target is None:
        return image, target

    if "masks" in target:
        mask = target["masks"]
        mask = F.interpolate(mask[:, None].float(), size=size, scale_factor=scale_factor,
                             recompute_scale_factor=recompute_scale_factor)[:, 0].byte()
        target["masks"] = mask
    return image, target
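
# A worked example of the scaling rule above (no fixed_size): for a 480 x 640
# image with self_min_size=800 and self_max_size=1333 (illustrative values
# borrowed from common detection setups), the shorter-side scale 800 / 480
# keeps the longer side at about 1067 <= 1333, so it is the one used and the
# output is roughly 800 x 1067.
#
#   >>> img = torch.rand(3, 480, 640)
#   >>> out, _ = _resize_image_and_masks(img, 800., 1333.)
#   >>> out.shape[-2:]   # approximately (800, 1067), up to rounding in F.interpolate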

def resize_boxes(boxes: Tensor, original_size: List[int], new_size: List[int]) -> Tensor:
    """
    Rescale ``boxes`` (xyxy, in pixels) from ``original_size`` to ``new_size``,
    both given as (height, width).
    """
    ratios = [
        torch.tensor(s, dtype=torch.float32, device=boxes.device) /
        torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
        for s, s_orig in zip(new_size, original_size)
    ]
    ratio_height, ratio_width = ratios
    xmin, ymin, xmax, ymax = boxes.unbind(1)

    xmin = xmin * ratio_width
    xmax = xmax * ratio_width
    ymin = ymin * ratio_height
    ymax = ymax * ratio_height
    return torch.stack((xmin, ymin, xmax, ymax), dim=1)

def normalize_boxes(boxes: Tensor, original_size: List[int]) -> Tensor:
    """
    Normalize ``boxes`` (xyxy, in pixels) by ``original_size``, given as
    (height, width), and convert them to the cxcywh format used for the targets.
    """
    height = torch.tensor(original_size[0], dtype=torch.float32, device=boxes.device)
    width = torch.tensor(original_size[1], dtype=torch.float32, device=boxes.device)
    xmin, ymin, xmax, ymax = boxes.unbind(1)

    xmin = xmin / width
    xmax = xmax / width
    ymin = ymin / height
    ymax = ymax / height

    boxes = torch.stack((xmin, ymin, xmax, ymax), dim=1)
    # convert from xyxy to cxcywh
    return box_convert(boxes, in_fmt='xyxy', out_fmt='cxcywh')
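
# A small worked example of normalize_boxes: a box of width 100 px and height
# 50 px whose top-left corner sits at (10, 20) inside a 200 x 400 (height x
# width) image. The numbers are illustrative only.
#
#   >>> boxes = torch.tensor([[10., 20., 110., 70.]])   # xyxy in pixels
#   >>> normalize_boxes(boxes, [200, 400])
#   tensor([[0.1500, 0.2250, 0.2500, 0.2500]])          # cx, cy, w, h (normalized)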