# auto_tensor.py
from typing import Tuple
import numpy as np
from tensor import Tensor, tensor, ones, zeros
import nn
import optim
def accumulate_grad(target: Tensor, grad: np.ndarray):
    """
    Accumulate a gradient into the target Tensor.
    :param target: Tensor whose .grad is updated in place
    :param grad: gradient to add; must have the same shape as target
    :return: None
    """
    # constants do not receive gradients
    if target.is_const:
        return
    if isinstance(target.operation, SumOp):
        assert target.shape == grad.shape, \
            'Cannot take the derivative of a summed tensor explicitly.\n' \
            'Please avoid broadcasting a summed tensor and choose another way to calculate.\n' \
            'This is why softmax is provided as a basic operation.'
    assert target.shape == grad.shape, \
        'Tensor and gradient shapes are not compatible. Tensor: {}, Gradient: {}'.format(target.shape, grad.shape)
    target.grad += grad
class Operation:
    def __call__(self, lhs: Tensor, rhs: Tensor) -> Tensor:
        return self.forward(lhs, rhs)
    def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
        """
        Calculate the forward pass and return a new Tensor.
        :param lhs: left-hand operand; the only operand for unary operations
        :param rhs: right-hand operand; None for unary operations
        :return: result Tensor wired into the computation graph
        """
        raise NotImplementedError
    def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
        """
        Propagate the gradient backward to the operands.
        :param lhs: left-hand operand; the only operand for unary operations
        :param rhs: right-hand operand; None for unary operations
        :param acc_grad: gradient accumulated so far
        :return: None
        """
        raise NotImplementedError
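# Example sketch (hedged): how a concrete Operation builds the graph and propagates
# gradients, using the add_op singleton defined near the bottom of this file. It assumes
# the tensor() factory accepts (data, name) with defaults for the remaining flags and
# that .grad starts as zeros, which is not shown in this file:
#
#     a = tensor(np.array([1.0, 2.0]), 'a')
#     b = tensor(np.array([3.0, 4.0]), 'b')
#     c = add_op(a, b)                       # forward: c.data == [4., 6.]; c records lhs/rhs/operation
#     add_op.backward(a, b, np.ones_like(c.data))
#     # d(a+b)/da = d(a+b)/db = 1, so a.grad and b.grad each accumulate a vector of ones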
class AddOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
new_data = lhs.data + rhs.data
new_name = '({}+{})'.format(lhs.name, rhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
accumulate_grad(lhs, acc_grad)
accumulate_grad(rhs, acc_grad)
class OnesLikeOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = np.ones_like(lhs.data)
new_name = 'OnesLike({})'.format(lhs.name)
return Tensor(new_data, new_name, lhs=lhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
pass
class ZerosLikeOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = np.zeros_like(lhs.data)
new_name = 'ZerosLike({})'.format(lhs.name)
return Tensor(new_data, new_name, lhs=lhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
pass
class MulOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
new_data = lhs.data * rhs.data
new_name = '({}*{})'.format(lhs.name, rhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
accumulate_grad(lhs, rhs.data * acc_grad)
accumulate_grad(rhs, lhs.data * acc_grad)
class DivOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
new_data = lhs.data / rhs.data
new_name = '({}/{})'.format(lhs.name, rhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
numerator_grad = np.ones_like(lhs.data) / rhs.data
denominator_grad = (-lhs.data) / (rhs.data ** 2)
accumulate_grad(lhs, numerator_grad * acc_grad)
accumulate_grad(rhs, denominator_grad * acc_grad)
class ExpOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = np.exp(lhs.data)
new_name = 'exp({})'.format(lhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
assert not rhs
this_grad = np.exp(lhs.data)
accumulate_grad(lhs, this_grad * acc_grad)
class ViewOp(Operation):
"""Not a singleton, sacrifice performance to maintain interface consistency"""
def __init__(self, new_shape: Tuple):
self.new_shape = new_shape
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = lhs.data.reshape(self.new_shape)
new_name = '(view({},{}))'.format(lhs.name, self.new_shape)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
assert not rhs
acc_grad_reshape = acc_grad.reshape(lhs.shape)
accumulate_grad(lhs, acc_grad_reshape)
class PermuteOp(Operation):
def __init__(self, axes: Tuple):
self.axes = axes
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = np.transpose(lhs.data, self.axes)
new_name = '(permute({},{}))'.format(lhs.name, self.axes)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
assert not rhs
new_axes = [0] * len(self.axes)
# permute back to original space
for k, v in enumerate(self.axes):
new_axes[v] = k
acc_grad_permute = np.transpose(acc_grad, tuple(new_axes))
accumulate_grad(lhs, acc_grad_permute)
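# Sketch (numpy only): backward() builds the inverse permutation, so transposing the
# gradient with it maps each axis back to where it came from:
#
#     x = np.zeros((2, 3, 4))
#     axes = (2, 0, 1)
#     inv = [0] * len(axes)
#     for k, v in enumerate(axes):
#         inv[v] = k                          # inv == [1, 2, 0]
#     assert np.transpose(np.transpose(x, axes), tuple(inv)).shape == x.shape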
class MatrixMulOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
new_data = np.matmul(lhs.data, rhs.data)
new_name = 'matmul({},{})'.format(lhs.name, rhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
lhs_trans = np.transpose(lhs.data)
rhs_trans = np.transpose(rhs.data)
accumulate_grad(lhs, acc_grad @ rhs_trans)
accumulate_grad(rhs, lhs_trans @ acc_grad)
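# Sketch (numpy only): for C = A @ B with upstream gradient G of C's shape,
# dL/dA = G @ B.T and dL/dB = A.T @ G, which is exactly what backward() accumulates:
#
#     A = np.random.randn(3, 4); B = np.random.randn(4, 5); G = np.ones((3, 5))
#     assert (G @ B.T).shape == A.shape and (A.T @ G).shape == B.shape
# Note that np.transpose without axes reverses all dimensions, so this backward is
# meant for 2-dim matrices rather than batched matmul.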
class CatOp(Operation):
    def __init__(self, axes: int):
        """
        Concatenate two tensors of the same dimensionality along the given axis.
        :param axes: axis (single int) along which to concatenate
        """
        self.axes = axes
    def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
        assert len(lhs.shape) == len(rhs.shape), 'The two tensors should have the same number of dimensions'
new_data = np.concatenate((lhs.data, rhs.data), axis=self.axes)
new_name = 'cat({},{},axes={})'.format(lhs.name, rhs.name, self.axes)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
lhs_len = lhs.shape[self.axes]
rhs_len = rhs.shape[self.axes]
lhs_grad, rhs_grad, _ = np.split(acc_grad, [lhs_len, lhs_len + rhs_len], self.axes)
accumulate_grad(lhs, lhs_grad)
accumulate_grad(rhs, rhs_grad)
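# Sketch (numpy only): np.split at [lhs_len, lhs_len + rhs_len] returns the two original
# blocks plus an empty trailing piece, which backward() simply discards:
#
#     a = np.zeros((2, 3)); b = np.zeros((2, 5))
#     g = np.ones_like(np.concatenate((a, b), axis=1))
#     ga, gb, rest = np.split(g, [3, 3 + 5], axis=1)
#     assert ga.shape == a.shape and gb.shape == b.shape and rest.size == 0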
class SumOp(Operation):
def __init__(self, axes):
self.axes = axes
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = np.sum(lhs.data, self.axes, keepdims=True)
new_name = 'sum({},axis={})'.format(lhs.name, self.axes)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
repeat_len = lhs.shape[self.axes]
acc_grad = np.repeat(acc_grad, repeat_len, axis=self.axes)
accumulate_grad(lhs, acc_grad)
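# Sketch (numpy only): because forward() keeps dims, the upstream gradient has length 1
# along the summed axis and can simply be repeated back to the input shape:
#
#     x = np.arange(6.0).reshape(2, 3)
#     s = np.sum(x, 0, keepdims=True)            # shape (1, 3)
#     g = np.ones_like(s)                         # upstream gradient for s
#     assert np.repeat(g, x.shape[0], axis=0).shape == x.shape
# If s were broadcast against another tensor first, the gradient arriving here would no
# longer match s.shape, which is exactly the case accumulate_grad rejects for SumOp outputs.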
class LogOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = np.log(lhs.data) # TODO log(0)
new_name = 'log({})'.format(lhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
assert not rhs
curr_grad = 1.0 / lhs.data # TODO overflow
accumulate_grad(lhs, curr_grad * acc_grad)
def _cal_softmax_np(data: np.ndarray, axes):
    max_val = np.max(data, axis=axes, keepdims=True)  # subtract the max to avoid overflow
    exp_input = np.exp(data - max_val)
    denominator = np.sum(exp_input, axis=axes, keepdims=True)
    new_data = exp_input / denominator
    return new_data
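# Sketch (numpy only): subtracting the row-wise max leaves the softmax unchanged but
# keeps np.exp from overflowing on large logits:
#
#     x = np.array([[1000.0, 1001.0, 1002.0]])
#     naive = np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)   # inf / inf -> nan
#     stable = _cal_softmax_np(x, 1)                                 # well-defined probabilities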
class SoftmaxOp(Operation):
def __init__(self, axes):
self.axes = axes
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = _cal_softmax_np(lhs.data, self.axes)
new_name = 'softmax({})'.format(lhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
    def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
        assert not rhs
        y = _cal_softmax_np(lhs.data, self.axes)
        # full Jacobian-vector product of softmax: dL/dx = y * (g - sum(g * y)) along the
        # softmax axes; the elementwise y * (1 - y) term alone only covers the diagonal
        accumulate_grad(lhs, y * (acc_grad - np.sum(acc_grad * y, axis=self.axes, keepdims=True)))
class LogSoftmaxOp(Operation):
    """
    While mathematically equivalent to log(softmax(x)), doing the two operations
    separately is slower and numerically unstable. This operation uses an alternative
    formulation to compute the output and gradient correctly.
    See https://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick/
    """
def __init__(self, axes):
self.axes = axes
def _cal_log_softmax_np(self, data: np.ndarray):
b = np.max(data, axis=self.axes, keepdims=True) # avoid overflow
exp_data = np.exp(data - b)
exp_sum = np.sum(exp_data, axis=self.axes, keepdims=True)
new_data = data - b - np.log(exp_sum)
return new_data
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = self._cal_log_softmax_np(lhs.data)
new_name = 'log_softmax({})'.format(lhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
    def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
        assert not rhs
        y = _cal_softmax_np(lhs.data, self.axes)
        # d(log_softmax)/dx applied to the upstream gradient g is g - softmax(x) * sum(g),
        # with the sum taken over the softmax axes
        accumulate_grad(lhs, acc_grad - y * np.sum(acc_grad, axis=self.axes, keepdims=True))
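# Sketch (numpy only): log_softmax(x) equals x - logsumexp(x), so exponentiating it must
# recover the softmax; a quick consistency check for the stable formulation:
#
#     x = np.random.randn(2, 5)
#     ls = LogSoftmaxOp(1)._cal_log_softmax_np(x)
#     assert np.allclose(np.exp(ls), _cal_softmax_np(x, 1))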
class ReluOp(Operation):
def forward(self, lhs: Tensor, rhs: Tensor) -> Tensor:
assert not rhs
new_data = np.where(lhs.data > 0, lhs.data, 0)
new_name = 'relu({})'.format(lhs.name)
return Tensor(new_data, new_name, lhs=lhs, rhs=rhs, operation=self)
def backward(self, lhs: Tensor, rhs: Tensor, acc_grad: np.ndarray):
assert not rhs
mask = np.where(lhs.data > 0, 1, 0)
accumulate_grad(lhs, mask * acc_grad)
# singleton factory
zeros_like = ZerosLikeOp()
ones_like = OnesLikeOp()
add_op = AddOp()
mul_op = MulOp()
div_op = DivOp()
exp_op = ExpOp()
matmul = MatrixMulOp()
log_op = LogOp()
relu_op = ReluOp()
def exp(ts: Tensor) -> Tensor:
"""exp operation wrapper"""
return exp_op(ts, None)
def log(ts: Tensor) -> Tensor:
"""log operation wrapper"""
return log_op(ts, None)
def view(ts: Tensor, new_shape: Tuple) -> Tensor:
    """Reshape a tensor to new_shape without changing its data (like np.reshape)."""
    view_op = ViewOp(new_shape)
    return view_op(ts, None)
def transpose(ts: Tensor) -> Tensor:
    """Shortcut for transposing a 2-dim matrix."""
    assert len(ts.shape) == 2
    return permute(ts, (1, 0))
def permute(ts: Tensor, axes) -> Tensor:
    """
    Same as np.transpose; the permutation of axes must be specified.
    :param ts: tensor to permute
    :param axes: tuple of axis indices; for a 2-dim tensor, (1, 0) is a transpose
    :return: permuted Tensor
    """
    permute_op = PermuteOp(axes)
    return permute_op(ts, None)
def sigmoid(ts: Tensor) -> Tensor:
exp_minus_x = exp(-ts)
return 1 / (1 + exp_minus_x)
def relu(ts: Tensor) -> Tensor:
return relu_op(ts, None)
def cat(ts1: Tensor, ts2: Tensor, axes: int) -> Tensor:
    """
    Concatenate two tensors along the given axis.
    :param ts1: first tensor
    :param ts2: second tensor
    :param axes: axis along which to concatenate
    :return: concatenated Tensor
    """
    cat_op = CatOp(axes)
    return cat_op(ts1, ts2)
def sum(ts: Tensor, axes: int) -> Tensor:
    """Sum a tensor along the given axis, keeping that dimension with length 1
    (note: shadows Python's built-in sum within this module)."""
    sum_op = SumOp(axes)
    return sum_op(ts, None)
def binary_cross_entropy(input: Tensor, target: Tensor) -> Tensor:
    """
    Calculate the mean binary cross entropy between input and target.
    :param input: predicted probabilities, shape (batch_size, 1)
    :param target: binary labels (0 or 1), shape (batch_size, 1)
    :return: scalar loss Tensor of shape (1, 1)
    """
    assert input.shape == target.shape, 'input and target have different shapes!'
    assert len(input.shape) == 2, 'binary cross entropy expects 2-dim inputs (batch_size, 1)'
    assert input.shape[1] == 1, 'input.shape[1] should be 1 for binary cross entropy'
    loss = target * log(input) + (1 - target) * log(1 - input)
    return -sum(loss, 0) / input.shape[0]
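# Usage sketch (hedged): assumes the tensor() factory accepts (data, name) with defaults
# for the remaining flags and that sigmoid() returns a (batch_size, 1) Tensor:
#
#     logits = tensor(np.random.randn(4, 1), 'logits')
#     labels = tensor(np.array([[0.], [1.], [1.], [0.]]), 'labels')
#     loss = binary_cross_entropy(sigmoid(logits), labels)   # Tensor of shape (1, 1)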
def softmax(input: Tensor, axes: int) -> Tensor:
    """Softmax along the given axis."""
    softmax_op = SoftmaxOp(axes)
    return softmax_op(input, None)
def log_softmax(input: Tensor, axes: int) -> Tensor:
    """Numerically stable equivalent of log(softmax(input)); avoids overflow."""
    log_softmax_op = LogSoftmaxOp(axes)
    return log_softmax_op(input, None)
def cross_entropy(input: Tensor, target: Tensor) -> Tensor:
    """
    Calculate the mean cross entropy between input logits and target class indices.
    :param input: unnormalised logits, shape (batch_size, classes)
    :param target: class indices, shape (batch_size,), 1-dim
    :return: scalar loss Tensor of shape (1, 1)
    """
norm_log = log_softmax(input, 1)
np_one_hot = np.eye(input.shape[1])[target.data]
tensor_one_hot = tensor(np_one_hot, 'one-hot', False, True)
mask = -norm_log * tensor_one_hot
mask_sum = sum(mask, 1)
loss = sum(mask_sum, 0)
return loss / input.shape[0]
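# Usage sketch (hedged): mirrors the tensor() call used for the one-hot constant above;
# whether the class indices need an explicit integer dtype depends on tensor.py:
#
#     logits = tensor(np.random.randn(4, 3), 'logits')
#     labels = tensor(np.array([0, 2, 1, 2]), 'labels', False, True)   # constant class indices
#     loss = cross_entropy(logits, labels)    # mean negative log-likelihood, Tensor of shape (1, 1)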