<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>reveal.js – NNML 2020</title>
<meta name="description" content="A framework for easily creating beautiful presentations using HTML">
<meta name="author" content="Hakim El Hattab">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="dist/reset.css">
<link rel="stylesheet" href="dist/reveal.css">
<link rel="stylesheet" href="dist/theme/nmilab.css" id="theme">
<!-- Theme used for syntax highlighting of code -->
<link rel="stylesheet" href="plugin/highlight/monokai.css" id="highlight-theme">
<link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/dreampulse/computer-modern-web-font/master/fonts.css">
<link rel="stylesheet" type="text/css" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css">
<link rel="stylesheet" type="text/css" href="lib/css/monokai.css">
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/animate.css/4.0.0/animate.min.css"
/>
<link rel="stylesheet" href="reveal.js-plugins/appearance/appearance.css" />
<link rel="stylesheet" href="plugin/highlight/monokai.css" id="highlight-theme" />
<!-- Printing and PDF exports -->
<script>
var link = document.createElement( 'link' );
link.rel = 'stylesheet';
link.type = 'text/css';
link.href = window.location.search.match( /print-pdf/gi ) ? 'css/print/pdf.css' : 'css/print/paper.css';
document.getElementsByTagName( 'head' )[0].appendChild( link );
</script>
<script>
// Demo helper: read the weights w0, w1 and the bias b from the input fields,
// compute the activation a = w0*x0 + w1*x1 + b for each binary input pair,
// and threshold at zero to produce the perceptron output y.
function solve()
{
let w0 = eval(document.getElementById("w0").value);
let w1 = eval(document.getElementById("w1").value);
let b = eval(document.getElementById("b").value);
let pairs = [[1, 1], [1, 0], [0, 1], [0, 0]];
for (let [x0, x1] of pairs) {
let a = w0 * x0 + w1 * x1 + b;
document.getElementById("a" + x0 + x1).innerHTML = a.toFixed(2);
document.getElementById("y" + x0 + x1).innerHTML = ((a > 0) * 1);
}
document.getElementById('w0val').innerHTML = w0;
document.getElementById('w1val').innerHTML = w1;
document.getElementById('bval').innerHTML = b;
}
</script>
<script async defer src="https://buttons.github.io/buttons.js"></script>
</head>
<body>
<div class="reveal">
<!-- Any section element inside of this container is displayed as a slide -->
<div class="slides">
<section data-external="title.html" data-vertical-align-top data-background-color=#B2BA67 ></section>
<section data-markdown data-vertical-align-top data-background-color=#B2BA67><textarea data-template>
<h1> Lecture 7: Generative models and Autoencoders<br/> </h1>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Goody Bag III: Data Loading, Pre-Processing and DenseNets</h2>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Data Loading</h2>
<ul>
<li/> Loading and pre-processing the data is a tedious process, often dwarfing the implementation of the neural network itself
<li /> Machine learning frameworks simplify this process; in PyTorch, the relevant abstractions are called datasets and data loaders.
<li /> Data loaders for popular vision datasets already exist (MNIST, Fashion-MNIST, CIFAR-10, ImageNet, etc.), but often you will have to create your own dataset.
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Goody III.1 ImageFolder and DatasetFolder </h2>
<ul>
<li/> If your dataset is organized as .png images in one directory per class, in the following way:
<pre><code class="Python" data-trim data-noescape>
root/dog/xxx.png
root/dog/xxy.png
root/dog/xxz.png
root/cat/123.png
root/cat/nsdf3.png
root/cat/asd932_.png
</code></pre>
then you can use
<pre><code class="Python" data-trim data-noescape>
torchvision.datasets.ImageFolder
</code></pre>
<li class=fragment />If instead of .png you have other files (not necessarily images), you can use
<pre><code class="Python" data-trim data-noescape>
torchvision.datasets.DatasetFolder
</code></pre>
This class requires a "loader" argument: a function that can open the file and produce an array. A usage sketch of ImageFolder follows below.
</ul>
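A minimal usage sketch (the root path, image size, and batch size are illustrative, matching the layout above):
<pre><code class="Python" data-trim data-noescape>
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

transform = transforms.Compose([transforms.Resize((32, 32)),
                                transforms.ToTensor()])
dataset = torchvision.datasets.ImageFolder("root", transform=transform)
loader = DataLoader(dataset, batch_size=64, shuffle=True)
images, labels = next(iter(loader))  # labels follow alphabetical folder order (cat=0, dog=1)
</code></pre>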
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Goody III.2 Dataset and DataLoader </h2>
<ul>
<li/> The most general case is to create your own Dataset class, as follows (a usage sketch follows the class):
<pre><code class="Python" data-trim data-noescape>
from torch.utils.data import Dataset
class MyTrainingDataset(Dataset):
def __init__(self, root, transform=None, target_transform=None):
self.root = root
self.transform = transform
self.target_transform = target_transform
self.data = np.load(self.root) #replace this
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
... #write here code to obtain one data sample and target
if self.transform:
sample = self.transform(sample)
if self.target_transform:
target = self.target_transform(target)
return sample, target
</code></pre>
</ul>
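A sketch of how such a dataset plugs into a DataLoader (the .npy path is hypothetical):
<pre><code class="Python" data-trim data-noescape>
from torch.utils.data import DataLoader

train_set = MyTrainingDataset("data/train.npy")  # hypothetical file
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
for sample, target in train_loader:  # works once __getitem__ is filled in
    pass  # one training step per mini-batch
</code></pre>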
</textarea></section>
<!-- <section data-markdown><textarea data-template>
<h2> Goody III.2 Dataset Example: The Wandering Anteater </h2>
<ul>
<li/> Let's create a dataset with the UCI anteater overlayed on grocery store pictures
</ul>
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/open?id=1LqWInxSS198mlqHmGLGt4dNlWbWFVr2U)
</textarea></section>
-->
<section data-markdown data-vertical-align-top><textarea data-template>
<h2> Goody III.3 DenseNets </h2>
<div class=row>
<div class=column>
<img src="images/densenet_fig1.png" />
</div>
<div class=column>
<ul>
<li>Connects each layer to every other layer in a feed-forward fashion</li>
<li>Outputs of convolutional operations are concatenated</li>
<li>Alleviates the vanishing gradients problem</li>
<li>Substantially reduces the number of parameters (a sketch of the concatenation idea follows below)</li>
</ul>
</div>
</div>
<p class=ref><a href=https://arxiv.org/pdf/1608.06993.pdf >Huang et al. 2016</a></p>
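A minimal sketch of the core mechanism, a layer whose new feature maps are concatenated with its input (channel counts and growth rate are illustrative):
<pre><code class="Python" data-trim data-noescape>
import torch
import torch.nn as nn

class DenseLayer(nn.Module):
    def __init__(self, in_channels, growth_rate=12):
        super().__init__()
        # BN-ReLU-Conv, as in the DenseNet paper
        self.conv = nn.Sequential(nn.BatchNorm2d(in_channels), nn.ReLU(),
                                  nn.Conv2d(in_channels, growth_rate, 3, padding=1))
    def forward(self, x):
        # concatenate the new feature maps with all previous ones along channels
        return torch.cat([x, self.conv(x)], dim=1)
</code></pre>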
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/pytorch_vision_densenet.ipynb)
</textarea></section>
<section data-markdown data-vertical-align-top><textarea data-template>
<h2> Generative Models </h2>
</textarea></section>
<section data-markdown data-vertical-align-top><textarea data-template>
<h2> Generative vs. Discriminative Models </h2>
<ul>
<li/> So far, we have been interested in discriminative models, i.e. learning the distribution
$$ p(\mathbf{y}|\mathbf{x}) $$
where $\mathbf{x}$ is the input and $\mathbf{y}$ is the target (e.g. a class label).
<li /> Generative models instead focus on learning the data distribution
$$ p(\mathbf{x}) $$
For example in MNIST, $p(\mathbf{x})$ may be high around vectors $\mathbf{x}$ corresponding to digits, and near zero elsewhere
<li /> Generative models can be used for unsupervised training (no labels), but are more challenging to learn.
<li /> In this class, we will see different approaches for learning generative models
</ul>
</textarea>
</section>
<section data-markdown data-vertical-align-top><textarea data-template>
<h2> Approaches to Generative Modeling </h2>
<img src="images/generative_modeling_approaches.png" class=stretch />
<p class=ref> NIPS 2016 Tutorial: Generative Adversarial Networks, Ian Goodfellow 2017 </p>
</textarea>
</section>
<section data-markdown><textarea data-template>
<h2> Autoencoder </h2>
<ul>
<li /> An Autoencoder (AE) is a neural network that is trained to copy its input to its output.
<li /> An AE has two parts: an encoder $\mathbf{h} = f(\mathbf{x})$ and a decoder $\mathbf{r} = g(\mathbf{h})$
<img src="images/Autoencoder_schema.png" class=large />
<p class=ref> By <a href="//commons.wikimedia.org/w/index.php?title=User:Michela_Massi&action=edit&redlink=1" class="new" title="User:Michela Massi (page does not exist)">Michela Massi</a> - <span class="int-own-work" lang="en"></span>, <a href="https://creativecommons.org/licenses/by-sa/4.0" title="Creative Commons Attribution-Share Alike 4.0">CC BY-SA 4.0</a>, <a href="https://commons.wikimedia.org/w/index.php?curid=80177333"></a> </p>
<li /> The hidden layer activations $\mathbf{h}$ form a <em>code</em> used to represent the input
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Autoencoder </h2>
<ul>
<li /> AEs are designed to be unable to replicate the input perfectly, <em>i.e.</em> they cannot learn $\mathbf{x} = g(f(\mathbf{x}))$ exactly.
<li />Instead, the capacity of the AE is limited so that it learns useful properties/patterns of the data; otherwise the encoder and decoder could simply invert each other ($f = g^{-1}$) and copy the input through
<li /> In practice, $f$ and $g$ are neural networks as we have seen so far, <em> e.g. </em> feed forward or convolutional networks.
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Undercomplete AE</h2>
<ul>
<li/> In an AE, we are mainly interested in $\mathbf{h}$, <em>i.e.</em> the output of the encoder.
<li/> One way to prevent the AE from learning the identity function is to choose a code whose dimension is smaller than the input. This is called an undercomplete AE.
<li class=fragment data-fragment-index="2" /> Learning the undercomplete representation forces the autoencoder to capture the most salient features of the training data. This is the classical idea of a "bottleneck"
<li class=fragment data-fragment-index="2"/> The loss function is: $$L(\mathbf{x}, g(f(\mathbf{x})))$$ where $L$ is typically the Mean Squared Error (a minimal sketch follows below).
<li class=fragment data-fragment-index="3" /> Undercomplete AEs are limited by the fact that the code dimension must remain smaller than the input, and the encoder/decoder must not be too powerful.
</ul>
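A minimal PyTorch sketch of an undercomplete AE on flattened 28×28 inputs (the layer sizes are illustrative, not from the notebook):
<pre><code class="Python" data-trim data-noescape>
import torch.nn as nn

class UndercompleteAE(nn.Module):
    def __init__(self, code_dim=32):
        super().__init__()
        # encoder f: 784 -> code_dim (the bottleneck)
        self.encoder = nn.Sequential(nn.Linear(784, 256), nn.ReLU(),
                                     nn.Linear(256, code_dim))
        # decoder g: code_dim -> 784
        self.decoder = nn.Sequential(nn.Linear(code_dim, 256), nn.ReLU(),
                                     nn.Linear(256, 784))
    def forward(self, x):
        h = self.encoder(x)        # the code h = f(x)
        return self.decoder(h)     # the reconstruction g(f(x))

loss_fn = nn.MSELoss()  # L(x, g(f(x)))
</code></pre>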
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/open?id=1doORfr4mUh-IQKwDS_p3xVTLuTrU94yz)
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Regularized AE </h2>
<ul>
<li/> Another way to limit capacity is to use regularization
<li /> Regularized AEs encourage properties other than "copying" the input, such as sparsity of the representation, smooth representations, and robustness to noise.
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Example Regularized AE: Sparse AE</h2>
<ul>
<li/> A sparse autoencoder uses a sparsity penalty, such as the L1 norm, which forces many components of $\mathbf{h}$ to be zero:
$$ L(\mathbf{x}, g(f(\mathbf{x}))) + \lambda \sum_i |h_i| $$
<li class=fragment /> There is a loose connection with log-likelihood optimization:
$$
\begin{split}
p(\mathbf{x}) &= \sum_\mathbf{h} p(\mathbf{x},\mathbf{h})\\
p(\mathbf{x},\mathbf{h}) & = p(\mathbf{x}|\mathbf{h})p(\mathbf{h})\\
\log p(\mathbf{x},\mathbf{h}) & = \log p(\mathbf{x}|\mathbf{h}) + \log p(\mathbf{h})\\
\end{split}
$$
such that the regularizing term acts as a prior on $\mathbf{h}$. In this case, the prior distribution is a Laplace prior
$$
p(h_i) = \frac{\lambda}{2} \exp(-\lambda |h_i|)
$$
whose negative log, $-\log p(h_i) = \lambda |h_i| + \text{const}$, is exactly the L1 penalty (a loss sketch follows below)
</ul>
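A sketch of the corresponding training loss, assuming a model with the encoder/decoder split from the previous slide:
<pre><code class="Python" data-trim data-noescape>
import torch.nn.functional as F

def sparse_ae_loss(x, model, lam=1e-3):
    # L(x, g(f(x))) + lambda * sum_i |h_i|
    h = model.encoder(x)
    x_hat = model.decoder(h)
    return F.mse_loss(x_hat, x) + lam * h.abs().sum(dim=1).mean()
</code></pre>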
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Denoising Autoencoder </h2>
<ul>
<li/> Another way to limit what the AE can copy is to corrupt the data with noise (analogous to <em>dropout</em>, which injects noise into the activations)
<li/> A denoising autoencoder (DAE) instead minimizes
$$ L(\mathbf{x}, g(f(\mathbf{\tilde{x}}))) $$
where
$\tilde{\mathbf{x}}$ is a copy of $\mathbf{x}$ that has been corrupted by some form of noise.
<li /> The noise can be simply additive Gaussian noise
<img src="images/denoising_ae_example.png" class=small />
<li /> Denoising training forces $f$ and $g$ to learn the structure of the input $\mathbf{x}$ (a sketch of one training step follows below)
</ul>
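A minimal sketch of one denoising training step with additive Gaussian noise (the noise level $\sigma$ is illustrative):
<pre><code class="Python" data-trim data-noescape>
import torch
import torch.nn.functional as F

def denoising_step(x, model, sigma=0.3):
    x_tilde = x + sigma * torch.randn_like(x)  # corrupted copy of x
    x_hat = model(x_tilde)                     # reconstruct from the corrupted input
    return F.mse_loss(x_hat, x)                # compare to the clean input
</code></pre>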
</textarea></section>
<section data-markdown><textarea data-template>
<h2> In-class Assignment: Improve AE</h2>
- Using a technique of your choice (regularization, noise, sparsity), try to improve the reconstruction quality on test data.
- Use a visualization of your choice (PCA, ICA, t-SNE) to visualize the latent space.
- Extra: Use convolutional and transposed convolutional layers to implement a convolutional AE
<pre><code class="Python" data-trim data-noescape>
nn.ConvTranspose2d(16, 1, 2, stride=2)
</code></pre>
</textarea></section>
<section data-markdown><textarea data-template>
<h2>Stochastic Encoders and Decoders </h2>
<ul>
<li /> AEs are typically described in the probabilistic domain, using a probabilistic encoder $p_{enc}(\mathbf{h}|\mathbf{x})$ and decoder $p_{dec}(\mathbf{x}|\mathbf{h})$
<img src="images/goodfellow_14_2.png" class=normal />
<li class=fragment /> The decoder provides the conditional distribution $p_{dec}(\mathbf{x}|\mathbf{h})$. The AE can be trained by minimizing $-\log p_{dec}(\mathbf{x}|\mathbf{h})$.
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Denoising Autoencoder using Stochastic Encoders and Decoders </h2>
<ul>
<li/> Images are corrupted by a noise process $C(\tilde{\mathbf{x}}|\mathbf{x})$.
<img src="images/denoising_ae_graph.png" />
<ol>
<li /> Sample $\mathbf{x}$ from training distribution
<li /> Sample a corrupted version $\tilde{\mathbf{x}}$ from $C(\tilde{\mathbf{x}}|\mathbf{x})$
<li /> Use the pair $(\tilde{\mathbf{x}},\mathbf{x})$ to evaluate the loss $$ L = -\log p_{decoder}(\mathbf{x}|\mathbf{h}(\tilde{\mathbf{x}}))$$
</ol>
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Learning Manifolds </h2>
<ul>
<li /> AEs exploit the idea that data concentrates around a low-dimensional manifold
<li class=fragment data-fragment-index=2 /> The AE specializes in representing the variations that are needed to reconstruct the training samples
<li class=fragment data-fragment-index=3 /> For example, the space of vertical translations is 1-D, so it traces out a line (here the activations are projected onto the first 2 principal components)
<img src="images/manifold_learning.png" class= small />
</ul>
</textarea></section>
<section data-markdown data-vertical-align-top><textarea data-template>
<h2> Generative Models </h2>
<ul>
<li /> In high-dimensional spaces, $p(\mathbf{x})$ vanishes for most vectors $\mathbf{x}$; it is more useful to focus on the points that are likely.
<li class=fragment data-fragment-index= 2/> For example, if we wish to predict the lower half of a digit, it would be very helpful to first decide which digit to draw. The digit identity forms the <em> latent variable </em> (code), which we call $\mathbf{z}$.
<img src="images/Samples-drawn-from-the-prediction-of-the-lower-half-of-the-MNIST-test-data-digits-based.png" class=small />
<li class=fragment data-fragment-index=3/> For this, we need to create a relationship between $\mathbf{z}$ and $\mathbf{x}$. We aim to maximize $p(\mathbf{x})$ for every $\mathbf{x}$ in the dataset, <em> i.e. </em>
$$ p(\mathbf{x}) = \int_\mathbf{z} p(\mathbf{x}|\mathbf{z}) p(\mathbf{z}) \mathrm{d}\mathbf{z} $$
where $\mathbf{z}$ is a latent variable
<li class=fragment data-fragment-index=3/> There are two challenges: how to decide on $\mathbf{z}$, and how to perform the integral.
</ul>
</textarea>
</section>
<section data-markdown><textarea data-template>
<h2> Variational AE (VAE) </h2>
<ul>
<li /> Variational Autoencoders were proposed as approximate but fast algorithms for maximizing $p(\mathbf{x})$
<li /> VAEs assert that $\mathbf{z}$ is drawn from a simple distribution, such as a multivariate Gaussian $N(0,\mathbb{I})$.
<li class=fragment data-fragment-index=2 /> How can this $\mathbf{z}$ be powerful enough? Consider this: a random variable with one distribution can be transformed into a variable with any other distribution, provided a complex enough function.
<img src="images/from_gaussian_to_arbitrary.png">
<p class=ref>Doersch, 2016, <em>Tutorial on Variational Autoencoders</em> </p>
<li class=fragment data-fragment-index=2 /> So from independent Gaussian-distributed variables, we can obtain any latent variable distribution, which can then be mapped to $\mathbf{x}$ using some function $f$.
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Variational AE (VAE) </h2>
<ul>
<li/> VAEs use a Gaussian distribution for $p(\mathbf{x}|\mathbf{z})$,
$$ p(\mathbf{x}) = \int_\mathbf{z} N(\mathbf{x}|f_\theta(\mathbf{z}), \sigma^2 \mathbb{I}) p(\mathbf{z}) \mathrm{d}\mathbf{z} = \mathbb{E}_{\mathbf{z}\sim p} N(\mathbf{x}|f_\theta(\mathbf{z}), \sigma^2 \mathbb{I})$$
(but any density that can be computed and is differentiable with respect to the parameters $\theta$ works.)
<li class=fragment data-fragment-index=2 /> Unfortunately, many samples $\mathbf{z}$ are required to estimate $p(\mathbf{x})$ this way. VAEs solve this problem by restricting the integral to values of $\mathbf{z}$ that are likely to have produced $\mathbf{x}$, i.e. by sampling from a density $q(\mathbf{z}|\mathbf{x})$:
$$ p(\mathbf{x}) \approx \mathbb{E}_{\mathbf{z}\sim q} p(\mathbf{x}|\mathbf{z})$$
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Variational Lower Bound </h2>
<ul>
<li /> What is the relationship between $\mathbb{E}_{\mathbf{z}\sim q} p(\mathbf{x}|\mathbf{z})$ and $p(\mathbf{x})$?
<li class=fragment data-fragment-index=2 /> The following relation, known as the variational lower bound, is the cornerstone of variational Bayes:
$$
\log p(\mathbf{x}) \ge \mathbb{E}_{\mathbf{z}\sim q} \log p(\mathbf{x}|\mathbf{z}) - KL(q(\mathbf{z}|\mathbf{x})\,\|\,p(\mathbf{z}))
$$
<li class=fragment data-fragment-index=2 /> By maximizing the right-hand side, we push up a lower bound on $\log p(\mathbf{x})$ (a short derivation follows below)
</ul>
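A standard way to derive this bound, via Jensen's inequality and the definitions above:
$$
\log p(\mathbf{x}) = \log \mathbb{E}_{\mathbf{z}\sim q} \left[ \frac{p(\mathbf{x}|\mathbf{z})\, p(\mathbf{z})}{q(\mathbf{z}|\mathbf{x})} \right] \ge \mathbb{E}_{\mathbf{z}\sim q} \log \frac{p(\mathbf{x}|\mathbf{z})\, p(\mathbf{z})}{q(\mathbf{z}|\mathbf{x})} = \mathbb{E}_{\mathbf{z}\sim q} \log p(\mathbf{x}|\mathbf{z}) - KL(q(\mathbf{z}|\mathbf{x})\,\|\,p(\mathbf{z}))
$$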
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Optimizing the Variational Lower Bound </h2>
$$
\text{ Variational Lower Bound: } \mathbb{E}_{\mathbf{z}\sim q} \log p(\mathbf{x}|\mathbf{z}) - KL(q(\mathbf{z}|\mathbf{x})\,\|\,p(\mathbf{z}))
$$
<ul>
<li /> Right term: we need to choose $q(\mathbf{z}|\mathbf{x})$. Typically, it is a Gaussian distribution $N(\mathbf{z}| \mu(\mathbf{x},\theta), \Sigma(\mathbf{x},\theta))$, where $\mu$ and $\Sigma$ are implemented using neural networks. With this choice, the KL term can be computed in closed form.
<li /> Left term: we could evaluate it by drawing many samples $\mathbf{z}$ and averaging, but this is expensive. The solution is to take a "single-sample average".
<li /> We now have the per-sample objective:
$$
\log p(\mathbf{x}|\mathbf{z}) - KL(q(\mathbf{z}|\mathbf{x})\,\|\,p(\mathbf{z}))
$$
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Reparametrization Trick</h2>
$$
\log p(\mathbf{x}|\mathbf{z}) - KL(q(\mathbf{z}|\mathbf{x})\,\|\,p(\mathbf{z}))
$$
<ul>
<li /> But there is a problem: we also need to optimize $q$, which was used to <em>sample</em> $\mathbf{z}$, and gradients cannot flow through a sampling operation. The trick is to sample $\mathbf{z}$ as follows:
$$ \mathbf{z} = \mu(\mathbf{x}) + \Sigma^{\frac12}(\mathbf{x})\odot \omega $$
where $\omega \sim N(0,\mathbb{I})$. This trick only works for certain distributions! A sketch follows below.
</ul>
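A minimal sketch of the trick, parameterizing the diagonal covariance by its log (a common convention):
<pre><code class="Python" data-trim data-noescape>
import torch

def reparameterize(mu, logvar):
    # z = mu + Sigma^{1/2} * omega, with omega ~ N(0, I)
    std = torch.exp(0.5 * logvar)  # diagonal Sigma^{1/2}
    omega = torch.randn_like(std)
    return mu + std * omega        # differentiable w.r.t. mu and logvar
</code></pre>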
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Variational AE (VAE) Summary </h2>
<ul>
<li /> Variational Autoencoders were proposed as approximate but fast algorithms for maximizing $p(\mathbf{x})$. Together with generative adversarial networks, they are the current state of the art for generative modeling.
<li /> As in AEs, VAEs consist of an encoder and a decoder function, both of which are neural networks.
<li /> The code $\mathbf{z}$ is a random variable.
<ul>
<li /> Encoder $q_\phi(\mathbf{z}|\mathbf{x})$ is typically a Gaussian probability density (dashed).
<li /> Decoder $p_\theta(\mathbf{x}|\mathbf{z})$ (solid) is a likelihood function of choice (Bernoulli, multinomial, Gaussian, etc.)
<img src="images/vae_graph.png" />
</ul>
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Variational AE (VAE) Architecture </h2>
<ul>
<li /> $p_\theta(\mathbf{x}|\mathbf{z})$ is a neural network taking inputs $\mathbf{z}$ and producing an image
<li /> $q_\phi(\mathbf{z}|\mathbf{x})$ is a neural network producing $\mu(\mathbf{x}) $ and $\mathbf{\Sigma}(\mathbf{x})$
<li /> $\mathbf{z}$ is created using the reparametrization trick $\mathbf{z} = \mu(\mathbf{x}) + \Sigma^{\frac12}(\mathbf{x})\odot \omega $
<img src="https://miro.medium.com/max/3374/1*22cSCfmktNIwH5m__u2ffA.png" />
</ul>
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Variational AE (VAE) Implementation </h2>
<ul>
<li /> The objective to maximize is: $$ \log p(\mathbf{x}|\mathbf{z}) - KL(q(\mathbf{z}|\mathbf{x})\,\|\,p(\mathbf{z})) $$
<ul>
<li /> $ KL(q(\mathbf{z}|\mathbf{x})\,\|\,p(\mathbf{z})) = KL(N(\mathbf{z}| \mu(\mathbf{x},\theta), \Sigma(\mathbf{x},\theta))\,\|\,N(0,\mathbb{I}))$. The KL divergence between two Gaussians can be computed analytically.
<li /> $\log p(\mathbf{x}|\mathbf{z})$, i.e. what is the likelihood of the true data $\mathbf{x}$ given $\mathbf{z}$? This depends on the chosen likelihood function $p(\mathbf{x}|\mathbf{z})$:
<ul>
<li />Binary data $\sim$ Bernoulli distribution: binary cross-entropy loss.
<li />Categorical data $\sim$ multinomial distribution: cross-entropy loss.
<li />Real-valued data $\sim$ Gaussian density: mean-squared error loss.
</ul>
(a loss sketch for the binary case follows below)
</ul>
</ul>
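A sketch of the resulting loss for binary data, assuming a diagonal Gaussian encoder that outputs mu and logvar:
<pre><code class="Python" data-trim data-noescape>
import torch
import torch.nn.functional as F

def vae_loss(x_hat, x, mu, logvar):
    # -log p(x|z) for a Bernoulli likelihood: binary cross-entropy
    # (x_hat must lie in [0,1], e.g. after a sigmoid)
    recon = F.binary_cross_entropy(x_hat, x, reduction='sum')
    # analytic KL( N(mu, Sigma) || N(0, I) ) for a diagonal Gaussian
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon + kl  # minimizing this maximizes the variational lower bound
</code></pre>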
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/open?id=1Yh1qtakIrsukG_CYr86obSQiLhD3Gw9K)
</textarea></section>
<section data-markdown><textarea data-template>
<h2> Sources </h2>
<ul>
<li /> Autoencoders: <a href=>Deep Learning, Goodfellow et al. 2016</a>
<li /> Variational Autoencoders: <a href=>Deep Learning, Goodfellow et al. 2016</a>
<li /> Variational Autoencoders: <a href=>Tutorial on Variational Autoencoders, Doersch 2016</a>
</ul>
</textarea></section>
</div>
</div>
<script src="dist/reveal.js"></script>
<script src="plugin/zoom/zoom.js"></script>
<script src="plugin/notes/notes.js"></script>
<script src="plugin/math/math.js"></script>
<script src="plugin/search/search.js"></script>
<script src="plugin/markdown/markdown.js"></script>
<script src="plugin/highlight/highlight.js"></script>
<script src="reveal.js-plugins/menu/menu.js"></script>
<script src="reveal.js-plugins/audio-slideshow/plugin.js"></script>
<script src="reveal.js-plugins/audio-slideshow/recorder.js"></script>
<script src="reveal.js-plugins/audio-slideshow/RecordRTC.js"></script>
<script src="reveal.js-plugins/chalkboard/plugin.js"></script>
<script src="reveal.js-plugins/external/external.js"></script>
<script src="reveal.js-plugins/externalcode/externalcode.js"></script>
<script src="reveal.js-plugins/appearance/appearance.js"></script>
<script>
Reveal.configure({ pdfMaxPagesPerSlide: 1, hash: true, slideNumber: true})
// Also available as an ES module, see:
// https://revealjs.com/initialization/
Reveal.initialize({
mouseWheel: false,
width: 1280,
height: 720,
margin: 0.0,
navigationMode: 'grid',
transition: 'fade',
controls: true,
progress: true,
center: true,
hash: true,
// Learn about plugins: https://revealjs.com/plugins/
plugins: [Appearance, RevealZoom, RevealSearch, RevealMarkdown, RevealHighlight, RevealAudioRecorder, RevealAudioSlideshow, RevealMath, RevealExternal, Externalcode, RevealMenu],
menu: { // Menu works best with font-awesome installed: sudo apt-get install fonts-font-awesome
themes: false,
transitions: false,
markers: true,
hideMissingTitles: true,
custom: [
{ title: 'Plugins', icon: '<i class="fa fa-external-link-alt"></i>', src: 'toc.html' },
{ title: 'About', icon: '<i class="fa fa-info"></i>', src: 'about.html' }
]
},
audio: {
prefix: 'audio/lecture_7/', // audio files are stored in the "audio" folder
suffix: '.ogg', // audio files have the ".ogg" ending
textToSpeechURL: null, // the URL to the text to speech converter
defaultNotes: false, // use slide notes as default for the text to speech converter
defaultText: false, // use slide text as default for the text to speech converter
advance: 300, // advance to next slide after given time in milliseconds after audio has played, use negative value to not advance
autoplay: false, // automatically start slideshow
defaultDuration: 1.0, // default duration in seconds if no audio is available
defaultAudios: true, // try to play audios with names such as audio/1.2.ogg
playerOpacity: 0.5, // opacity value of audio player if unfocused
playerStyle: 'position: fixed; bottom: 4px; left: 25%; width: 50%; height:75px; z-index: 33;', // style used for container of audio controls
startAtFragment: false, // when moving to a slide, start at the current fragment or at the start of the slide
},
// ...
keyboard: {
82: function() { Recorder.toggleRecording(); }, // press 'r' to start/stop recording
90: function() { Recorder.downloadZip(); }, // press 'z' to download zip containing audio files
84: function() { Recorder.fetchTTS(); } // press 't' to fetch TTS audio files
}
});
</script>
</body>
</html>