\documentclass[serif,xcolor=pdftex,dvipsnames,table,hyperref={bookmarks=false,breaklinks}]{beamer}
\input{../config.tex}
\settitlecard{8}{Linear Regression, Ridge, and Lasso}
\begin{document}
\maketitlepage
\section{Regression}
\subsection{Foo}
\begin{frame}[t]{Views on Machine Learning}
\iconbox{1}{}{../Figures/mitchell.jpg}{\textbf{Mitchell (1997):} ``A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P, if its performance at tasks in T, as measured by P, improves with experience E.''\\[12pt] Substitute ``training data D'' for ``experience E.''}
\end{frame}
\begin{frame}[t]{The Regression Task}
\begin{block}{Definition: The Regression Task}
Given a feature vector $\mbf{x}\in\mathbb{R}^D$, predict its corresponding output value $y\in \mathbb{R}$.
\end{block}
\pause
\center
\includegraphics[width=3in]{../Figures/polynomial-function.png}
\end{frame}
\begin{frame}[t]{Example: Stock Prices}
\center
\includegraphics[width=4in]{../Figures/apple-stock.png}
\end{frame}
\begin{frame}[t]{Example: Climate Change}
\center
\includegraphics[width=4in]{../Figures/northern-hemisphere-temperature.png}
\end{frame}
\begin{frame}[t]{Example: Weather Forecasting}
\center
\includegraphics[width=3.5in]{../Figures/weather-forecast.png}
\end{frame}
\begin{frame}[t]{The Regression Learning Problem}
\begin{block}{Definition: Regression Learning Problem}
Given a data set of example pairs $\mathcal{D}=\{(\mbf{x}_i,y_i),i=1:N\}$ where $\mbf{x}_i\in\mathbb{R}^D$ is a feature vector and $y_i\in \mathbb{R}$ is the output, learn a function $f:\mathbb{R}^D\rightarrow \mathbb{R}$ that accurately predicts $y$ for any feature vector $\mbf{x}$.
\end{block}
\end{frame}
\begin{frame}[t]{Example: Linear Regression Learning}
\center
\includegraphics[width=3.5in]{../Figures/regression-learning-example.png}
\end{frame}
\begin{frame}[t]{Example: Non-Linear Regression Learning}
\center
\includegraphics[width=3in]{../Figures/nonlinear-regression.jpg}
\end{frame}
\begin{frame}[t]{Error Measures: MSE}
\begin{block}{Definition: Mean Squared Error}
Given a data set of example pairs $\mathcal{D}=\{(\mbf{x}_i,y_i),i=1:N\}$ and a function $f:\mathbb{R}^D\rightarrow \mathcal{Y}$, the mean squared error of $f$ on $\mathcal{D}$ is:
$$MSE(f,\mathcal{D}) = \frac{1}{N}\sum_{i=1}^N(y_i - f(\mbf{x}_i))^2$$
\end{block}
\pause
Related measures include: \\
Sum of Squared Errors: $SSE(f,\mathcal{D})=N\cdot MSE(f,\mathcal{D})$\\
Residual Sum of Squares: $RSS(f,\mathcal{D})=N\cdot MSE(f,\mathcal{D})$\\
Root Mean Squared Error: $RMSE(f,\mathcal{D})=\sqrt{MSE(f,\mathcal{D})}$
\end{frame}
\begin{frame}[t]{Error Measures: MAE}
\begin{block}{Definition: Mean Absolute Error}
Given a data set of example pairs $\mathcal{D}=\{(\mbf{x}_i,y_i),i=1:N\}$ and a function $f:\mathbb{R}^D\rightarrow \mathcal{Y}$, the mean absolute error of $f$ on $\mathcal{D}$ is:
$$MAE(f,\mathcal{D}) = \frac{1}{N}\sum_{i=1}^N|y_i - f(\mbf{x}_i)|$$
\end{block}
\end{frame}
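\begin{frame}[fragile,t]{Error Measures: Code Sketch}
A minimal NumPy sketch of these error measures (function names are illustrative):
\begin{verbatim}
import numpy as np

def mse(y, y_hat):
    # Mean squared error: average squared residual
    return np.mean((y - y_hat) ** 2)

def mae(y, y_hat):
    # Mean absolute error: average absolute residual
    return np.mean(np.abs(y - y_hat))

# SSE/RSS and RMSE follow directly:
# sse = len(y) * mse(y, y_hat)
# rmse = np.sqrt(mse(y, y_hat))
\end{verbatim}
\end{frame}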
\section{Linear regression}
\subsection{foo}
\begin{frame}[t]{Linear Regression}
Linear regression is a parametric regression method that assumes the relationship between $y$ and $\mbf{x}$ is a linear function with parameters $\mbf{w}=[w_1,...,w_D]^T$ and $b$.
\pause
\begin{block}{Linear Regression Function}
$$f_{Lin}(\mbf{x}) = \left(\sum_{d=1}^D w_d x_d\right) + b = \mbf{x}\mbf{w}+b$$
\end{block}
\pause \textbf{Question:} How can we learn the parameter values $\mbf{w}$ and $b$?
\end{frame}
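\begin{frame}[fragile,t]{Linear Regression: Prediction Sketch}
A minimal sketch of the prediction step, assuming the row-vector convention used above with \texttt{X} holding one data case per row:
\begin{verbatim}
import numpy as np

def f_lin(X, w, b):
    # Linear prediction: each row x of X maps to x w + b
    return X @ w + b
\end{verbatim}
\end{frame}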
\begin{frame}[t]{Ordinary Least Squares Linear Regression}
Ordinary least squares selects the linear regression parameters to minimize the
mean squared error (MSE) on the training data set:
\pause
$$\mbf{w}^*,b^* = \argmin_{\mbf{w},b} \frac{1}{N}\sum_{i=1}^N(y_i - \mbf{x}_i\mbf{w}-b)^2$$
\end{frame}
\begin{frame}[t]{Solving OLS For One Feature}
$$\argmin_{w,b} \frac{1}{N}\sum_{i=1}^N(y_i - wx_i-b)^2$$
\pause
\includegraphics[width=4in]{../Figures/ols_objective.png}
\end{frame}
\begin{frame}[t]{Solving OLS For One Feature}
$$\argmin_{w,b} \frac{1}{N}\sum_{i=1}^N(y_i - wx_i-b)^2$$
\pause
\begin{align*}
\deriv{}{w}\frac{1}{N}\sum_{i=1}^N(y_i - wx_i-b)^2&=0\\
\deriv{}{b}\frac{1}{N}\sum_{i=1}^N(y_i - wx_i-b)^2&=0
\end{align*}
\end{frame}
\begin{frame}[t]{Solving OLS For One Feature}
\begin{align*}
-\frac{2}{N}\sum_{i=1}^N(y_i - wx_i-b)x_i&=0\\
-\frac{2}{N}\sum_{i=1}^N(y_i - wx_i-b)&=0
\end{align*}
\pause%
\begin{align*}
w\left(\sum_{i=1}^Nx_i^2\right) + b\left(\sum_{i=1}^Nx_i\right) &=\sum_{i=1}^N(y_ix_i)\\
w\left(\sum_{i=1}^Nx_i\right)+b(N)&= \sum_{i=1}^N(y_i)
\end{align*}
\end{frame}
\begin{frame}[t]{Solving OLS For One Feature}
\begin{align*}
\begin{bmatrix}
\sum_{i=1}^Nx_i^2 & \sum_{i=1}^Nx_i \\
\sum_{i=1}^Nx_i & N\\
\end{bmatrix}
\begin{bmatrix}
w\\
b
\end{bmatrix}
=
\begin{bmatrix}
\sum_{i=1}^Ny_ix_i \\
\sum_{i=1}^Ny_i
\end{bmatrix}
\end{align*}
%
\pause
\begin{align*}
\begin{bmatrix}
w\\
b
\end{bmatrix}
=
\begin{bmatrix}
\sum_{i=1}^Nx_i^2 & \sum_{i=1}^Nx_i \\
\sum_{i=1}^Nx_i & N\\
\end{bmatrix}^{-1}
\begin{bmatrix}
\sum_{i=1}^Ny_ix_i \\
\sum_{i=1}^Ny_i
\end{bmatrix}
\end{align*}
\end{frame}
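\begin{frame}[fragile,t]{Solving OLS For One Feature: Code Sketch}
A minimal NumPy sketch that solves the $2\times 2$ normal equations above for $w$ and $b$ (illustrative only):
\begin{verbatim}
import numpy as np

def ols_1d(x, y):
    # Build the 2x2 system from the slide and solve for (w, b)
    A = np.array([[np.sum(x**2), np.sum(x)],
                  [np.sum(x),    len(x)]])
    rhs = np.array([np.sum(x * y), np.sum(y)])
    w, b = np.linalg.solve(A, rhs)
    return w, b
\end{verbatim}
\end{frame}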
\begin{frame}[t]{General OLS Solution}
Assume that $\mbf{X}$ is a data matrix with one data case $\mbf{x}_i\in \mathbb{R}^D$ per row, and $\mbf{Y}$ is
a column vector containing the corresponding outputs. The general OLS solution is:
\pause
\begin{align*}
\mbf{w}^* &= \argmin_{\mbf{w}} \frac{1}{N}\sum_{i=1}^N(y_i - \mbf{x}_i\mbf{w})^2\\
&= \argmin_{\mbf{w}} \frac{1}{N}(\mbf{Y} - \mbf{X}\mbf{w})^T(\mbf{Y} - \mbf{X}\mbf{w})\\
0 &= \deriv{}{\mbf{w}} \frac{1}{N}(\mbf{Y} - \mbf{X}\mbf{w})^T(\mbf{Y} - \mbf{X}\mbf{w})\\
0 &= -\frac{2}{N}\mbf{X}^T(\mbf{Y} - \mbf{X}\mbf{w})\\
0 &= \mbf{X}^T(\mbf{Y} - \mbf{X}\mbf{w}) \\
\mbf{X}^T\mbf{X}\mbf{w} &= \mbf{X}^T\mbf{Y}\\
\mbf{w}^* & = (\mbf{X}^T\mbf{X})^{-1}\mbf{X}^T\mbf{Y}
\end{align*}
\end{frame}
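\begin{frame}[fragile,t]{General OLS Solution: Code Sketch}
A minimal NumPy sketch of the closed-form solution. Solving the normal equations with \texttt{np.linalg.solve} avoids forming the inverse explicitly; it assumes $\mbf{X}^T\mbf{X}$ is invertible (at least $D$ linearly independent data cases):
\begin{verbatim}
import numpy as np

def ols_fit(X, Y):
    # Solve (X^T X) w = X^T Y; fails if X^T X is singular
    return np.linalg.solve(X.T @ X, X.T @ Y)
\end{verbatim}
\end{frame}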
\begin{frame}[t]{Connection to Probabilistic Models}
This same solution can be derived as the maximum conditional
likelihood estimate for the parameters of a conditional
Normal model, where $\sigma^2$ is the noise variance.
\pause
\begin{align*}
P(y|\mbf{x}) = \mathcal{N}(y;\mbf{x}\mbf{w}, \sigma^2)
= \frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(-\frac{1}{2\sigma^2}(y-\mbf{x}\mbf{w})^2\right)
\end{align*}
\pause This view shows that OLS assumes the residuals are Normally distributed.
This assumption is violated in many real-world processes that produce significant
outliers or heavy-tailed noise.
\end{frame}
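\begin{frame}[t]{From Likelihood to MSE}
To see the equivalence, write out the conditional log likelihood. The first term does not depend on $\mbf{w}$, so maximizing the likelihood over $\mbf{w}$ is the same as minimizing the MSE:
\begin{align*}
\sum_{i=1}^N \log P(y_i|\mbf{x}_i)
&= -\frac{N}{2}\log(2\pi\sigma^2) - \frac{1}{2\sigma^2}\sum_{i=1}^N (y_i - \mbf{x}_i\mbf{w})^2\\
\Rightarrow\quad
\argmin_{\mbf{w}}\; -\sum_{i=1}^N \log P(y_i|\mbf{x}_i)
&= \argmin_{\mbf{w}} \frac{1}{N}\sum_{i=1}^N (y_i - \mbf{x}_i\mbf{w})^2
\end{align*}
\end{frame}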
\begin{frame}[t]{Strengths and Limitations of OLS}
\begin{itemize}
\item Need at least $D$ data cases to learn a model with a $D$
dimensional feature vector. \pause Otherwise the inverse of
$\mbf{X}^T\mbf{X}$ is not defined.
\pause\item Very sensitive to noise and outliers due to MSE objective
function/Normally distributed residuals assumption.
\pause\item Sensitive to co-linear features ($x_i \approx ax_j +b$). \pause In that case, the inverse of
$\mbf{X}^T\mbf{X}$ is not numerically stable.
\pause\item High bias (assumes linear relationships between the features and target).
\pause\item Computation is cubic in data dimension $D$.
\pause\item Variance is generally low unless there are outliers.
\end{itemize}
\end{frame}
\section{Regularization}
\subsection{foo}
\begin{frame}[t]{Regularized Linear Regression}
Just as in classification, regression models require capacity control to avoid overfitting
and numerical stability problems in high dimensions. This is accomplished by
regularizing the weight parameters during learning.
\pause
\begin{align*}
\mbf{w}^* &= \argmin_{\mbf{w}} \frac{1}{N}\sum_{i=1}^N(y_i - \mbf{x}_i\mbf{w})^2 + \lambda ||\mbf{w}||\\
&= \argmin_{\mbf{w}} \frac{1}{N}\sum_{i=1}^N(y_i - \mbf{x}_i\mbf{w})^2
\mbox{ subject to } ||\mbf{w}|| \leq c
\end{align*}
\end{frame}
\begin{frame}[t]{Ridge Regression}
Ridge regression is the name given to regularized least squares when the weights are penalized using the square
of the $\ell_2$ norm $||\mbf{w}||_2^2 = \mbf{w}^T\mbf{w} = \sum_{d=1}^D w_d^2 $:
\pause
\begin{align*}
\mbf{w}^* &= \argmin_{\mbf{w}} \frac{1}{N}\sum_{i=1}^N(y_i - \mbf{x}_i\mbf{w})^2 + \lambda ||\mbf{w}||_2^2\\
&= \argmin_{\mbf{w}} \frac{1}{N}\sum_{i=1}^N(y_i - \mbf{x}_i\mbf{w})^2
\mbox{ subject to } ||\mbf{w}||_2^2 \leq c
\end{align*}
\pause
In this case, it is easy to show that the optimal regularized weights are:
$$\mbf{w}^* = (\mbf{X}^T\mbf{X}+\lambda I)^{-1}\mbf{X}^T\mbf{Y}$$
\end{frame}
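\begin{frame}[fragile,t]{Ridge Regression: Code Sketch}
A minimal NumPy sketch of the closed-form ridge solution (illustrative only; \texttt{lam} is the regularization strength $\lambda$):
\begin{verbatim}
import numpy as np

def ridge_fit(X, Y, lam):
    # (X^T X + lambda I)^{-1} X^T Y, computed via a linear solve.
    # The lambda*I term keeps the system invertible and well
    # conditioned even when N < D or features are co-linear.
    D = X.shape[1]
    return np.linalg.solve(X.T @ X + lam * np.eye(D), X.T @ Y)
\end{verbatim}
\end{frame}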
\begin{frame}[t]{The Lasso}
The Lasso is the name given to regularized least squares when the weights are penalized using the $\ell_1$ norm $||\mbf{w}||_1 = \sum_{d=1}^D |w_d|$:
\pause
\begin{align*}
\mbf{w}^* &= \argmin_{\mbf{w}} \frac{1}{N}\sum_{i=1}^N(y_i - \mbf{x}_i\mbf{w})^2 + \lambda ||\mbf{w}||_1\\
&= \argmin_{\mbf{w}} \frac{1}{N}\sum_{i=1}^N(y_i - \mbf{x}_i\mbf{w})^2
\mbox{ subject to } ||\mbf{w}||_1 \leq c
\end{align*}
\pause
The Lasso problem is a quadratic programming problem. However, it can be solved efficiently for all values of
$\lambda$ using an algorithm called \textit{least angle regression} (LARS). The advantage of the Lasso is that
it simultaneously performs regularization and feature selection.
\end{frame}
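\begin{frame}[fragile,t]{The Lasso: Code Sketch}
A small illustration of the feature-selection effect, assuming scikit-learn is available; \texttt{LassoLars} implements the LARS algorithm mentioned above, and its \texttt{alpha} plays the role of $\lambda$ (up to how the objective is scaled):
\begin{verbatim}
import numpy as np
from sklearn.linear_model import LassoLars

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 10))
w_true = np.array([2.0, -3.0] + [0.0] * 8)  # 2 relevant features
y = X @ w_true + 0.1 * rng.normal(size=100)

model = LassoLars(alpha=0.05).fit(X, y)
print(model.coef_)  # most weights are driven exactly to zero
\end{verbatim}
\end{frame}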
\begin{frame}[t]{Lasso vs Ridge}
\center
\includegraphics[width=4in]{../Figures/ridge_vs_lasso.png}
\end{frame}
\begin{frame}[t]{Strengths and Limitations of Ridge and Lasso}
\begin{itemize}
\item Solves the problem of needing at least $D$ data cases to learn a model with a $D$
dimensional feature vector.
\pause\item Solves the problem of co-linear features ($x_i \approx ax_j +b$).
\pause\item MSE objective function still sensitive to noise and outliers, but regularization
can reduce the possibility of very large weights overfitting to outliers.
\pause\item Does not solve the bias problem.
\pause\item Computation for ridge is still cubic in data dimension $D$, and one must now also
determine the regularization parameter $\lambda$. Computation for LARS is similar.
\end{itemize}
\end{frame}
\section{Basis Expansion}
\subsection{foo}
\begin{frame}[t]{Basis Expansion}
Just as with linear classification models, linear regression models can be extended to
capture non-linear relationships using basis function expansions. The polynomial basis is
often used for this purpose, although it is not sensible for forecasting, since polynomials diverge outside the range of the training inputs.
\pause
\center
\includegraphics[width=4in]{../Figures/polynomial_regression.png}
\end{frame}
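\begin{frame}[fragile,t]{Basis Expansion: Code Sketch}
A minimal sketch of a polynomial basis expansion for a one-dimensional input; the expanded matrix can then be fed to any of the linear solvers above (e.g.\ the \texttt{ridge\_fit} sketch):
\begin{verbatim}
import numpy as np

def poly_basis(x, degree):
    # Map a 1-D input to the features [x, x^2, ..., x^degree]
    return np.column_stack([x ** d for d in range(1, degree + 1)])

# Example: Phi = poly_basis(x, 5); w = ridge_fit(Phi, y, lam=1e-3)
\end{verbatim}
\end{frame}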
\begin{frame}[t]{Strengths and Limitations of Basis Expansion}
\begin{itemize}
\item Does solve the bias problem.
\pause\item MSE objective function still sensitive to noise and outliers. Basis expansions
can easily overfit, so capacity must still be controlled (e.g., with ridge or lasso regularization).
\pause\item Computation is cubic in the dimensionality of the basis function
expansion. Can be costly.
\end{itemize}
\end{frame}
\end{document}