%%%% 20.1: Statistical Learning (4 exercises, 2 labelled)
%%%% ====================================================
\begin{exercise}[bayes-candy-exercise]
The data used for \figref{bayes-candy-figure} on \pgref{bayes-candy-figure} can be viewed as being
generated by \(h_5\). For each of the other four hypotheses, generate a
data set of length 100 and plot the corresponding graphs for
\(P(h_i\given \datum_1,\ldots,\datum_N)\) and \(P(\Datum_{N+1}\eq \J{lime}\given \datum_1,\ldots,\datum_N)\).
Comment on your results.
\end{exercise}
% id=20.0 section=20.1
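A minimal Python sketch of one way to carry out \exref{bayes-candy-exercise}; it is a starting
point rather than a definitive implementation, and it assumes the hypothesis priors
\((0.1,0.2,0.4,0.2,0.1)\) and lime proportions \((0,0.25,0.5,0.75,1)\) for \(h_1,\ldots,h_5\)
used for \figref{bayes-candy-figure}. The prediction
\(P(\Datum_{N+1}\eq \J{lime}\given \datum_1,\ldots,\datum_N)\) is the posterior-weighted average of
the per-hypothesis lime probabilities.
\begin{verbatim}
# Sketch: posterior over candy hypotheses and lime prediction.
# Assumed priors and lime proportions for h_1..h_5 (as in the chapter).
import numpy as np
import matplotlib.pyplot as plt

priors = np.array([0.1, 0.2, 0.4, 0.2, 0.1])
lime_prob = np.array([0.0, 0.25, 0.5, 0.75, 1.0])

def run(true_h, N=100, seed=0):
    rng = np.random.default_rng(seed)
    data = rng.random(N) < lime_prob[true_h]      # True means lime
    posteriors = np.empty((N + 1, 5))
    p = priors.copy()
    posteriors[0] = p
    for i, is_lime in enumerate(data, start=1):
        p = p * np.where(is_lime, lime_prob, 1.0 - lime_prob)
        p = p / p.sum()                           # P(h_k | d_1..d_i)
        posteriors[i] = p
    pred_lime = posteriors @ lime_prob            # P(D_{i+1}=lime | d_1..d_i)
    return posteriors, pred_lime

for h in range(4):              # h_1..h_4; h_5 is the case in the figure
    _, pred = run(h)
    plt.plot(pred, label=f"data generated by h_{h + 1}")
plt.xlabel("Number of observations N")
plt.ylabel("P(next candy is lime)")
plt.legend()
plt.show()
\end{verbatim}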
\begin{iexercise}
Repeat \exref{bayes-candy-exercise}, this time plotting the values of
\(P(\Datum_{N+1}\eq \J{lime}\given \hmap)\) and \(P(\Datum_{N+1}\eq \J{lime}\given \hml)\).
\end{iexercise}
% id=20.1 section=20.1
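A corresponding sketch for the preceding variant, again only illustrative: the MAP prediction uses
the lime probability of the hypothesis with the largest posterior, and the ML prediction uses that
of the hypothesis with the largest likelihood. The \texttt{data} argument is a Boolean sequence
(\texttt{True} meaning lime), as produced by the sketch above.
\begin{verbatim}
# MAP and ML predictions after each observation (candy example).
import numpy as np

priors = np.array([0.1, 0.2, 0.4, 0.2, 0.1])     # assumed, as above
lime_prob = np.array([0.0, 0.25, 0.5, 0.75, 1.0])

def map_ml_predictions(data):
    log_lik = np.zeros(5)            # log P(d_1..d_i | h_k)
    p = priors.copy()
    map_pred, ml_pred = [], []
    for is_lime in data:
        lik = np.where(is_lime, lime_prob, 1.0 - lime_prob)
        with np.errstate(divide="ignore"):
            log_lik += np.log(lik)   # -inf marks ruled-out hypotheses
        p = p * lik
        p = p / p.sum()
        map_pred.append(lime_prob[np.argmax(p)])        # P(lime | h_MAP)
        ml_pred.append(lime_prob[np.argmax(log_lik)])   # P(lime | h_ML)
    return map_pred, ml_pred
\end{verbatim}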
\begin{exercise}[candy-trade-exercise]
Suppose that Ann's utilities for cherry and lime candies are \(c_A\) and
\(\ell_A\), whereas Bob's utilities are \(c_B\) and \(\ell_B\).
(But once Ann has unwrapped a piece of candy, Bob won't buy it.)
Presumably, if Bob likes lime candies much more than Ann does,
it would be wise for Ann to~sell~her~bag~of candies once she is sufficiently
sure of its lime content. On the other hand, if Ann unwraps too many
candies in the process, the bag will be worth less. Discuss the problem of determining the
optimal point at which to sell the bag. Determine the expected
utility of the optimal procedure, given the prior distribution from \secref{statistical-learning-section}.
\end{exercise}
% id=20.2 section=20.1
\begin{exercise}
Two statisticians go to the doctor and are both given the same
prognosis: a 40\% chance that the problem is the deadly disease \(A\),
and a 60\% chance that it is the fatal disease \(B\). Fortunately, there are
anti-\(A\) and anti-\(B\) drugs that are inexpensive, 100\% effective, and
free of side-effects. The statisticians have the choice of taking one
drug, both, or neither. What will the first statistician (an avid
Bayesian) do? How about the second statistician, who always uses the
maximum likelihood hypothesis?

The doctor does some research and discovers that disease \(B\) actually
comes in two versions, dextro-\(B\) and levo-\(B\), which are equally
likely and equally treatable by the anti-\(B\) drug. Now that there are
three hypotheses, what will the two statisticians do?
\end{exercise}
% id=20.3 section=20.1
%%%% 20.2: Learning with Complete Data (6 exercises, 5 labelled)
%%%% ===========================================================
\begin{exercise}[BNB-exercise]
Explain how to apply the boosting method of \chapref{concept-learning-chapter} to
naive Bayes learning. Test the performance of the resulting algorithm on the restaurant learning problem.
\end{exercise}
% id=20.4 section=20.2.2
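One possible shape of a solution to \exref{BNB-exercise}, sketched in Python under simplifying
assumptions (Boolean attributes and a standard AdaBoost weight update): the only change boosting
requires of naive Bayes is that the maximum-likelihood estimates be computed from \emph{weighted}
counts. The restaurant attributes are multi-valued, so the conditional estimates below would need
one weighted count per attribute value.
\begin{verbatim}
# Sketch: AdaBoost with a naive Bayes weak learner (Boolean attributes).
import numpy as np

def weighted_nb_fit(X, y, w, eps=1e-9):
    """ML naive Bayes from weighted counts.  X: (N,k) 0/1, y: (N,) 0/1."""
    w = w / w.sum()
    prior1 = np.clip(w[y == 1].sum(), eps, 1 - eps)
    def cond(mask):                 # P(X_i=1 | class) from weighted counts
        wm, Xm = w[mask], X[mask]
        return (wm @ Xm + eps) / (wm.sum() + 2 * eps)
    return prior1, cond(y == 1), cond(y == 0)

def nb_predict(model, X):
    prior1, a, b = model
    s1 = np.log(prior1) + X @ np.log(a) + (1 - X) @ np.log(1 - a)
    s0 = np.log(1 - prior1) + X @ np.log(b) + (1 - X) @ np.log(1 - b)
    return (s1 > s0).astype(int)

def adaboost_nb(X, y, rounds=10):
    N = len(y)
    w = np.full(N, 1.0 / N)
    ensemble = []
    for _ in range(rounds):
        model = weighted_nb_fit(X, y, w)
        pred = nb_predict(model, X)
        err = w[pred != y].sum()
        if err >= 0.5:              # weak learner no better than chance
            break
        err = max(err, 1e-12)
        z = 0.5 * np.log((1 - err) / err)      # hypothesis weight
        ensemble.append((model, z))
        w = w * np.exp(np.where(pred == y, -z, z))
        w = w / w.sum()
    return ensemble

def ensemble_predict(ensemble, X):
    votes = np.zeros(len(X))
    for model, z in ensemble:
        votes += z * (2 * nb_predict(model, X) - 1)
    return (votes > 0).astype(int)
\end{verbatim}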
\begin{exercise}[linear-regression-exercise]
Consider \(N\) data points \((x_j,y_j)\), where the \(y_j\)s are generated
from the \(x_j\)s according to the linear Gaussian model in
\eqref{linear-gaussian-likelihood-equation}. Find the values of
\(\theta_1\), \(\theta_2\), and \(\sigma\) that maximize the
conditional log likelihood of the data.
\end{exercise}
% id=20.5 section=20.2.3
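For reference, assuming the linear Gaussian model has the usual form
\(P(y\given x) = \frac{1}{\sigma\sqrt{2\pi}}\,e^{-(y - (\theta_1 x + \theta_2))^2/(2\sigma^2)}\),
the quantity to be maximized in \exref{linear-regression-exercise} is
\[
L \;=\; \sum_{j=1}^{N} \log P(y_j\given x_j)
  \;=\; -N\log\bigl(\sigma\sqrt{2\pi}\bigr)
  \;-\; \sum_{j=1}^{N}\frac{\bigl(y_j - (\theta_1 x_j + \theta_2)\bigr)^2}{2\sigma^2}\ .
\]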
\begin{exercise}[noisy-OR-ML-exercise]
Consider the noisy-OR model for fever described in \secref{canonical-distribution-section}.
Explain how to apply maximum-likelihood learning to fit the parameters of such a model to
a set of complete data. ({\em Hint}: use the chain rule for partial derivatives.)
\end{exercise}
% id=20.6 section=20.2.1
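For reference (this restates the model rather than solving the exercise): if \(q_j\) denotes the
probability that the \(j\)th cause alone fails to produce a fever, the noisy-OR model gives
\[
P(\textit{fever}\given u_1,\ldots,u_k) \;=\; 1 - \prod_{j:\,u_j = \textit{true}} q_j ,
\]
and maximum-likelihood learning chooses the \(q_j\) to maximize the sum, over the complete
examples, of the log probability the model assigns to each example.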
\begin{exercise}[beta-integration-exercise]
This exercise investigates properties of the Beta distribution defined
in \eqref{beta-equation}.
\begin{enumerate}
\item By integrating over the range \([0,1]\), show that the normalization
constant for the distribution \(\BetaDist[a,b]\) is given by
\(\alpha = \Gamma(a+b)/(\Gamma(a)\Gamma(b))\), where \(\Gamma(x)\) is the \newterm{Gamma function}\ntindex{Gamma function}, defined by
\(\Gamma(x+1)\eq x\cdot\Gamma(x)\) and \(\Gamma(1)\eq 1\). (For integer
\(x\), \(\Gamma(x+1)\eq x!\).)
\item Show that the mean is \(a/(a+b)\).
\item Find the mode(s) (the most likely value(s) of \(\theta\)).
\item Describe the distribution \(\BetaDist[\epsilon,\epsilon]\) for
very small \(\epsilon\). What happens as such a distribution is updated?
\end{enumerate}
\end{exercise}
% id=20.7 section=20.2.4
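For part 4, a small Python sketch may help build intuition before deriving the answer analytically;
it uses SciPy's Beta density and the conjugate update \(\BetaDist[a,b]\rightarrow\BetaDist[a+1,b]\)
after a positive observation, as discussed in the chapter.
\begin{verbatim}
# Sketch for part 4: visualize Beta(eps, eps) and a few conjugate updates.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta

eps = 0.01
theta = np.linspace(0.001, 0.999, 500)
cases = [(eps, eps, "prior"),
         (eps + 3, eps, "after 3 positive observations"),
         (eps + 3, eps + 1, "after 3 positives and 1 negative")]
for a, b, label in cases:
    plt.plot(theta, beta.pdf(theta, a, b), label=label)
plt.xlabel("theta")
plt.ylabel("density")
plt.legend()
plt.show()
\end{verbatim}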
\begin{exercise}[ML-parents-exercise]
Consider an arbitrary Bayesian network, a complete data set for that network,
and the likelihood for the data set according to the network.
Give a simple proof that the likelihood of the data cannot decrease if we
add a new link to the network and recompute the maximum-likelihood parameter values.
\end{exercise}
% id=20.8 section=20.2.5
\begin{uexercise} %fe-s02
Consider a single Boolean random variable $Y$ (the
``classification''). Let the prior probability $P(Y\eq true)$ be $\pi$.
Let's try to find $\pi$, given a training set $D\eq (y_1,\ldots,y_N)$ with
$N$ independent samples of $Y$. Furthermore, suppose $p$ of the $N$
are positive and $n$ of the $N$ are negative.
\begin{enumerate}
\item Write down an
expression for the likelihood of $D$ (i.e., the probability of seeing this particular sequence
of examples, given a fixed value of $\pi$) in terms of $\pi$, $p$, and $n$.
\item By differentiating the log likelihood $L$, find the value of $\pi$ that maximizes the likelihood.
\item Now suppose we add $k$ Boolean random variables $X_1, X_2,\ldots,X_k$ (the ``attributes'')
that describe each sample, and assume that the attributes are conditionally
independent of each other given the goal $Y$. Draw the Bayes net corresponding
to this assumption.
\item Write down the likelihood for the data including the attributes, using the following additional notation:
\begin{itemize}
\item $\alpha_i$ is $P(X_i\eq true \given Y\eq true)$.
\item $\beta_i$ is $P(X_i\eq true \given Y\eq false)$.
\item $p_i^+$ is the count of samples for which $X_i\eq true$ and $Y\eq true$.
\item $n_i^+$ is the count of samples for which $X_i\eq false$ and $Y\eq true$.
\item $p_i^-$ is the count of samples for which $X_i\eq true$ and $Y\eq false$.
\item $n_i^-$ is the count of samples for which $X_i\eq false$ and $Y\eq false$.
\end{itemize}
[{\em Hint}: consider first the probability of seeing a single example
with specified values for $X_1, X_2,\ldots,X_k$ and $Y$.]
\item By differentiating the log likelihood $L$, find the values of $\alpha_i$ and $\beta_i$
(in terms of the various counts) that maximize the likelihood and say in words what these values represent.
\item Let $k\eq 2$, and consider a data set containing all four possible examples of the {\sc xor} function.
Compute the maximum likelihood estimates of $\pi$, $\alpha_1$, $\alpha_2$, $\beta_1$, and $\beta_2$.
\item Given these estimates of $\pi$, $\alpha_1$, $\alpha_2$, $\beta_1$, and $\beta_2$,
what are the posterior probabilities $P(Y\eq true \given x_1,x_2)$ for each example?
%\item Comment on the connection between this result and the capabilities of a single-layer perceptron.
\end{enumerate}
\end{uexercise}
% id=extras-28-oct.7 section=20.2.1
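A Python sketch that may help in checking parts 5--7 of the exercise above empirically: it fits
relative-frequency estimates computed from the counts defined in the exercise (these can be
compared with whatever closed form is derived in part 5) and evaluates the naive Bayes posterior
on the four {\sc xor} examples.
\begin{verbatim}
# Sketch: relative-frequency estimates and naive Bayes posteriors (XOR).
import numpy as np

def fit_counts(X, Y):
    """X: (N,k) 0/1 attribute matrix, Y: (N,) 0/1 class labels."""
    N = len(Y)
    p = Y.sum()                               # positive examples
    pi = p / N
    alpha = X[Y == 1].sum(axis=0) / p         # p_i^+ / p
    beta = X[Y == 0].sum(axis=0) / (N - p)    # p_i^- / n
    return pi, alpha, beta

def posterior_true(pi, alpha, beta, x):
    """P(Y=true | x_1..x_k) under the naive Bayes model."""
    lik1 = pi * np.prod(np.where(x == 1, alpha, 1 - alpha))
    lik0 = (1 - pi) * np.prod(np.where(x == 1, beta, 1 - beta))
    return lik1 / (lik1 + lik0)

# All four possible examples of the XOR function (part 6).
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([0, 1, 1, 0])
pi, alpha, beta = fit_counts(X, Y)
for x, y in zip(X, Y):
    print(x, y, posterior_true(pi, alpha, beta, x))
\end{verbatim}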
%%%% 20.3: Learning with Hidden Variables: The EM Algorithm (1 exercise, 0 labelled)
%%%% ================================================================================
\begin{exercise}
Consider the application of EM to learn the parameters for the network in
\figref{mixture-networks-figure}(a), given the true parameters in \eqref{candy-true-equation}.
\begin{enumerate}
\item Explain why the EM algorithm would not work if there were just two attributes in the model
rather than three.
\item Show the calculations for the first iteration of EM starting from
\eqref{candy-64-equation}.
\item What happens if we start with all the parameters set to the same value \(p\)?
({\it Hint}: you may find it helpful to investigate this empirically
before deriving the general result.)
\item Write out an expression for the log likelihood of the tabulated candy data on \pgref{candy-counts-page} in terms of
the parameters, calculate the partial derivatives with respect to each parameter, and investigate the nature
of the fixed point reached in part (c).
\end{enumerate}
\end{exercise}
% id=20.9 section=20.3.2
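A Python sketch of the E-step and M-step calculations relevant to part (b), under explicitly
assumed numbers: the ``true'' parameters used to generate the data and the starting point for EM
are placeholders and should be replaced by the values in \eqref{candy-true-equation} and
\eqref{candy-64-equation}; the data are sampled here rather than taken from the table on
\pgref{candy-counts-page}.
\begin{verbatim}
# Sketch: EM for a two-bag candy mixture over three Boolean attributes.
# Numeric values below are placeholders; substitute the chapter's values.
import numpy as np

rng = np.random.default_rng(0)
N = 1000
true_theta = 0.5                              # P(Bag = 1), assumed
true_attr = np.array([[0.8, 0.8, 0.8],        # P(attr = 1 | Bag = 1)
                      [0.3, 0.3, 0.3]])       # P(attr = 1 | Bag = 2)
bag = (rng.random(N) >= true_theta).astype(int)   # 0 = Bag 1, 1 = Bag 2
X = (rng.random((N, 3)) < true_attr[bag]).astype(int)

theta = 0.6                                   # assumed starting point
attr = np.array([[0.6, 0.6, 0.6],
                 [0.4, 0.4, 0.4]])
for it in range(10):
    # E-step: responsibilities r_j = P(Bag = 1 | x_j).
    def lik(k):
        return np.prod(np.where(X == 1, attr[k], 1 - attr[k]), axis=1)
    w1 = theta * lik(0)
    w2 = (1 - theta) * lik(1)
    r = w1 / (w1 + w2)
    # M-step: ML estimates with expected counts in place of observed ones.
    theta = r.mean()
    attr = np.vstack([(r @ X) / r.sum(),
                      ((1 - r) @ X) / (1 - r).sum()])
    print(it, round(theta, 3), np.round(attr, 3))
\end{verbatim}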