make miscellaneous revisions to doc
dustinvtran committed Mar 8, 2017
1 parent b52030b commit 93adc8e
Showing 28 changed files with 151 additions and 124 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -51,3 +51,4 @@ visualization with TensorBoard.
+ [Edward website](http://edwardlib.org)
+ [Edward Gitter channel](http://gitter.im/blei-lab/edward)
+ [Edward releases](https://github.com/blei-lab/edward/releases)
+ [Edward papers, posters, and slides](https://github.com/edwardlib/papers)
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -41,7 +41,7 @@ USER edward
ARG python_version=3.5.3-0
ARG python_qt_version=4
RUN conda install -y python=${python_version} && \
pip install numpy six tensorflow keras prettytensor pystan pymc3 && \
pip install numpy six tensorflow keras prettytensor && \
pip install ipdb pytest pytest-cov python-coveralls coverage==3.7.1 pytest-xdist pep8 pytest-pep8 pydot_ng && \
conda install Pillow scikit-learn notebook pandas matplotlib nose pyyaml h5py && \
conda install -y pyqt=${python_qt_version} && \
2 changes: 1 addition & 1 deletion docker/Dockerfile-gpu
@@ -41,7 +41,7 @@ USER edward
# Python
ARG python_version=3.5.3-0
RUN conda install -y python=${python_version} && \
pip install numpy six tensorflow-gpu keras prettytensor pystan pymc3 && \
pip install numpy six tensorflow-gpu keras prettytensor && \
pip install ipdb pytest pytest-cov python-coveralls coverage==3.7.1 pytest-xdist pep8 pytest-pep8 pydot_ng && \
conda install Pillow scikit-learn notebook pandas matplotlib nose pyyaml h5py && \
pip install edward && \
2 changes: 1 addition & 1 deletion docs/notebooks/iclr2017.ipynb
@@ -169,7 +169,7 @@
"\n",
"def discriminative_network(x):\n",
" h = Dense(28 * 28, activation='relu')(x)\n",
" return Dense(h, activation=None)(1)\n",
" return Dense(1, activation=None)(h)\n",
"\n",
"# DATA\n",
"x_train = np.zeros([N, 28 * 28], dtype=np.float32)\n",
2 changes: 1 addition & 1 deletion docs/tex/api/criticism.tex
@@ -9,7 +9,7 @@ \subsubsection{Criticism}
uncover where the model goes wrong. Model criticism helps justify the
model as an approximation or point to good directions for revising the
model.
For background, see the criticism \href{/tutorials/}{tutorials}.
For background, see the criticism \href{/tutorials/criticism}{tutorial}.

Edward explores model criticism using
\begin{itemize}
4 changes: 2 additions & 2 deletions docs/tex/api/model-compositionality.tex
@@ -65,7 +65,7 @@ \subsubsection{Directed Graphical Models}
(\texttt{tf.placeholder}) for $\mathbf{x}$ in its graph. During
training and testing, we feed the placeholder the appropriate values.
(See the
\href{/tutorials/bayesian-linear-regression}{Bayesian linear
\href{/tutorials/supervised-regression}{Bayesian linear
regression} tutorial as an example.)

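As a minimal sketch of this pattern (the model below, a small Bayesian
logistic regression with assumed sizes, is illustrative rather than the
tutorial's exact code), the placeholder is bound to data through the
\texttt{data} dictionary at inference time:
\begin{lstlisting}[language=Python]
import numpy as np
import tensorflow as tf
import edward as ed
from edward.models import Bernoulli, Normal

N = 500  # number of data points (assumed)
D = 10   # number of features (assumed)
x_train = np.random.randn(N, D).astype(np.float32)
y_train = np.random.binomial(1, 0.5, N).astype(np.int32)

x = tf.placeholder(tf.float32, [N, D])        # placeholder for the features
w = Normal(mu=tf.zeros(D), sigma=tf.ones(D))  # latent weights
y = Bernoulli(logits=ed.dot(x, w))            # likelihood, conditioned on x

qw = Normal(mu=tf.Variable(tf.zeros(D)),
            sigma=tf.nn.softplus(tf.Variable(tf.zeros(D))))

# Binding the placeholder in the data dictionary feeds it during training.
inference = ed.KLqp({w: qw}, data={x: x_train, y: y_train})
inference.run()
\end{lstlisting}
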
\subsubsection{Neural Networks}
@@ -142,7 +142,7 @@ \subsubsection{Bayesian Nonparametrics}
infinite-dimensional space.

For the collapsed approach, see the
\href{/tutorials/gp-classification}{Gaussian process classification}
\href{/tutorials/supervised-classification}{Gaussian process classification}
tutorial as an example. We specify distributions over the function
evaluations of the Gaussian process, and the Gaussian process is
implicitly marginalized out. This approach is also useful for Poisson
2 changes: 1 addition & 1 deletion docs/tex/api/model.tex
@@ -6,7 +6,7 @@ \subsubsection{Model}

A probabilistic model is a joint distribution $p(\mathbf{x},
\mathbf{z})$ of data $\mathbf{x}$ and latent variables $\mathbf{z}$.
For background, see the \href{/tutorials/model}{Probability Models tutorial}.
For background, see the \href{/tutorials/model}{Probabilistic Models tutorial}.

In Edward, we specify models using a simple language of random variables.
A random variable $\mathbf{x}$ is an object parameterized by
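
For instance, a random variable can be built from tensor-valued parameters
(a minimal sketch, using the \texttt{mu}/\texttt{sigma} parameterization
that appears throughout these docs):
\begin{lstlisting}[language=Python]
import tensorflow as tf
from edward.models import Normal

# A 10-dimensional standard normal random variable; its parameters are tensors.
x = Normal(mu=tf.zeros(10), sigma=tf.ones(10))
\end{lstlisting}
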
8 changes: 4 additions & 4 deletions docs/tex/iclr2017.tex
@@ -154,7 +154,7 @@ \subsubsection{Section 4. Compositional Representations for Inference}

def discriminative_network(x):
h = Dense(28 * 28, activation='relu')(x)
return Dense(h, activation=None)(1)
return Dense(1, activation=None)(h)

# DATA
x_train = np.zeros([N, 28 * 28], dtype=np.float32)
@@ -236,7 +236,7 @@ \subsubsection{Section 4. Compositional Representations for Inference}
inference.initialize(scale={x: float(N) / M, z: float(N) / M})
\end{lstlisting}
For more details, see the
\href{/api/data-subsampling}{data subsampling} webpage.
\href{/api/inference-data-subsampling}{data subsampling} webpage.

\subsubsection{Section 5. Experiments}

@@ -289,7 +289,7 @@ \subsubsection{Appendix A. Model Examples}
y = Bernoulli(logits=tf.matmul(tf.nn.tanh(tf.matmul(x, W_0) + b_0), W_1) + b_1)
\end{lstlisting}
For an example of it in use, see
\href{https://github.com/blei-lab/edward/blob/master/examples/getting_started.py}{\texttt{examples/getting_started.py}}
\href{https://github.com/blei-lab/edward/blob/master/examples/getting_started_example.py}{\texttt{examples/getting_started_example.py}}
in the Github repository.

\textbf{Figure 11}. Latent Dirichlet allocation \citep{blei2003latent}.
@@ -348,7 +348,7 @@ \subsubsection{Appendix B. Inference Examples}

\textbf{Figure *}. Stochastic variational inference \citep{hoffman2013stochastic}.
For more details, see the
\href{/api/data-subsampling}{data subsampling} webpage.
\href{/api/inference-data-subsampling}{data subsampling} webpage.

\subsubsection{Appendix C. Complete Examples}

11 changes: 6 additions & 5 deletions docs/tex/index.tex
@@ -91,8 +91,9 @@ \subsubsection{Citation}
Dustin Tran, Alp Kucukelbir, Adji B. Dieng, Maja Rudolph, Dawen Liang,
and David M. Blei.
2016.
\emph{Edward: A library for probabilistic modeling, inference, and criticism.}
\href{https://arxiv.org/abs/1610.09787}{arXiv preprint arXiv:1610.09787}.
\href{https://arxiv.org/abs/1610.09787}
{\emph{Edward: A library for probabilistic modeling, inference, and criticism.}}
arXiv preprint arXiv:1610.09787.
\end{quote}

\begin{lstlisting}[class=JSON]
@@ -105,16 +106,16 @@ \subsubsection{Citation}
\end{lstlisting}

The following article describes the algorithmic foundations of Edward,
with a companion webpage \href{/iclr2017}{here}.
with a \href{/iclr2017}{companion webpage here}.
We recommend citing this article if you are discussing Edward's
design and methodology.

\begin{quote}
Dustin Tran, Matthew D. Hoffman, Rif A. Saurous, Eugene Brevdo, Kevin
Murphy, and David M. Blei.
2017.
\emph{Deep Probabilistic Programming.}
\href{https://arxiv.org/abs/1701.03757}{International Conference on Learning Representations}.
\href{https://arxiv.org/abs/1701.03757}{\emph{Deep Probabilistic Programming.}}
International Conference on Learning Representations.
\end{quote}

\begin{lstlisting}[class=JSON]
20 changes: 10 additions & 10 deletions docs/tex/tutorials/bayesian-neural-network.tex
@@ -9,19 +9,19 @@ \subsection{Bayesian neural network}
comprises features $\mathbf{x}_n\in\mathbb{R}^D$ and output
$y_n\in\mathbb{R}$. Define the likelihood for each data point as
\begin{align*}
p(y_n \mid \mathbf{z}, \mathbf{x}_n, \sigma^2)
p(y_n \mid \mathbf{w}, \mathbf{x}_n, \sigma^2)
&=
\text{Normal}(y_n \mid \mathrm{NN}(\mathbf{x}_n\;;\;\mathbf{z}), \sigma^2),
\text{Normal}(y_n \mid \mathrm{NN}(\mathbf{x}_n\;;\;\mathbf{w}), \sigma^2),
\end{align*}
where $\mathrm{NN}$ is a neural network whose weights and biases form
the latent variables $\mathbf{z}$. Assume $\sigma^2$ is a
the latent variables $\mathbf{w}$. Assume $\sigma^2$ is a
known variance.

Define the prior on the weights and biases $\mathbf{z}$ to be the standard normal
Define the prior on the weights and biases $\mathbf{w}$ to be the standard normal
\begin{align*}
p(\mathbf{z})
p(\mathbf{w})
&=
\text{Normal}(\mathbf{z} \mid \mathbf{0}, \mathbf{I}).
\text{Normal}(\mathbf{w} \mid \mathbf{0}, \mathbf{I}).
\end{align*}

Let's build the model in Edward. We define a 3-layer Bayesian neural
@@ -48,17 +48,17 @@ \subsection{Bayesian neural network}
x = tf.cast(x_train, dtype=tf.float32)
y = Normal(mu=neural_network(x), sigma=0.1 * tf.ones(N))
\end{lstlisting}
This code builds the model assuming the features \texttt{x\_train}
This program builds the model assuming the features \texttt{x\_train}
already exists in the Python environment. Alternatively, one can also
define a TensorFlow placeholder,
\begin{lstlisting}
x = tf.placeholder(tf.float32, [N, D])
\end{lstlisting}
and feed the placeholder manually during inference.
The placeholder must be fed with data later during inference.

A toy demonstration is available in the \href{/getting-started}{Getting Started} section.
Source code is available
Source code is available at
\href{https://github.com/blei-lab/edward/blob/master/examples/bayesian_nn.py}
{here}.
{\texttt{examples/bayesian_nn.py}} in the Github repository.

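For reference, a minimal sketch of the 3-layer network described above
(the layer widths and data shapes here are assumptions; the full script
fills in the details):
\begin{lstlisting}[language=Python]
import numpy as np
import tensorflow as tf
from edward.models import Normal

N = 40  # number of data points (assumed)
D = 1   # number of features (assumed)
x_train = np.random.randn(N, D).astype(np.float32)

# Standard normal priors on all weights and biases.
W_0 = Normal(mu=tf.zeros([D, 10]), sigma=tf.ones([D, 10]))
W_1 = Normal(mu=tf.zeros([10, 10]), sigma=tf.ones([10, 10]))
W_2 = Normal(mu=tf.zeros([10, 1]), sigma=tf.ones([10, 1]))
b_0 = Normal(mu=tf.zeros(10), sigma=tf.ones(10))
b_1 = Normal(mu=tf.zeros(10), sigma=tf.ones(10))
b_2 = Normal(mu=tf.zeros(1), sigma=tf.ones(1))

def neural_network(x):
  h = tf.tanh(tf.matmul(x, W_0) + b_0)
  h = tf.tanh(tf.matmul(h, W_1) + b_1)
  return tf.reshape(tf.matmul(h, W_2) + b_2, [-1])

x = tf.cast(x_train, dtype=tf.float32)
y = Normal(mu=neural_network(x), sigma=0.1 * tf.ones(N))
\end{lstlisting}
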
\subsubsection{References}\label{references}
27 changes: 11 additions & 16 deletions docs/tex/tutorials/criticism.tex
@@ -55,8 +55,6 @@ \subsubsection{Point-based Evaluation}
ed.evaluate('categorical_accuracy', data={y_post: y_train, x: x_train})
ed.evaluate('mean_absolute_error', data={y_post: y_train, x: x_train})
\end{lstlisting}
The \texttt{data} can be data held-out from training time, making it
easy to implement cross-validated techniques.

Point-based evaluation applies generally to any setting, including
unsupervised tasks. For example, we can evaluate the likelihood of
@@ -66,9 +64,9 @@ \subsubsection{Point-based Evaluation}
\end{lstlisting}

It is common practice to criticize models with data held-out from
training. To do this, we first perform inference over any local latent
variables of the held-out data, fixing the global variables. Then we
make predictions on the held-out data.
training. To do this, we must first perform inference over any local
latent variables of the held-out data, fixing the global variables; we
demonstrate this below. Then we make predictions on the held-out data.

\begin{lstlisting}[language=Python]
from edward.models import Categorical
@@ -137,25 +135,22 @@ \subsubsection{Posterior predictive checks}
The discrepancy can also take latent variables as input, which we pass
into the PPC.
\begin{lstlisting}[language=Python]
def T(xs, zs):
return tf.reduce_mean(tf.cast(zs[z], tf.float32))

ppc(T, data={y_post: y_train, x_ph: x_train},
latent_vars={z: qz, beta: qbeta})
ed.ppc(lambda xs, zs: tf.reduce_max(zs[z]),
data={y_post: y_train, x_ph: x_train},
latent_vars={z: qz, beta: qbeta})
\end{lstlisting}

See the \href{/api/criticism}{criticism API} for further details.

PPCs are an excellent tool for revising models, simplifying or
expanding the current model as one examines how well it fits the data.
They are inspired by prior checks and classical hypothesis
testing, under the philosophy that models should be
criticized under the frequentist perspective of large sample
PPCs are an excellent tool for revising models---simplifying or
expanding the current model as one examines its fit to data.
They are inspired by classical hypothesis testing; these methods
criticize models under the frequentist perspective of large sample
assessment.

PPCs can also be applied to tasks such as hypothesis testing, model
comparison, model selection, and model averaging. It's important to
note that while they can be applied as a form of Bayesian hypothesis
note that while PPCs can be applied as a form of Bayesian hypothesis
testing, hypothesis testing is generally not recommended: binary
decision making from a single test is not as common a use case as one
might believe. We recommend performing many PPCs to get a holistic
7 changes: 4 additions & 3 deletions docs/tex/tutorials/decoder.tex
@@ -60,11 +60,12 @@ \subsection{Probabilistic decoder}
parameterization than probabilities constrained
between 0 and 1.

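As a rough sketch of the decoder just described (the latent dimension,
layer sizes, and use of TensorFlow Slim are illustrative assumptions; the
scripts below give complete versions):
\begin{lstlisting}[language=Python]
import tensorflow as tf
import tensorflow.contrib.slim as slim
from edward.models import Bernoulli, Normal

M = 100  # minibatch size (assumed)
d = 2    # latent dimension (assumed)

# Prior over the latent code.
z = Normal(mu=tf.zeros([M, d]), sigma=tf.ones([M, d]))

# Decoder: map the latent code to logits over the 28 x 28 pixels.
h = slim.fully_connected(z, 256)
x = Bernoulli(logits=slim.fully_connected(h, 28 * 28, activation_fn=None))
\end{lstlisting}
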
An example script using this model can found
An example script using this model can be found at
\href{https://github.com/blei-lab/edward/blob/master/examples/vae.py}
{here}. An example with a convolutional architecture can be found
{\texttt{examples/vae.py}} in the Github repository.
An example with a convolutional architecture can be found at
\href{https://github.com/blei-lab/edward/blob/master/examples/vae_convolutional_prettytensor.py}
{here}.
{\texttt{examples/vae_convolutional_prettytensor.py}} in the Github repository.
%We experiment with this model in the
%\href{/tutorials/variational-autoencoder}{variational auto-encoder} tutorial.

30 changes: 15 additions & 15 deletions docs/tex/tutorials/gan.tex
@@ -8,9 +8,9 @@ \subsection{Generative Adversarial Networks}
inferences.

We demonstrate how to use GANs in Edward.
The script is available
The script is available at
\href{https://github.com/blei-lab/edward/blob/master/examples/gan.py}
{here}.
{\texttt{examples/gan.py}} in the Github repository.

\begin{lstlisting}[language=Python]
M = 128 # batch size during training
@@ -45,39 +45,39 @@ \subsubsection{Data}
\subsubsection{Model}

GANs posit generative models using an implicit mechanism. Given some
random noise, the data is assumed to be generated by the output of a
random noise, the data is assumed to be generated by a deterministic
function of that noise.

Formally, the generative process is
\begin{align*}
\mathbf{z} &\sim p(\mathbf{z}), \\
\mathbf{x} &= G(\mathbf{z}; \theta),
\mathbf{\epsilon} &\sim p(\mathbf{\epsilon}), \\
\mathbf{x} &= G(\mathbf{\epsilon}; \theta),
\end{align*}
where $G(\cdot; \theta)$ is a neural network that takes the latent
variable $\mathbf{z}$ as input. The prior $p(\mathbf{z})$ is often
interpreted as injected random noise to produce stochasticity in a
physical system; it is typically a fixed uniform or normal
distribution with some latent dimensionality.
where $G(\cdot; \theta)$ is a neural network that takes the samples
$\mathbf{\epsilon}$ as input. The distribution
$p(\mathbf{\epsilon})$ is interpreted as random noise injected to
produce stochasticity in a physical system; it is typically a fixed
uniform or normal distribution with some latent dimensionality.

In Edward, we build the model as follows, using TensorFlow Slim to
specify the neural network. It defines a 2-layer fully connected neural
network and outputs a vector of length $28\times28$ with values in
$[0,1]$.

\begin{lstlisting}[language=Python]
def generative_network(z):
h1 = slim.fully_connected(z, 128, activation_fn=tf.nn.relu)
def generative_network(eps):
h1 = slim.fully_connected(eps, 128, activation_fn=tf.nn.relu)
x = slim.fully_connected(h1, 784, activation_fn=tf.sigmoid)
return x

with tf.variable_scope("Gen"):
z = Uniform(a=tf.zeros([M, d]) - 1.0, b=tf.ones([M, d]))
x = generative_network(z)
eps = Uniform(a=tf.zeros([M, d]) - 1.0, b=tf.ones([M, d]))
x = generative_network(eps)
\end{lstlisting}

We aim to estimate parameters of the generative network such
that the model best captures the data. (Note in GANs, we are
interested only in parameter estimation and not inference about the
interested only in parameter estimation and not inference about any
latent variables.)

Unfortunately, probability models described above do not admit a tractable
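
Continuing from the generative network above, a minimal sketch of the
adversarial setup (this assumes Edward's \texttt{GANInference}, a
Slim-based discriminator, and an illustrative placeholder \texttt{x\_ph}
and helper \texttt{next\_batch}; it is not the tutorial's exact code):
\begin{lstlisting}[language=Python]
x_ph = tf.placeholder(tf.float32, [M, 28 * 28])  # minibatch of real images

def discriminative_network(x):
  # Output a real-valued logit for whether x is a sample from the data.
  h1 = slim.fully_connected(x, 128, activation_fn=tf.nn.relu)
  return slim.fully_connected(h1, 1, activation_fn=None)

inference = ed.GANInference(data={x: x_ph}, discriminator=discriminative_network)
inference.initialize(n_iter=1000)

sess = ed.get_session()
sess.run(tf.global_variables_initializer())
for _ in range(inference.n_iter):
  x_batch = next_batch(M)  # assumed helper returning a minibatch of images
  inference.update(feed_dict={x_ph: x_batch})
\end{lstlisting}
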
8 changes: 7 additions & 1 deletion docs/tex/tutorials/index.tex
@@ -4,7 +4,7 @@ \subsection{Tutorials}

Edward provides a testbed for rapid experimentation and research with
probabilistic models. Here we show how to apply this process for
complicated learning tasks.
diverse learning tasks.

\href{supervised-regression}{Bayesian linear regression} \\
A fundamental model for supervised learning.
@@ -51,3 +51,9 @@ \subsection{Tutorials}
\end{itemize}
\item \href{criticism}{Model criticism}
\end{itemize}

There are also companion webpages for several papers about Edward.
\begin{itemize}
\item
\href{/iclr2017}{``Deep Probabilistic Programming'' at ICLR 2017}
\end{itemize}
8 changes: 4 additions & 4 deletions docs/tex/tutorials/inference-networks.tex
@@ -95,12 +95,12 @@ \subsubsection{Implementation}
network's parameters) according to
gradients of the variational objective.

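As a rough sketch of such an inference network (the shapes, layer sizes,
and use of TensorFlow Slim are assumptions; the scripts below give
complete versions):
\begin{lstlisting}[language=Python]
import tensorflow as tf
import tensorflow.contrib.slim as slim
from edward.models import Normal

M = 100  # minibatch size (assumed)
d = 2    # latent dimension (assumed)
x_ph = tf.placeholder(tf.float32, [M, 28 * 28])  # minibatch of observations

# Inference network: map each observation to the parameters of its
# local variational factor over the latent code.
h = slim.fully_connected(x_ph, 256)
qz = Normal(mu=slim.fully_connected(h, d, activation_fn=None),
            sigma=slim.fully_connected(h, d, activation_fn=tf.nn.softplus))
\end{lstlisting}
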
An example script using this variational model can found
An example script using this variational model can be found at
\href{https://github.com/blei-lab/edward/blob/master/examples/vae.py}
{here}.
An example with a convolutional architecture can be found
{\texttt{examples/vae.py}} in the Github repository.
An example with a convolutional architecture can be found at
\href{https://github.com/blei-lab/edward/blob/master/examples/vae_convolutional_prettytensor.py}
{here}.
{\texttt{examples/vae_convolutional_prettytensor.py}} in the Github repository.
%We experiment with this model in the
%\href{/tutorials/variational-autoencoder}{variational auto-encoder} tutorial.

2 changes: 1 addition & 1 deletion docs/tex/tutorials/inference.tex
@@ -46,7 +46,7 @@ \subsubsection{Inferring the posterior}

For details on how to specify inference in Edward, see the
\href{/api/inference}{inference API}. We describe several examples in
detail in the other inference \href{/tutorials/}{tutorials}.
detail in the \href{/tutorials/}{tutorials}.

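As a minimal sketch (a toy normal-normal model; the names and sizes are
illustrative), posterior inference in Edward takes the form:
\begin{lstlisting}[language=Python]
import numpy as np
import tensorflow as tf
import edward as ed
from edward.models import Normal

N = 50
x_train = np.random.randn(N).astype(np.float32)

# Model: x_n ~ Normal(z, 1) with prior z ~ Normal(0, 1).
z = Normal(mu=0.0, sigma=1.0)
x = Normal(mu=tf.ones(N) * z, sigma=tf.ones(N))

# Approximating family; inference binds latent variables to it and data to
# observations.
qz = Normal(mu=tf.Variable(0.0), sigma=tf.nn.softplus(tf.Variable(0.0)))
inference = ed.KLqp({z: qz}, data={x: x_train})
inference.run(n_iter=1000)
\end{lstlisting}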

\subsubsection{References}\label{references}
5 changes: 3 additions & 2 deletions docs/tex/tutorials/latent-space-models.tex
@@ -14,8 +14,9 @@ \subsection{Latent space models for neural data}
their distance in the latent space.

We will analyze network data from neuroscience.
The full script can be found
\href{https://github.com/blei-lab/edward/blob/master/examples/latent_space_model.py}{here}.
The full script can be found at
\href{https://github.com/blei-lab/edward/blob/master/examples/latent_space_model.py}
{\texttt{examples/latent_space_model.py}} in the Github repository.
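
As a rough sketch of such a model (the sizes, and the particular choice of
Poisson rates that decay with the latent distance, are assumptions made
for illustration; the full script makes its own choices):
\begin{lstlisting}[language=Python]
import tensorflow as tf
from edward.models import Normal, Poisson

N = 28  # number of nodes (assumed)
K = 3   # latent dimensionality (assumed)

# Latent position for each node.
z = Normal(mu=tf.zeros([N, K]), sigma=tf.ones([N, K]))

# Pairwise squared Euclidean distances between latent positions.
zz = tf.reduce_sum(tf.square(z), 1, keep_dims=True)
sq_dist = tf.maximum(
    zz - 2.0 * tf.matmul(z, z, transpose_b=True) + tf.transpose(zz), 0.0)

# Edge counts, with rates that decay in the latent distance.
x = Poisson(lam=tf.exp(-tf.sqrt(sq_dist)))
\end{lstlisting}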

\subsubsection{Data}

4 changes: 2 additions & 2 deletions docs/tex/tutorials/mixture-density-network.tex
@@ -8,9 +8,9 @@ \subsection{Mixture density networks}

We demonstrate how to use MDNs in Edward, leveraging TensorFlow Slim
to construct neural networks.
The script is available
The script is available at
\href{https://github.com/blei-lab/edward/blob/master/examples/mixture_density_network.py}
{here}.
{\texttt{examples/mixture_density_network.py}} in the Github repository.
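
As a rough sketch of the core of an MDN (a network mapping each input to
the weights, means, and scales of a mixture; the number of components,
layer sizes, and names are assumptions):
\begin{lstlisting}[language=Python]
import tensorflow as tf
import tensorflow.contrib.slim as slim

K = 20  # number of mixture components (assumed)
D = 1   # dimension of the inputs (assumed)
X_ph = tf.placeholder(tf.float32, [None, D])

def mixture_parameters(X):
  # Map inputs to the parameters of a K-component Gaussian mixture.
  hidden = slim.fully_connected(X, 25, activation_fn=tf.nn.tanh)
  locs = slim.fully_connected(hidden, K, activation_fn=None)
  scales = slim.fully_connected(hidden, K, activation_fn=tf.nn.softplus)
  logits = slim.fully_connected(hidden, K, activation_fn=None)
  return locs, scales, logits

locs, scales, logits = mixture_parameters(X_ph)
\end{lstlisting}
The mixture likelihood for each data point is then assembled from these
outputs.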

\subsubsection{Data}

3 changes: 1 addition & 2 deletions docs/tex/tutorials/model.tex
@@ -31,5 +31,4 @@ \subsection{Probabilistic models}

For details on how to specify a model in Edward, see the
\href{/api/model}{model API}. We describe several examples in detail
in the
other model \href{/tutorials/}{tutorials}.
in the \href{/tutorials/}{tutorials}.