From 45801f570e6eeb1fae130294ae8296ed2f2e00b9 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Thu, 27 May 2021 19:15:24 -0500 Subject: [PATCH 1/4] Incrementally update the RNG state in Model --- pymc3/distributions/distribution.py | 2 +- pymc3/model.py | 44 +++++++++++++++++++++++--- pymc3/sampling.py | 48 ++++++++++++++++------------- pymc3/tests/test_distributions.py | 22 +++++++++++++ pymc3/tests/test_ndarray_backend.py | 15 ++++----- pymc3/tests/test_sampling.py | 28 ++++++++--------- pymc3/tests/test_step.py | 12 +++++--- 7 files changed, 118 insertions(+), 53 deletions(-) diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 892599e325..d6c89bc75f 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -137,7 +137,7 @@ def __new__(cls, name, *args, **kwargs): rng = kwargs.pop("rng", None) if rng is None: - rng = model.default_rng + rng = model.next_rng() if not isinstance(name, string_types): raise TypeError(f"Name needs to be a string but got: {name}") diff --git a/pymc3/model.py b/pymc3/model.py index 7567b9ac23..6a5f4dab82 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -42,6 +42,7 @@ from aesara.graph.basic import Constant, Variable, graph_inputs from aesara.graph.fg import FunctionGraph, MissingInputError from aesara.tensor.random.opt import local_subtensor_rv_lift +from aesara.tensor.random.var import RandomStateSharedVariable from aesara.tensor.sharedvar import ScalarSharedVariable from aesara.tensor.var import TensorVariable from pandas import Series @@ -532,6 +533,13 @@ class Model(Factor, WithMemoization, metaclass=ContextMeta): parameters can only take on valid values you can set this to False for increased speed. This should not be used if your model contains discrete variables. + rng_seeder: int or numpy.random.RandomState + The ``numpy.random.RandomState`` used to seed the + ``RandomStateSharedVariable`` sequence used by a model's + ``RandomVariable``s, or an int used to seed a new + ``numpy.random.RandomState``. If ``None``, a new unseeded + ``numpy.random.RandomState`` will be created and used. Incremental + access to the state sequence is provided by ``Model.next_rng``. 
Examples -------- @@ -615,7 +623,15 @@ def __new__(cls, *args, **kwargs): instance._aesara_config = kwargs.get("aesara_config", {}) return instance - def __init__(self, name="", model=None, aesara_config=None, coords=None, check_bounds=True): + def __init__( + self, + name="", + model=None, + aesara_config=None, + coords=None, + check_bounds=True, + rng_seeder: Optional[Union[int, np.random.RandomState]] = None, + ): self.name = name self._coords = {} self._RV_dims = {} @@ -623,9 +639,15 @@ def __init__(self, name="", model=None, aesara_config=None, coords=None, check_b self.add_coords(coords) self.check_bounds = check_bounds - self.default_rng = aesara.shared(np.random.RandomState(), name="default_rng", borrow=True) - self.default_rng.tag.is_rng = True - self.default_rng.default_update = self.default_rng + if rng_seeder is None: + self.rng_seeder = np.random.RandomState() + elif isinstance(rng_seeder, int): + self.rng_seeder = np.random.RandomState(rng_seeder) + else: + self.rng_seeder = rng_seeder + + # The sequence of model-generated RNGs + self.rng_seq = [] if self.parent is not None: self.named_vars = treedict(parent=self.parent.named_vars) @@ -931,6 +953,20 @@ def cont_vars(self): """All the continuous variables in the model""" return list(typefilter(self.value_vars, continuous_types)) + def next_rng(self) -> RandomStateSharedVariable: + """Generate a new ``RandomStateSharedVariable``. + + The new ``RandomStateSharedVariable`` is also added to + ``Model.rng_seq``. + """ + new_seed = self.rng_seeder.randint(2 ** 30, dtype=np.int64) + next_rng = aesara.shared(np.random.RandomState(new_seed), borrow=True) + next_rng.tag.is_rng = True + + self.rng_seq.append(next_rng) + + return next_rng + def shape_from_dims(self, dims): shape = [] if len(set(dims)) != len(dims): diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 475ea30a96..cd2020e85e 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -339,7 +339,8 @@ def sample( time until completion ("expected time of arrival"; ETA). model : Model (optional if in ``with`` context) random_seed : int or list of ints - A list is accepted if ``cores`` is greater than one. + Random seed(s) used by the sampling steps. A list is accepted if + ``cores`` is greater than one. discard_tuned_samples : bool Whether to discard posterior samples of the tune interval. compute_convergence_checks : bool, default=True @@ -467,10 +468,6 @@ def sample( np.random.seed(random_seed) random_seed = [np.random.randint(2 ** 30) for _ in range(chains)] - # TODO: We need to do something about multiple seeds and this single, - # shared RNG state. - model.default_rng.get_value(borrow=True).seed(random_seed) - if not isinstance(random_seed, abc.Iterable): raise TypeError("Invalid value for `random_seed`. 
Must be tuple, list or int") @@ -1004,9 +1001,7 @@ def _iter_sample( """ model = modelcontext(model) draws = int(draws) - if random_seed is not None: - # np.random.seed(random_seed) - model.default_rng.get_value(borrow=True).seed(random_seed) + if draws < 1: raise ValueError("Argument `draws` must be greater than 0.") @@ -1273,9 +1268,7 @@ def _prepare_iter_population( nchains = len(chains) model = modelcontext(model) draws = int(draws) - if random_seed is not None: - # np.random.seed(random_seed) - model.default_rng.get_value(borrow=True).seed(random_seed) + if draws < 1: raise ValueError("Argument `draws` should be above 0.") @@ -1693,8 +1686,12 @@ def sample_posterior_predictive( vars_ = model.observed_RVs + model.auto_deterministics if random_seed is not None: - # np.random.seed(random_seed) - model.default_rng.get_value(borrow=True).seed(random_seed) + warnings.warn( + "In this version, RNG seeding is managed by the Model objects. " + "See the `rng_seeder` argument in Model's constructor.", + DeprecationWarning, + stacklevel=2, + ) indices = np.arange(samples) @@ -1816,8 +1813,6 @@ def sample_posterior_predictive_w( Dictionary with the variables as keys. The values corresponding to the posterior predictive samples from the weighted models. """ - # np.random.seed(random_seed) - if isinstance(traces[0], InferenceData): n_samples = [ trace.posterior.sizes["chain"] * trace.posterior.sizes["draw"] for trace in traces @@ -1832,9 +1827,15 @@ def sample_posterior_predictive_w( if models is None: models = [modelcontext(models)] * len(traces) + if random_seed: + warnings.warn( + "In this version, RNG seeding is managed by the Model objects. " + "See the `rng_seeder` argument in Model's constructor.", + DeprecationWarning, + stacklevel=2, + ) + for model in models: - if random_seed: - model.default_rng.get_value(borrow=True).seed(random_seed) if model.potentials: warnings.warn( "The effect of Potentials on other parameters is ignored during posterior predictive sampling. " @@ -1937,6 +1938,7 @@ def sample_prior_predictive( model: Optional[Model] = None, var_names: Optional[Iterable[str]] = None, random_seed=None, + mode: Optional[Union[str, Mode]] = None, ) -> Dict[str, np.ndarray]: """Generate samples from the prior predictive distribution. @@ -1950,6 +1952,8 @@ def sample_prior_predictive( samples. Defaults to both observed and unobserved RVs. random_seed : int Seed for the random number generator. + mode: + The mode used by ``aesara.function`` to compile the graph. Returns ------- @@ -1977,8 +1981,12 @@ def sample_prior_predictive( vars_ = set(var_names) if random_seed is not None: - # np.random.seed(random_seed) - model.default_rng.get_value(borrow=True).seed(random_seed) + warnings.warn( + "In this version, RNG seeding is managed by the Model objects. 
" + "See the `rng_seeder` argument in Model's constructor.", + DeprecationWarning, + stacklevel=2, + ) names = get_default_varnames(vars_, include_transformed=False) @@ -2127,8 +2135,6 @@ def init_nuts( if random_seed is not None: random_seed = int(np.atleast_1d(random_seed)[0]) - # np.random.seed(random_seed) - model.default_rng.get_value(borrow=True).seed(random_seed) cb = [ pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff="absolute"), diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index c59f8f567b..832e61381e 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -2944,3 +2944,25 @@ def func(x): import pickle pickle.loads(pickle.dumps(y)) + + +def test_distinct_rvs(): + """Make sure `RandomVariable`s generated using a `Model`'s default RNG state all have distinct states.""" + + with pm.Model(rng_seeder=np.random.RandomState(2023532)) as model: + X_rv = pm.Normal("x") + Y_rv = pm.Normal("y") + + pp_samples = pm.sample_prior_predictive(samples=2) + + assert X_rv.owner.inputs[0] != Y_rv.owner.inputs[0] + + assert len(model.rng_seq) == 2 + + with pm.Model(rng_seeder=np.random.RandomState(2023532)): + X_rv = pm.Normal("x") + Y_rv = pm.Normal("y") + + pp_samples_2 = pm.sample_prior_predictive(samples=2) + + assert np.array_equal(pp_samples["y"], pp_samples_2["y"]) diff --git a/pymc3/tests/test_ndarray_backend.py b/pymc3/tests/test_ndarray_backend.py index a0ebb91b80..df71e07764 100644 --- a/pymc3/tests/test_ndarray_backend.py +++ b/pymc3/tests/test_ndarray_backend.py @@ -209,8 +209,8 @@ def test_combine_true_squeeze_true(self): class TestSaveLoad: @staticmethod - def model(): - with pm.Model() as model: + def model(rng_seeder=None): + with pm.Model(rng_seeder=rng_seeder) as model: x = pm.Normal("x", 0, 1) y = pm.Normal("y", x, 1, observed=2) z = pm.Normal("z", x + y, 1) @@ -267,15 +267,16 @@ def test_sample_posterior_predictive(self, tmpdir_factory): assert save_dir == directory - with TestSaveLoad.model() as model: - model.default_rng.get_value(borrow=True).seed(10) + rng = np.random.RandomState(10) + + with TestSaveLoad.model(rng_seeder=rng): ppc = pm.sample_posterior_predictive(self.trace) - with TestSaveLoad.model() as model: + rng = np.random.RandomState(10) + + with TestSaveLoad.model(rng_seeder=rng): trace2 = pm.load_trace(directory) - model.default_rng.get_value(borrow=True).seed(10) ppc2 = pm.sample_posterior_predictive(trace2) - ppc2f = pm.sample_posterior_predictive(trace2) for key, value in ppc.items(): assert (value == ppc2[key]).all() diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 46a2a86915..4bc7a7a99f 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -619,11 +619,13 @@ def test_model_not_drawable_prior(self): assert samples["foo"].shape == (40, 200) def test_model_shared_variable(self): - x = np.random.randn(100) + rng = np.random.RandomState(9832) + + x = rng.randn(100) y = x > 0 x_shared = aesara.shared(x) y_shared = aesara.shared(y) - with pm.Model() as model: + with pm.Model(rng_seeder=rng) as model: coeff = pm.Normal("x", mu=0, sd=1) logistic = pm.Deterministic("p", pm.math.sigmoid(coeff * x_shared)) @@ -644,12 +646,12 @@ def test_model_shared_variable(self): npt.assert_allclose(post_pred["p"], expected_p) def test_deterministic_of_observed(self): - np.random.seed(8442) + rng = np.random.RandomState(8442) - meas_in_1 = pm.aesaraf.floatX(2 + 4 * np.random.randn(10)) - meas_in_2 = pm.aesaraf.floatX(5 + 4 * np.random.randn(10)) + 
meas_in_1 = pm.aesaraf.floatX(2 + 4 * rng.randn(10)) + meas_in_2 = pm.aesaraf.floatX(5 + 4 * rng.randn(10)) nchains = 2 - with pm.Model() as model: + with pm.Model(rng_seeder=rng) as model: mu_in_1 = pm.Normal("mu_in_1", 0, 1) sigma_in_1 = pm.HalfNormal("sd_in_1", 1) mu_in_2 = pm.Normal("mu_in_2", 0, 1) @@ -660,7 +662,6 @@ def test_deterministic_of_observed(self): out_diff = in_1 + in_2 pm.Deterministic("out", out_diff) - model.default_rng.get_value(borrow=True).seed(0) trace = pm.sample( 100, chains=nchains, @@ -670,7 +671,6 @@ def test_deterministic_of_observed(self): rtol = 1e-5 if aesara.config.floatX == "float64" else 1e-4 - model.default_rng.get_value(borrow=True).seed(0) ppc = pm.sample_posterior_predictive( model=model, trace=trace, @@ -682,11 +682,11 @@ def test_deterministic_of_observed(self): npt.assert_allclose(ppc["in_1"] + ppc["in_2"], ppc["out"], rtol=rtol) def test_deterministic_of_observed_modified_interface(self): - np.random.seed(4982) + rng = np.random.RandomState(4982) - meas_in_1 = pm.aesaraf.floatX(2 + 4 * np.random.randn(100)) - meas_in_2 = pm.aesaraf.floatX(5 + 4 * np.random.randn(100)) - with pm.Model() as model: + meas_in_1 = pm.aesaraf.floatX(2 + 4 * rng.randn(100)) + meas_in_2 = pm.aesaraf.floatX(5 + 4 * rng.randn(100)) + with pm.Model(rng_seeder=rng) as model: mu_in_1 = pm.Normal("mu_in_1", 0, 1, testval=0) sigma_in_1 = pm.HalfNormal("sd_in_1", 1, testval=1) mu_in_2 = pm.Normal("mu_in_2", 0, 1, testval=0) @@ -969,12 +969,10 @@ def test_multivariate2(self): assert sim_ppc["obs"].shape == (20,) + mn_data.shape def test_layers(self): - with pm.Model() as model: + with pm.Model(rng_seeder=232093) as model: a = pm.Uniform("a", lower=0, upper=1, size=10) b = pm.Binomial("b", n=1, p=a, size=10) - model.default_rng.get_value(borrow=True).seed(232093) - b_sampler = aesara.function([], b) avg = np.stack([b_sampler() for i in range(10000)]).mean(0) npt.assert_array_almost_equal(avg, 0.5 * np.ones((10,)), decimal=2) diff --git a/pymc3/tests/test_step.py b/pymc3/tests/test_step.py index 27fffe6ad4..7f4796755c 100644 --- a/pymc3/tests/test_step.py +++ b/pymc3/tests/test_step.py @@ -1657,7 +1657,9 @@ def perform(self, node, inputs, outputs): mout = [] coarse_models = [] - with Model() as coarse_model_0: + rng = np.random.RandomState(seed) + + with Model(rng_seeder=rng) as coarse_model_0: if aesara.config.floatX == "float32": Q = Data("Q", np.float32(0.0)) else: @@ -1674,9 +1676,9 @@ def perform(self, node, inputs, outputs): coarse_models.append(coarse_model_0) - coarse_model_0.default_rng.get_value(borrow=True).seed(seed) + rng = np.random.RandomState(seed) - with Model() as coarse_model_1: + with Model(rng_seeder=rng) as coarse_model_1: if aesara.config.floatX == "float32": Q = Data("Q", np.float32(0.0)) else: @@ -1693,9 +1695,9 @@ def perform(self, node, inputs, outputs): coarse_models.append(coarse_model_1) - coarse_model_1.default_rng.get_value(borrow=True).seed(seed) + rng = np.random.RandomState(seed) - with Model() as model: + with Model(rng_seeder=rng) as model: if aesara.config.floatX == "float32": Q = Data("Q", np.float32(0.0)) else: From 84f620712af13bb4dd9331eb207c9cbaed5f8a3e Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Mon, 31 May 2021 15:37:17 -0500 Subject: [PATCH 2/4] Convert RandomVariables to in-place during graph optimization --- pymc3/aesaraf.py | 14 ++++++++++++++ pymc3/distributions/distribution.py | 30 ++++++++++++++++++----------- pymc3/model.py | 5 +++-- pymc3/sampling.py | 19 ++++++++++-------- pymc3/sampling_jax.py | 4 ++-- pymc3/step_methods/metropolis.py | 5 ++--- pymc3/tests/test_sampling.py | 3 ++- 7 files changed, 53 insertions(+), 27 deletions(-) diff --git a/pymc3/aesaraf.py b/pymc3/aesaraf.py index e71a3a3455..bb7becaab8 100644 --- a/pymc3/aesaraf.py +++ b/pymc3/aesaraf.py @@ -31,6 +31,7 @@ import scipy.sparse as sps from aesara import config, scalar +from aesara.compile.mode import Mode, get_mode from aesara.gradient import grad from aesara.graph.basic import ( Apply, @@ -861,3 +862,16 @@ def take_along_axis(arr, indices, axis=0): # use the fancy index return arr[_make_along_axis_idx(arr_shape, indices, _axis)] + + +def compile_rv_inplace(inputs, outputs, mode=None, **kwargs): + """Use ``aesara.function`` with the random_make_inplace optimization always enabled. + + Using this function ensures that compiled functions containing random + variables will produce new samples on each call. + """ + mode = get_mode(mode) + opt_qry = mode.provided_optimizer.including("random_make_inplace") + mode = Mode(linker=mode.linker, optimizer=opt_qry) + aesara_function = aesara.function(inputs, outputs, mode=mode, **kwargs) + return aesara_function diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index d6c89bc75f..24b667df1d 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -19,12 +19,12 @@ import warnings from abc import ABCMeta -from copy import copy from typing import TYPE_CHECKING import dill from aesara.tensor.random.op import RandomVariable +from aesara.tensor.random.var import RandomStateSharedVariable from pymc3.distributions import _logcdf, _logp @@ -77,14 +77,6 @@ def _random(*args, **kwargs): rv_type = None if isinstance(rv_op, RandomVariable): - if not rv_op.inplace: - # TODO: This is a temporary work-around. - # Remove this once we know what we want regarding RNG states - # and their propagation. - rv_op = copy(rv_op) - rv_op.inplace = True - clsdict["rv_op"] = rv_op - rv_type = type(rv_op) new_cls = super().__new__(cls, name, bases, clsdict) @@ -158,15 +150,31 @@ def __new__(cls, name, *args, **kwargs): return model.register_rv(rv_out, name, data, total_size, dims=dims, transform=transform) @classmethod - def dist(cls, dist_params, **kwargs): + def dist(cls, dist_params, rng=None, **kwargs): testval = kwargs.pop("testval", None) - rv_var = cls.rv_op(*dist_params, **kwargs) + rv_var = cls.rv_op(*dist_params, rng=rng, **kwargs) if testval is not None: rv_var.tag.test_value = testval + if ( + rv_var.owner + and isinstance(rv_var.owner.op, RandomVariable) + and isinstance(rng, RandomStateSharedVariable) + and not getattr(rng, "default_update", None) + ): + # This tells `aesara.function` that the shared RNG variable + # is mutable, which--in turn--tells the `FunctionGraph` + # `Supervisor` feature to allow in-place updates on the variable. + # Without it, the `RandomVariable`s could not be optimized to allow + # in-place RNG updates, forcing all sample results from compiled + # functions to be the same on repeated evaluations. 
+ new_rng = rv_var.owner.outputs[0] + rv_var.update = (rng, new_rng) + rng.default_update = new_rng + return rv_var def _distr_parameters_for_repr(self): diff --git a/pymc3/model.py b/pymc3/model.py index 6a5f4dab82..36a46d5a61 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -49,6 +49,7 @@ from pymc3.aesaraf import ( change_rv_size, + compile_rv_inplace, gradient, hessian, inputvars, @@ -455,7 +456,7 @@ def __init__( inputs = grad_vars - self._aesara_function = aesara.function(inputs, outputs, givens=givens, **kwargs) + self._aesara_function = compile_rv_inplace(inputs, outputs, givens=givens, **kwargs) def set_weights(self, values): if values.shape != (self._n_costs - 1,): @@ -1378,7 +1379,7 @@ def makefn(self, outs, mode=None, *args, **kwargs): Compiled Aesara function """ with self: - return aesara.function( + return compile_rv_inplace( self.value_vars, outs, allow_input_downcast=True, diff --git a/pymc3/sampling.py b/pymc3/sampling.py index cd2020e85e..39ef3ca1e4 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -25,19 +25,19 @@ from copy import copy, deepcopy from typing import Any, Dict, Iterable, List, Optional, Set, Union, cast -import aesara import aesara.gradient as tg import numpy as np import packaging import xarray +from aesara.compile.mode import Mode from aesara.tensor.sharedvar import SharedVariable from arviz import InferenceData from fastprogress.fastprogress import progress_bar import pymc3 as pm -from pymc3.aesaraf import change_rv_size, inputvars, walk_model +from pymc3.aesaraf import change_rv_size, compile_rv_inplace, inputvars, walk_model from pymc3.backends.arviz import _DefaultTrace from pymc3.backends.base import BaseTrace, MultiTrace from pymc3.backends.ndarray import NDArray @@ -1584,6 +1584,7 @@ def sample_posterior_predictive( keep_size: Optional[bool] = False, random_seed=None, progressbar: bool = True, + mode: Optional[Union[str, Mode]] = None, ) -> Dict[str, np.ndarray]: """Generate posterior predictive samples from a model given a trace. @@ -1617,6 +1618,8 @@ def sample_posterior_predictive( Whether or not to display a progress bar in the command line. The bar shows the percentage of completion, the sampling speed in samples per second (SPS), and the estimated remaining time until completion ("expected time of arrival"; ETA). + mode: + The mode used by ``aesara.function`` to compile the graph. 
Returns ------- @@ -1727,12 +1730,13 @@ def sample_posterior_predictive( if size is not None: vars_to_sample = [change_rv_size(v, size, expand=True) for v in vars_to_sample] - sampler_fn = aesara.function( + sampler_fn = compile_rv_inplace( inputs, vars_to_sample, allow_input_downcast=True, accept_inplace=True, on_unused_input="ignore", + mode=mode, ) ppc_trace_t = _DefaultTrace(samples) @@ -1992,12 +1996,11 @@ def sample_prior_predictive( vars_to_sample = [model[name] for name in names] inputs = [i for i in inputvars(vars_to_sample) if not isinstance(i, SharedVariable)] - sampler_fn = aesara.function( - inputs, - vars_to_sample, - allow_input_downcast=True, - accept_inplace=True, + + sampler_fn = compile_rv_inplace( + inputs, vars_to_sample, allow_input_downcast=True, accept_inplace=True, mode=mode ) + values = zip(*[sampler_fn() for i in range(samples)]) data = {k: np.stack(v) for k, v in zip(names, values)} diff --git a/pymc3/sampling_jax.py b/pymc3/sampling_jax.py index 6168de34a1..5ce4dae707 100644 --- a/pymc3/sampling_jax.py +++ b/pymc3/sampling_jax.py @@ -7,7 +7,6 @@ xla_flags = re.sub(r"xla_force_host_platform_device_count=.+\s", "", xla_flags).split() os.environ["XLA_FLAGS"] = " ".join(["--xla_force_host_platform_device_count={}".format(100)]) -import aesara.graph.fg import aesara.tensor as at import arviz as az import jax @@ -23,6 +22,7 @@ from aesara.tensor.type import TensorType from pymc3 import modelcontext +from pymc3.aesaraf import compile_rv_inplace warnings.warn("This module is experimental.") @@ -209,7 +209,7 @@ def sample_numpyro_nuts( print("Compiling...") tic1 = pd.Timestamp.now() - _sample = aesara.function( + _sample = compile_rv_inplace( [], sample_outputs + [numpyro_samples[-1]], allow_input_downcast=True, diff --git a/pymc3/step_methods/metropolis.py b/pymc3/step_methods/metropolis.py index c9cb58470f..24b88f7ee8 100644 --- a/pymc3/step_methods/metropolis.py +++ b/pymc3/step_methods/metropolis.py @@ -13,7 +13,6 @@ # limitations under the License. from typing import Any, Callable, Dict, List, Tuple -import aesara import numpy as np import numpy.random as nr import scipy.linalg @@ -23,7 +22,7 @@ import pymc3 as pm -from pymc3.aesaraf import floatX, rvs_to_value_vars +from pymc3.aesaraf import compile_rv_inplace, floatX, rvs_to_value_vars from pymc3.blocking import DictToArrayBijection, RaveledVars from pymc3.step_methods.arraystep import ( ArrayStep, @@ -985,6 +984,6 @@ def delta_logp(point, logp, vars, shared): logp1 = pm.CallableTensor(logp0)(inarray1) - f = aesara.function([inarray1, inarray0], logp1 - logp0) + f = compile_rv_inplace([inarray1, inarray0], logp1 - logp0) f.trust_input = True return f diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 4bc7a7a99f..8756138c15 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -31,6 +31,7 @@ import pymc3 as pm +from pymc3.aesaraf import compile_rv_inplace from pymc3.backends.ndarray import NDArray from pymc3.exceptions import IncorrectArgumentsError, SamplingError from pymc3.tests.helpers import SeededTest @@ -973,7 +974,7 @@ def test_layers(self): a = pm.Uniform("a", lower=0, upper=1, size=10) b = pm.Binomial("b", n=1, p=a, size=10) - b_sampler = aesara.function([], b) + b_sampler = compile_rv_inplace([], b, mode="FAST_RUN") avg = np.stack([b_sampler() for i in range(10000)]).mean(0) npt.assert_array_almost_equal(avg, 0.5 * np.ones((10,)), decimal=2) From aa22bccf2ccd805aa19795f9cf67b78461be38ae Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Mon, 31 May 2021 19:13:22 -0500 Subject: [PATCH 3/4] Introduce Model.initial_values and deprecate testval in favor of initval --- RELEASE-NOTES.md | 1 + pymc3/distributions/continuous.py | 20 ++++--- pymc3/distributions/distribution.py | 31 ++++++++-- pymc3/model.py | 73 +++++++++++++----------- pymc3/tests/test_distributions.py | 25 +++++++- pymc3/tests/test_distributions_random.py | 12 ++-- pymc3/tests/test_logp.py | 9 ++- pymc3/tests/test_model.py | 20 ++++++- 8 files changed, 135 insertions(+), 56 deletions(-) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index cc4522f383..3bdf6272dc 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -5,6 +5,7 @@ - ⚠ Theano-PyMC has been replaced with Aesara, so all external references to `theano`, `tt`, and `pymc3.theanof` need to be replaced with `aesara`, `at`, and `pymc3.aesaraf` (see [4471](https://github.com/pymc-devs/pymc3/pull/4471)). - ArviZ `plots` and `stats` *wrappers* were removed. The functions are now just available by their original names (see [#4549](https://github.com/pymc-devs/pymc3/pull/4471) and `3.11.2` release notes). - The GLM submodule has been removed, please use [Bambi](https://bambinos.github.io/bambi/) instead. +- The `Distribution` keyword argument `testval` has been deprecated in favor of `initval`. - ... ### New Features diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index cf68949121..8089cee270 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -332,10 +332,12 @@ class Flat(Continuous): rv_op = flat @classmethod - def dist(cls, *, size=None, testval=None, **kwargs): - if testval is None: - testval = np.full(size, floatX(0.0)) - return super().dist([], size=size, testval=testval, **kwargs) + def dist(cls, *, size=None, initval=None, **kwargs): + if initval is None: + initval = np.full(size, floatX(0.0)) + res = super().dist([], size=size, **kwargs) + res.tag.test_value = initval + return res def logp(value): """ @@ -394,10 +396,12 @@ class HalfFlat(PositiveContinuous): rv_op = halfflat @classmethod - def dist(cls, *, size=None, testval=None, **kwargs): - if testval is None: - testval = np.full(size, floatX(1.0)) - return super().dist([], size=size, testval=testval, **kwargs) + def dist(cls, *, size=None, initval=None, **kwargs): + if initval is None: + initval = np.full(size, floatX(1.0)) + res = super().dist([], size=size, **kwargs) + res.tag.test_value = initval + return res def logp(value): """ diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 24b667df1d..eb58573e6c 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -143,21 +143,44 @@ def __new__(cls, name, *args, **kwargs): if "shape" in kwargs: raise DeprecationWarning("The `shape` keyword is deprecated; use `size`.") + testval = kwargs.pop("testval", None) + + if testval is not None: + warnings.warn( + "The `testval` argument is deprecated; use `initval`.", + DeprecationWarning, + stacklevel=2, + ) + + initval = kwargs.pop("initval", testval) + transform = kwargs.pop("transform", UNSET) rv_out = cls.dist(*args, rng=rng, **kwargs) - return model.register_rv(rv_out, name, data, total_size, dims=dims, transform=transform) + if testval is not None: + rv_out.tag.test_value = testval + + return model.register_rv( + rv_out, name, data, total_size, dims=dims, transform=transform, initval=initval + ) @classmethod def dist(cls, dist_params, rng=None, **kwargs): testval = kwargs.pop("testval", 
None) - rv_var = cls.rv_op(*dist_params, rng=rng, **kwargs) - if testval is not None: - rv_var.tag.test_value = testval + warnings.warn( + "The `testval` argument is deprecated. " + "Use `initval` to set initial values for a `Model`; " + "otherwise, set test values on Aesara parameters explicitly " + "when attempting to use Aesara's test value debugging features.", + DeprecationWarning, + stacklevel=2, + ) + + rv_var = cls.rv_op(*dist_params, rng=rng, **kwargs) if ( rv_var.owner diff --git a/pymc3/model.py b/pymc3/model.py index 36a46d5a61..09018bb22d 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -40,7 +40,7 @@ from aesara.compile.sharedvalue import SharedVariable from aesara.gradient import grad from aesara.graph.basic import Constant, Variable, graph_inputs -from aesara.graph.fg import FunctionGraph, MissingInputError +from aesara.graph.fg import FunctionGraph from aesara.tensor.random.opt import local_subtensor_rv_lift from aesara.tensor.random.var import RandomStateSharedVariable from aesara.tensor.sharedvar import ScalarSharedVariable from aesara.tensor.var import TensorVariable @@ -572,7 +572,7 @@ def __init__(self, mean=0, sigma=1, name='', model=None): Normal('v2', mu=mean, sigma=sd) # something more complex is allowed, too - half_cauchy = HalfCauchy('sd', beta=10, testval=1.) + half_cauchy = HalfCauchy('sd', beta=10, initval=1.) Normal('v3', mu=mean, sigma=half_cauchy) # Deterministic variables can be used in usual way @@ -649,6 +649,7 @@ def __init__( # The sequence of model-generated RNGs self.rng_seq = [] + self.initial_values = {} if self.parent is not None: self.named_vars = treedict(parent=self.parent.named_vars) @@ -914,35 +915,7 @@ def test_point(self): @property def initial_point(self): - points = [] - for rv_var in self.free_RVs: - value_var = rv_var.tag.value_var - var_value = getattr(value_var.tag, "test_value", None) - - if var_value is None: - - rv_var_value = getattr(rv_var.tag, "test_value", None) - - if rv_var_value is None: - try: - rv_var_value = rv_var.eval() - except MissingInputError: - raise MissingInputError(f"Couldn't generate an initial value for {rv_var}") - - transform = getattr(value_var.tag, "transform", None) - - if transform: - try: - rv_var_value = transform.forward(rv_var, rv_var_value).eval() - except MissingInputError: - raise MissingInputError(f"Couldn't generate an initial value for {rv_var}") - - var_value = rv_var_value - value_var.tag.test_value = var_value - - points.append((value_var, var_value)) - - return Point(points, model=self) + return Point(list(self.initial_values.items()), model=self) @property def disc_vars(self): @@ -954,6 +927,37 @@ def cont_vars(self): """All the continuous variables in the model""" return list(typefilter(self.value_vars, continuous_types)) + def set_initval(self, rv_var, initval): + initval = ( + rv_var.type.filter(initval) + if initval is not None + else getattr(rv_var.tag, "test_value", None) + ) + + rv_value_var = self.rvs_to_values[rv_var] + transform = getattr(rv_value_var.tag, "transform", None) + + if initval is None or transform: + # Sample/evaluate this using the existing initial values, and + # with the least amount of effect on the RNGs involved (i.e. 
no + # in-placing) + from aesara.compile.mode import Mode, get_mode + + mode = get_mode(None) + opt_qry = mode.provided_optimizer.excluding("random_make_inplace") + mode = Mode(linker=mode.linker, optimizer=opt_qry) + + if transform: + value = initval if initval is not None else rv_var + rv_var = transform.forward(rv_var, value) + + initval_fn = aesara.function( + [], rv_var, mode=mode, givens=self.initial_values, on_unused_input="ignore" + ) + initval = initval_fn() + + self.initial_values[rv_value_var] = initval + def next_rng(self) -> RandomStateSharedVariable: """Generate a new ``RandomStateSharedVariable``. The new ``RandomStateSharedVariable`` is also added to ``Model.rng_seq``. @@ -1116,7 +1120,9 @@ def set_data( shared_object.set_value(values) - def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, transform=UNSET): + def register_rv( + self, rv_var, name, data=None, total_size=None, dims=None, transform=UNSET, initval=None + ): """Register an (un)observed random variable with the model. Parameters ---------- rv_var name: str data: array_like (optional) If data is provided, the variable is observed. If None, the variable is unobserved. total_size: scalar upscales logp of variable with ``coef = total_size/var.shape[0]`` dims : tuple Dimension names for the variable. transform A transform for the random variable in log-likelihood space. + initval + The initial value of the random variable. Returns ------- TensorVariable """ name = self.name_for(name) rv_var.name = name rv_var.tag.total_size = total_size if data is None: self.free_RVs.append(rv_var) self.create_value_var(rv_var, transform) self.add_random_variable(rv_var, dims) + self.set_initval(rv_var, initval) else: if ( isinstance(data, Variable) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 832e61381e..12f08eab32 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -594,6 +594,7 @@ def check_logp( n_samples=100, extra_args=None, scipy_args=None, + skip_params_fn=lambda x: False, ): """ Generic test for PyMC3 logp methods @@ -625,6 +626,9 @@ the pymc3 distribution logp is calculated scipy_args : Dictionary with extra arguments needed to call scipy logp method Usually the same as extra_args + skip_params_fn: Callable + A function that takes a ``dict`` of the test points and returns a + boolean indicating whether to skip the test at those parameter values. """ if decimal is None: decimal = select_by_precision(float64=6, float32=3) @@ -646,6 +650,8 @@ def logp_reference(args): domains["value"] = domain for pt in product(domains, n_samples=n_samples): pt = dict(pt) + if skip_params_fn(pt): + continue pt_d = self._model_input_dict(model, param_vars, pt) pt_logp = Point(pt_d, model=model) pt_ref = Point(pt, filter_model_vars=False, model=model) @@ -690,6 +696,7 @@ def check_logcdf( n_samples=100, skip_paramdomain_inside_edge_test=False, skip_paramdomain_outside_edge_test=False, + skip_params_fn=lambda x: False, ): """ Generic test for PyMC3 logcdf methods @@ -730,6 +737,9 @@ skip_paramdomain_outside_edge_test : Bool Whether to run test 2., which checks that pymc3 distribution logcdf returns -inf for invalid parameter values outside the supported domain edge + skip_params_fn: Callable + A function that takes a ``dict`` of the test points and returns a + boolean indicating whether to skip the test at those parameter values. 
Returns ------- @@ -745,6 +755,8 @@ def check_logcdf( for pt in product(domains, n_samples=n_samples): params = dict(pt) + if skip_params_fn(params): + continue scipy_cdf = scipy_logcdf(**params) value = params.pop("value") with Model() as m: @@ -825,7 +837,13 @@ def check_logcdf( ) def check_selfconsistency_discrete_logcdf( - self, distribution, domain, paramdomains, decimal=None, n_samples=100 + self, + distribution, + domain, + paramdomains, + decimal=None, + n_samples=100, + skip_params_fn=lambda x: False, ): """ Check that logcdf of discrete distributions matches sum of logps up to value @@ -836,6 +854,8 @@ def check_selfconsistency_discrete_logcdf( decimal = select_by_precision(float64=6, float32=3) for pt in product(domains, n_samples=n_samples): params = dict(pt) + if skip_params_fn(params): + continue value = params.pop("value") values = np.arange(domain.lower, value + 1) dist = distribution.dist(**params) @@ -1187,17 +1207,20 @@ def modified_scipy_hypergeom_logcdf(value, N, k, n): Nat, {"N": NatSmall, "k": NatSmall, "n": NatSmall}, modified_scipy_hypergeom_logpmf, + skip_params_fn=lambda x: x["N"] < x["n"] or x["N"] < x["k"], ) self.check_logcdf( HyperGeometric, Nat, {"N": NatSmall, "k": NatSmall, "n": NatSmall}, modified_scipy_hypergeom_logcdf, + skip_params_fn=lambda x: x["N"] < x["n"] or x["N"] < x["k"], ) self.check_selfconsistency_discrete_logcdf( HyperGeometric, Nat, {"N": NatSmall, "k": NatSmall, "n": NatSmall}, + skip_params_fn=lambda x: x["N"] < x["n"] or x["N"] < x["k"], ) def test_negative_binomial(self): diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 88e56aa480..18a864cb11 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -327,17 +327,13 @@ def test_distribution(self): def _instantiate_pymc_rv(self, dist_params=None): params = dist_params if dist_params else self.pymc_dist_params - with pm.Model(): - self.pymc_rv = self.pymc_dist( - **params, - size=self.size, - rng=aesara.shared(self.get_random_state(reset=True)), - name=f"{self.pymc_dist.rv_op.name}_test", - ) + self.pymc_rv = self.pymc_dist.dist( + **params, size=self.size, rng=aesara.shared(self.get_random_state(reset=True)) + ) def check_pymc_draws_match_reference(self): # need to re-instantiate it to make sure that the order of drawings match the reference distribution one - self._instantiate_pymc_rv() + # self._instantiate_pymc_rv() assert_array_almost_equal( self.pymc_rv.eval(), self.reference_dist_draws, decimal=self.decimal ) diff --git a/pymc3/tests/test_logp.py b/pymc3/tests/test_logp.py index aea9db1fdc..6047820292 100644 --- a/pymc3/tests/test_logp.py +++ b/pymc3/tests/test_logp.py @@ -18,7 +18,7 @@ import scipy.stats.distributions as sp from aesara.gradient import DisconnectedGrad -from aesara.graph.basic import Constant, graph_inputs +from aesara.graph.basic import Constant, ancestors, graph_inputs from aesara.graph.fg import FunctionGraph from aesara.tensor.random.op import RandomVariable from aesara.tensor.subtensor import ( @@ -38,6 +38,11 @@ from pymc3.tests.helpers import select_by_precision +def assert_no_rvs(var): + assert not any(isinstance(v.owner.op, RandomVariable) for v in ancestors([var]) if v.owner) + return var + + def test_logpt_basic(): """Make sure we can compute a log-likelihood for a hierarchical model with transforms.""" @@ -171,7 +176,7 @@ def test_logpt_subtensor(): logp_vals_fn = aesara.function([A_idx_value_var, I_value_var], A_idx_logp) # The compiled graph should not 
contain any `RandomVariables` - assert not any(isinstance(n.op, RandomVariable) for n in logp_vals_fn.maker.fgraph.apply_nodes) + assert_no_rvs(logp_vals_fn.maker.fgraph.outputs[0]) decimals = select_by_precision(float64=6, float32=4) diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index d479c98f32..772198908f 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -498,13 +498,31 @@ def test_initial_point(): with pm.Model() as model: a = pm.Uniform("a") - pm.Normal("x", a) + x = pm.Normal("x", a) with pytest.warns(DeprecationWarning): initial_point = model.test_point assert all(var.name in initial_point for var in model.value_vars) + b_initval = np.array(0.3, dtype=aesara.config.floatX) + + with pytest.warns(DeprecationWarning), model: + b = pm.Uniform("b", testval=b_initval) + + b_value_var = model.rvs_to_values[b] + b_initval_trans = b_value_var.tag.transform.forward(b, b_initval).eval() + + y_initval = np.array(-2.4, dtype=aesara.config.floatX) + + with model: + y = pm.Normal("y", initval=y_initval) + + assert model.rvs_to_values[a] in model.initial_values + assert model.rvs_to_values[x] in model.initial_values + assert model.initial_values[b_value_var] == b_initval_trans + assert model.initial_values[model.rvs_to_values[y]] == y_initval + def test_point_logps(): From b1f95e52638b028a4dde9bed5ecbd43b620b929f Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Mon, 31 May 2021 19:15:46 -0500 Subject: [PATCH 4/4] Replace uses of testval with initval --- pymc3/distributions/bart.py | 2 +- pymc3/distributions/bound.py | 8 ++-- pymc3/distributions/distribution.py | 12 +++--- pymc3/distributions/mixture.py | 4 +- pymc3/distributions/multivariate.py | 18 ++++----- pymc3/tests/models.py | 18 ++++----- pymc3/tests/test_distributions_timeseries.py | 10 ++--- pymc3/tests/test_model.py | 4 +- pymc3/tests/test_sampling.py | 20 +++++----- pymc3/tests/test_step.py | 8 ++-- pymc3/tests/test_transforms.py | 40 ++++++++++---------- pymc3/tests/test_types.py | 12 +++--- pymc3/tests/test_variational_inference.py | 2 +- 13 files changed, 79 insertions(+), 79 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index 4914844555..69c89fea1f 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -27,7 +27,7 @@ def __init__(self, X, Y, m=200, alpha=0.25, split_prior=None, *args, **kwargs): self.X, self.Y, self.missing_data = self.preprocess_XY(X, Y) - super().__init__(shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs) + super().__init__(shape=X.shape[0], dtype="float64", initval=0, *args, **kwargs) if self.X.ndim != 2: raise ValueError("The design matrix X must have two dimensions") diff --git a/pymc3/distributions/bound.py b/pymc3/distributions/bound.py index bc0e168f38..bbb19d5065 100644 --- a/pymc3/distributions/bound.py +++ b/pymc3/distributions/bound.py @@ -42,7 +42,7 @@ def __init__(self, distribution, lower, upper, default, *args, **kwargs): super().__init__( shape=self._wrapped.shape, dtype=self._wrapped.dtype, - testval=self._wrapped.testval, + initval=self._wrapped.initval, defaults=defaults, transform=self._wrapped.transform, ) @@ -252,15 +252,15 @@ class Bound: with pm.Model(): NegativeNormal = pm.Bound(pm.Normal, upper=0.0) - par1 = NegativeNormal('par`', mu=0.0, sigma=1.0, testval=-0.5) + par1 = NegativeNormal('par`', mu=0.0, sigma=1.0, initval=-0.5) # you can use the Bound object multiple times to # create multiple bounded random variables - par1_1 = NegativeNormal('par1_1', mu=-1.0, 
sigma=1.0, testval=-1.5) + par1_1 = NegativeNormal('par1_1', mu=-1.0, sigma=1.0, initval=-1.5) # you can also define a Bound implicitly, while applying # it to a random variable par2 = pm.Bound(pm.Normal, lower=-1.0, upper=1.0)( - 'par2', mu=0.0, sigma=1.0, testval=1.0) + 'par2', mu=0.0, sigma=1.0, initval=1.0) """ def __init__(self, distribution, lower=None, upper=None): diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index eb58573e6c..41954e83dd 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -277,14 +277,14 @@ def __init__( self, shape, dtype, - testval=None, + initval=None, defaults=(), parent_dist=None, *args, **kwargs, ): super().__init__( - shape=shape, dtype=dtype, testval=testval, defaults=defaults, *args, **kwargs + shape=shape, dtype=dtype, initval=initval, defaults=defaults, *args, **kwargs ) self.parent_dist = parent_dist @@ -342,7 +342,7 @@ def __init__( logp, shape=(), dtype=None, - testval=0, + initval=0, random=None, wrap_random_with_dist_shape=True, check_shape_in_random=True, @@ -363,8 +363,8 @@ a value here. dtype: None, str (Optional) The dtype of the distribution. - testval: number or array (Optional) - The ``testval`` of the RV's tensor that follow the ``DensityDist`` + initval: number or array (Optional) + The ``initval`` of the RV's tensor that follows the ``DensityDist`` distribution. args, kwargs: (Optional) These are passed to the parent class' ``__init__``. @@ -400,7 +400,7 @@ """ if dtype is None: dtype = aesara.config.floatX - super().__init__(shape, dtype, testval, *args, **kwargs) + super().__init__(shape, dtype, initval, *args, **kwargs) self.logp = logp if type(self.logp) == types.MethodType: if PLATFORM != "linux": diff --git a/pymc3/distributions/mixture.py b/pymc3/distributions/mixture.py index 3d82436f7d..a462f81e2d 100644 --- a/pymc3/distributions/mixture.py +++ b/pymc3/distributions/mixture.py @@ -609,7 +609,7 @@ class NormalMixture(Mixture): 10, shape=n_components, transform=pm.transforms.ordered, - testval=[1, 2, 3], + initval=[1, 2, 3], ) σ = pm.HalfNormal("σ", 10, shape=n_components) weights = pm.Dirichlet("w", np.ones(n_components)) @@ -684,7 +684,7 @@ def __init__(self, w, comp_dists, mixture_axis=-1, *args, **kwargs): self.mixture_axis = mixture_axis kwargs.setdefault("dtype", self.comp_dists.dtype) - # Compute the mode so we don't always have to pass a testval + # Compute the mode so we don't always have to pass an initval defaults = kwargs.pop("defaults", []) event_shape = self.comp_dists.shape[mixture_axis + 1 :] _w = at.shape_padleft( diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 4eb6b01817..7cea4a90e2 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -840,7 +840,7 @@ def logp(self, X): ) -def WishartBartlett(name, S, nu, is_cholesky=False, return_cholesky=False, testval=None): +def WishartBartlett(name, S, nu, is_cholesky=False, return_cholesky=False, initval=None): R""" Bartlett decomposition of the Wishart distribution. As the Wishart distribution requires the matrix to be symmetric positive semi-definite is_cholesky: bool (default=False) Input matrix S is already Cholesky decomposed as S.T * S return_cholesky: bool (default=False) Only return the Cholesky decomposed matrix. 
- testval: ndarray + initval: ndarray p x p positive definite matrix used to initialize Notes @@ -894,21 +894,21 @@ def WishartBartlett(name, S, nu, is_cholesky=False, return_cholesky=False, testv n_diag = len(diag_idx[0]) n_tril = len(tril_idx[0]) - if testval is not None: + if initval is not None: # Inverse transform - testval = np.dot(np.dot(np.linalg.inv(L), testval), np.linalg.inv(L.T)) - testval = linalg.cholesky(testval, lower=True) - diag_testval = testval[diag_idx] ** 2 - tril_testval = testval[tril_idx] + initval = np.dot(np.dot(np.linalg.inv(L), initval), np.linalg.inv(L.T)) + initval = linalg.cholesky(initval, lower=True) + diag_testval = initval[diag_idx] ** 2 + tril_testval = initval[tril_idx] else: diag_testval = None tril_testval = None c = at.sqrt( - ChiSquared("%s_c" % name, nu - np.arange(2, 2 + n_diag), shape=n_diag, testval=diag_testval) + ChiSquared("%s_c" % name, nu - np.arange(2, 2 + n_diag), shape=n_diag, initval=diag_testval) ) pm._log.info("Added new variable %s_c to model diagonal of Wishart." % name) - z = Normal("%s_z" % name, 0.0, 1.0, shape=n_tril, testval=tril_testval) + z = Normal("%s_z" % name, 0.0, 1.0, shape=n_tril, initval=tril_testval) pm._log.info("Added new variable %s_z to model off-diagonals of Wishart." % name) # Construct A matrix A = at.zeros(S.shape, dtype=np.float32) diff --git a/pymc3/tests/models.py b/pymc3/tests/models.py index 0289386e54..78324e72c7 100644 --- a/pymc3/tests/models.py +++ b/pymc3/tests/models.py @@ -30,7 +30,7 @@ def simple_model(): mu = -2.1 tau = 1.3 with Model() as model: - Normal("x", mu, tau=tau, size=2, testval=floatX_array([0.1, 0.1])) + Normal("x", mu, tau=tau, size=2, initval=floatX_array([0.1, 0.1])) return model.initial_point, model, (mu, tau ** -0.5) @@ -39,7 +39,7 @@ def simple_categorical(): p = floatX_array([0.1, 0.2, 0.3, 0.4]) v = floatX_array([0.0, 1.0, 2.0, 3.0]) with Model() as model: - Categorical("x", p, size=3, testval=[1, 2, 3]) + Categorical("x", p, size=3, initval=[1, 2, 3]) mu = np.dot(p, v) var = np.dot(p, (v - mu) ** 2) @@ -50,7 +50,7 @@ def multidimensional_model(): mu = -2.1 tau = 1.3 with Model() as model: - Normal("x", mu, tau=tau, size=(3, 2), testval=0.1 * np.ones((3, 2))) + Normal("x", mu, tau=tau, size=(3, 2), initval=0.1 * np.ones((3, 2))) return model.initial_point, model, (mu, tau ** -0.5) @@ -81,7 +81,7 @@ def simple_2model(): tau = 1.3 p = 0.4 with Model() as model: - x = pm.Normal("x", mu, tau=tau, testval=0.1) + x = pm.Normal("x", mu, tau=tau, initval=0.1) pm.Deterministic("logx", at.log(x)) pm.Bernoulli("y", p) return model.initial_point, model @@ -91,7 +91,7 @@ def simple_2model_continuous(): mu = -2.1 tau = 1.3 with Model() as model: - x = pm.Normal("x", mu, tau=tau, testval=0.1) + x = pm.Normal("x", mu, tau=tau, initval=0.1) pm.Deterministic("logx", at.log(x)) pm.Beta("y", alpha=1, beta=1, size=2) return model.initial_point, model @@ -106,7 +106,7 @@ def mv_simple(): "x", at.constant(mu), tau=at.constant(tau), - testval=floatX_array([0.1, 1.0, 0.8]), + initval=floatX_array([0.1, 1.0, 0.8]), ) H = tau C = np.linalg.inv(H) @@ -122,7 +122,7 @@ def mv_simple_coarse(): "x", at.constant(mu), tau=at.constant(tau), - testval=floatX_array([0.1, 1.0, 0.8]), + initval=floatX_array([0.1, 1.0, 0.8]), ) H = tau C = np.linalg.inv(H) @@ -138,7 +138,7 @@ def mv_simple_very_coarse(): "x", at.constant(mu), tau=at.constant(tau), - testval=floatX_array([0.1, 1.0, 0.8]), + initval=floatX_array([0.1, 1.0, 0.8]), ) H = tau C = np.linalg.inv(H) @@ -150,7 +150,7 @@ def mv_simple_discrete(): n = 5 p = 
floatX_array([0.15, 0.85]) with pm.Model() as model: - pm.Multinomial("x", n, at.constant(p), testval=np.array([1, 4])) + pm.Multinomial("x", n, at.constant(p), initval=np.array([1, 4])) mu = n * p # covariance matrix C = np.zeros((d, d)) diff --git a/pymc3/tests/test_distributions_timeseries.py b/pymc3/tests/test_distributions_timeseries.py index 4f55d90214..961644e6d4 100644 --- a/pymc3/tests/test_distributions_timeseries.py +++ b/pymc3/tests/test_distributions_timeseries.py @@ -68,13 +68,13 @@ def test_AR_nd(): beta_tp = np.random.randn(p, n) y_tp = np.random.randn(T, n) with Model() as t0: - beta = Normal("beta", 0.0, 1.0, shape=(p, n), testval=beta_tp) - AR("y", beta, sigma=1.0, shape=(T, n), testval=y_tp) + beta = Normal("beta", 0.0, 1.0, shape=(p, n), initval=beta_tp) + AR("y", beta, sigma=1.0, shape=(T, n), initval=y_tp) with Model() as t1: - beta = Normal("beta", 0.0, 1.0, shape=(p, n), testval=beta_tp) + beta = Normal("beta", 0.0, 1.0, shape=(p, n), initval=beta_tp) for i in range(n): - AR("y_%d" % i, beta[:, i], sigma=1.0, shape=T, testval=y_tp[:, i]) + AR("y_%d" % i, beta[:, i], sigma=1.0, shape=T, initval=y_tp[:, i]) np.testing.assert_allclose(t0.logp(t0.initial_point), t1.logp(t1.initial_point)) @@ -150,7 +150,7 @@ def test_linear(): # build model with Model() as model: lamh = Flat("lamh") - xh = EulerMaruyama("xh", dt, sde, (lamh,), shape=N + 1, testval=x) + xh = EulerMaruyama("xh", dt, sde, (lamh,), shape=N + 1, initval=x) Normal("zh", mu=xh, sigma=sig2, observed=z) # invert with model: diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index 772198908f..53ab66af21 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -57,7 +57,7 @@ def __init__(self, mean=0, sigma=1, name="", model=None): super().__init__(name, model) self.register_rv(Normal.dist(mu=mean, sigma=sigma), "v1") Normal("v2", mu=mean, sigma=sigma) - Normal("v3", mu=mean, sigma=Normal("sd", mu=10, sigma=1, testval=1.0)) + Normal("v3", mu=mean, sigma=Normal("sd", mu=10, sigma=1, initval=1.0)) Deterministic("v3_sq", self.v3 ** 2) Potential("p1", at.constant(1)) @@ -462,7 +462,7 @@ def test_make_obs_var(): fake_model = pm.Model() with fake_model: fake_distribution = pm.Normal.dist(mu=0, sigma=1) - # Create the testval attribute simply for the sake of model testing + # Create the initval attribute simply for the sake of model testing fake_distribution.name = input_name # Check function behavior using the various inputs diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 8756138c15..9b0a39602a 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -387,7 +387,7 @@ def test_shared_named(self): mu=np.atleast_2d(0), tau=np.atleast_2d(1e20), size=(1, 1), - testval=np.atleast_2d(0), + initval=np.atleast_2d(0), ) theta = pm.Normal( "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) @@ -403,7 +403,7 @@ def test_shared_unnamed(self): mu=np.atleast_2d(0), tau=np.atleast_2d(1e20), size=(1, 1), - testval=np.atleast_2d(0), + initval=np.atleast_2d(0), ) theta = pm.Normal( "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) @@ -419,7 +419,7 @@ def test_constant_named(self): mu=np.atleast_2d(0), tau=np.atleast_2d(1e20), size=(1, 1), - testval=np.atleast_2d(0), + initval=np.atleast_2d(0), ) theta = pm.Normal( "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) @@ -688,10 +688,10 @@ def test_deterministic_of_observed_modified_interface(self): meas_in_1 = pm.aesaraf.floatX(2 + 4 * 
rng.randn(100)) meas_in_2 = pm.aesaraf.floatX(5 + 4 * rng.randn(100)) with pm.Model(rng_seeder=rng) as model: - mu_in_1 = pm.Normal("mu_in_1", 0, 1, testval=0) - sigma_in_1 = pm.HalfNormal("sd_in_1", 1, testval=1) - mu_in_2 = pm.Normal("mu_in_2", 0, 1, testval=0) - sigma_in_2 = pm.HalfNormal("sd__in_2", 1, testval=1) + mu_in_1 = pm.Normal("mu_in_1", 0, 1, initval=0) + sigma_in_1 = pm.HalfNormal("sd_in_1", 1, initval=1) + mu_in_2 = pm.Normal("mu_in_2", 0, 1, initval=0) + sigma_in_2 = pm.HalfNormal("sd__in_2", 1, initval=1) in_1 = pm.Normal("in_1", mu_in_1, sigma_in_1, observed=meas_in_1) in_2 = pm.Normal("in_2", mu_in_2, sigma_in_2, observed=meas_in_2) @@ -882,7 +882,7 @@ def _mocked_init_nuts(*args, **kwargs): @pytest.mark.parametrize( - "testval, jitter_max_retries, expectation", + "initval, jitter_max_retries, expectation", [ (0, 0, pytest.raises(SamplingError)), (0, 1, pytest.raises(SamplingError)), @@ -891,9 +891,9 @@ def _mocked_init_nuts(*args, **kwargs): (1, 0, does_not_raise()), ], ) -def test_init_jitter(testval, jitter_max_retries, expectation): +def test_init_jitter(initval, jitter_max_retries, expectation): with pm.Model() as m: - pm.HalfNormal("x", transform=None, testval=testval) + pm.HalfNormal("x", transform=None, initval=initval) with expectation: # Starting value is negative (invalid) when np.random.rand returns 0 (jitter = -1) diff --git a/pymc3/tests/test_step.py b/pymc3/tests/test_step.py index 7f4796755c..1daf0e1c57 100644 --- a/pymc3/tests/test_step.py +++ b/pymc3/tests/test_step.py @@ -964,7 +964,7 @@ def test_multiple_samplers(self, caplog): def test_bad_init_nonparallel(self): with Model(): - HalfNormal("a", sigma=1, testval=-1, transform=None) + HalfNormal("a", sigma=1, initval=-1, transform=None) with pytest.raises(SamplingError) as error: sample(init=None, chains=1, random_seed=1) error.match("Initial evaluation") @@ -972,17 +972,17 @@ def test_bad_init_nonparallel(self): @pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") def test_bad_init_parallel(self): with Model(): - HalfNormal("a", sigma=1, testval=-1, transform=None) + HalfNormal("a", sigma=1, initval=-1, transform=None) with pytest.raises(SamplingError) as error: sample(init=None, cores=2, random_seed=1) error.match("Initial evaluation") def test_linalg(self, caplog): with Model(): - a = Normal("a", size=2, testval=floatX(np.zeros(2))) + a = Normal("a", size=2, initval=floatX(np.zeros(2))) a = at.switch(a > 0, np.inf, a) b = at.slinalg.solve(floatX(np.eye(2)), a) - Normal("c", mu=b, size=2, testval=floatX(np.r_[0.0, 0.0])) + Normal("c", mu=b, size=2, initval=floatX(np.r_[0.0, 0.0])) caplog.clear() trace = sample(20, init=None, tune=5, chains=2) warns = [msg.msg for msg in caplog.records] diff --git a/pymc3/tests/test_transforms.py b/pymc3/tests/test_transforms.py index 2c0265aabe..280471a09e 100644 --- a/pymc3/tests/test_transforms.py +++ b/pymc3/tests/test_transforms.py @@ -227,7 +227,7 @@ def test_interval_near_boundary(): x0 = np.nextafter(ub, lb) with pm.Model() as model: - pm.Uniform("x", testval=x0, lower=lb, upper=ub) + pm.Uniform("x", initval=x0, lower=lb, upper=ub) log_prob = model.point_logps() np.testing.assert_allclose(log_prob, np.array([-52.68])) @@ -274,11 +274,11 @@ def test_chain_jacob_det(): class TestElementWiseLogp(SeededTest): - def build_model(self, distfam, params, size, transform, testval=None): - if testval is not None: - testval = pm.floatX(testval) + def build_model(self, distfam, params, size, transform, initval=None): + if initval is not 
None: + initval = pm.floatX(initval) with pm.Model() as m: - distfam("x", size=size, transform=transform, testval=testval, **params) + distfam("x", size=size, transform=transform, initval=initval, **params) return m def check_transform_elementwise_logp(self, model): @@ -408,7 +408,7 @@ def test_normal_ordered(self): pm.Normal, {"mu": 0.0, "sd": 1.0}, size=3, - testval=np.asarray([-1.0, 1.0, 4.0]), + initval=np.asarray([-1.0, 1.0, 4.0]), transform=tr.ordered, ) self.check_vectortransform_elementwise_logp(model, vect_opt=0) @@ -422,24 +422,24 @@ def test_normal_ordered(self): ) @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_half_normal_ordered(self, sd, size): - testval = np.sort(np.abs(np.random.randn(*size))) + initval = np.sort(np.abs(np.random.randn(*size))) model = self.build_model( pm.HalfNormal, {"sd": sd}, size=size, - testval=testval, + initval=initval, transform=tr.Chain([tr.log, tr.ordered]), ) self.check_vectortransform_elementwise_logp(model, vect_opt=0) @pytest.mark.parametrize("lam,size", [(2.5, (2,)), (np.ones(3), (4, 3))]) def test_exponential_ordered(self, lam, size): - testval = np.sort(np.abs(np.random.randn(*size))) + initval = np.sort(np.abs(np.random.randn(*size))) model = self.build_model( pm.Exponential, {"lam": lam}, size=size, - testval=testval, + initval=initval, transform=tr.Chain([tr.log, tr.ordered]), ) self.check_vectortransform_elementwise_logp(model, vect_opt=0) @@ -452,12 +452,12 @@ def test_exponential_ordered(self, lam, size): ], ) def test_beta_ordered(self, a, b, size): - testval = np.sort(np.abs(np.random.rand(*size))) + initval = np.sort(np.abs(np.random.rand(*size))) model = self.build_model( pm.Beta, {"alpha": a, "beta": b}, size=size, - testval=testval, + initval=initval, transform=tr.Chain([tr.logodds, tr.ordered]), ) self.check_vectortransform_elementwise_logp(model, vect_opt=0) @@ -475,24 +475,24 @@ def transform_params(rv_var): interval = tr.Interval(transform_params) - testval = np.sort(np.abs(np.random.rand(*size))) + initval = np.sort(np.abs(np.random.rand(*size))) model = self.build_model( pm.Uniform, {"lower": lower, "upper": upper}, size=size, - testval=testval, + initval=initval, transform=tr.Chain([interval, tr.ordered]), ) self.check_vectortransform_elementwise_logp(model, vect_opt=1) @pytest.mark.parametrize("mu,kappa,size", [(0.0, 1.0, (2,)), (np.zeros(3), np.ones(3), (4, 3))]) def test_vonmises_ordered(self, mu, kappa, size): - testval = np.sort(np.abs(np.random.rand(*size))) + initval = np.sort(np.abs(np.random.rand(*size))) model = self.build_model( pm.VonMises, {"mu": mu, "kappa": kappa}, size=size, - testval=testval, + initval=initval, transform=tr.Chain([tr.circular, tr.ordered]), ) self.check_vectortransform_elementwise_logp(model, vect_opt=0) @@ -506,12 +506,12 @@ def test_vonmises_ordered(self, mu, kappa, size): ], ) def test_uniform_other(self, lower, upper, size, transform): - testval = np.ones(size) / size[-1] + initval = np.ones(size) / size[-1] model = self.build_model( pm.Uniform, {"lower": lower, "upper": upper}, size=size, - testval=testval, + initval=initval, transform=transform, ) self.check_vectortransform_elementwise_logp(model, vect_opt=1) @@ -524,9 +524,9 @@ def test_uniform_other(self, lower, upper, size, transform): ], ) def test_mvnormal_ordered(self, mu, cov, size, shape): - testval = np.sort(np.random.randn(*shape)) + initval = np.sort(np.random.randn(*shape)) model = self.build_model( - pm.MvNormal, {"mu": mu, "cov": cov}, size=size, testval=testval, 
transform=tr.ordered + pm.MvNormal, {"mu": mu, "cov": cov}, size=size, initval=initval, transform=tr.ordered ) self.check_vectortransform_elementwise_logp(model, vect_opt=1) diff --git a/pymc3/tests/test_types.py b/pymc3/tests/test_types.py index c38c04edf8..7bfd260664 100644 --- a/pymc3/tests/test_types.py +++ b/pymc3/tests/test_types.py @@ -37,7 +37,7 @@ def teardown_method(self): @aesara.config.change_flags({"floatX": "float64", "warn_float64": "ignore"}) def test_float64(self): with Model() as model: - x = Normal("x", testval=np.array(1.0, dtype="float64")) + x = Normal("x", initval=np.array(1.0, dtype="float64")) obs = Normal("obs", mu=x, sigma=1.0, observed=np.random.randn(5)) assert x.dtype == "float64" @@ -50,7 +50,7 @@ def test_float64(self): @aesara.config.change_flags({"floatX": "float32", "warn_float64": "warn"}) def test_float32(self): with Model() as model: - x = Normal("x", testval=np.array(1.0, dtype="float32")) + x = Normal("x", initval=np.array(1.0, dtype="float32")) obs = Normal("obs", mu=x, sigma=1.0, observed=np.random.randn(5).astype("float32")) assert x.dtype == "float32" @@ -65,11 +65,11 @@ def test_float64_MLDA(self): data = np.random.randn(5) with Model() as coarse_model: - x = Normal("x", testval=np.array(1.0, dtype="float64")) + x = Normal("x", initval=np.array(1.0, dtype="float64")) obs = Normal("obs", mu=x, sigma=1.0, observed=data + 0.5) with Model() as model: - x = Normal("x", testval=np.array(1.0, dtype="float64")) + x = Normal("x", initval=np.array(1.0, dtype="float64")) obs = Normal("obs", mu=x, sigma=1.0, observed=data) assert x.dtype == "float64" @@ -83,11 +83,11 @@ def test_float32_MLDA(self): data = np.random.randn(5).astype("float32") with Model() as coarse_model: - x = Normal("x", testval=np.array(1.0, dtype="float32")) + x = Normal("x", initval=np.array(1.0, dtype="float32")) obs = Normal("obs", mu=x, sigma=1.0, observed=data + 0.5) with Model() as model: - x = Normal("x", testval=np.array(1.0, dtype="float32")) + x = Normal("x", initval=np.array(1.0, dtype="float32")) obs = Normal("obs", mu=x, sigma=1.0, observed=data) assert x.dtype == "float32" diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index b083e57870..a4a470dfe0 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -655,7 +655,7 @@ def simple_model_data(use_minibatch): def simple_model(simple_model_data): with pm.Model() as model: mu_ = pm.Normal( - "mu", mu=simple_model_data["mu0"], sigma=simple_model_data["sigma0"], testval=0 + "mu", mu=simple_model_data["mu0"], sigma=simple_model_data["sigma0"], initval=0 ) pm.Normal( "x",