diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml
index 057a109..ed38ce5 100644
--- a/.github/workflows/python-tests.yml
+++ b/.github/workflows/python-tests.yml
@@ -48,7 +48,6 @@ jobs:
     strategy:
       matrix:
         python-version: [3.7]
-        pymc3-version: [stable, dev]
 
     steps:
     - uses: actions/checkout@v2
@@ -61,7 +60,6 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        if [[ "${{ matrix.pymc3-version }}" != "stable" ]]; then pip install "pymc3 @ git+https://github.com/pymc-devs/pymc3.git@master"; fi
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Test with pytest
       run: |
diff --git a/pymc3_hmm/distributions.py b/pymc3_hmm/distributions.py
index 4554beb..7fd360e 100644
--- a/pymc3_hmm/distributions.py
+++ b/pymc3_hmm/distributions.py
@@ -1,108 +1,213 @@
-import warnings
+from copy import copy
+from typing import Sequence
 
+import aesara
+import aesara.tensor as at
 import numpy as np
+import pymc3 as pm
+from aesara.compile.builders import OpFromGraph
+from aesara.graph.basic import Constant
+from aesara.graph.fg import FunctionGraph
+from aesara.graph.op import compute_test_value
+from aesara.graph.opt import local_optimizer, pre_greedy_local_optimizer
+from aesara.scalar import upcast
+from aesara.tensor.basic import make_vector
+from aesara.tensor.extra_ops import BroadcastTo
+from aesara.tensor.random.basic import categorical
+from aesara.tensor.random.op import RandomVariable
+from aesara.tensor.random.opt import (
+    local_dimshuffle_rv_lift,
+    local_rv_size_lift,
+    local_subtensor_rv_lift,
+)
+from aesara.tensor.random.utils import broadcast_params, normalize_size_param
+from aesara.tensor.type_other import NoneConst
+from aesara.tensor.var import TensorVariable
+from pymc3.aesaraf import change_rv_size
+from pymc3.distributions.logprob import _logp, logpt
+
+
+@local_optimizer([BroadcastTo])
+def naive_bcast_rv_lift(fgraph, node):
+    """Lift a ``BroadcastTo`` through a ``RandomVariable`` ``Op``.
+
+    XXX: This implementation simply broadcasts the ``RandomVariable``'s
+    parameters, which won't always work (e.g. multivariate distributions).
+
+    TODO: Instead, it should use ``RandomVariable.ndim_supp``--and the like--to
+    determine which dimensions of each parameter need to be broadcasted.
+    Also, this doesn't need to remove ``size`` to perform the lifting, like it
+    currently does.
+    """
 
-try:  # pragma: no cover
-    import aesara
-    import aesara.tensor as at
-    from aesara.graph.op import get_test_value
-    from aesara.graph.utils import TestValueError
-    from aesara.scalar import upcast
-    from aesara.tensor.extra_ops import broadcast_to as at_broadcast_to
-except ImportError:  # pragma: no cover
-    import theano as aesara
-    import theano.tensor as at
-    from theano.graph.op import get_test_value
-    from theano.graph.utils import TestValueError
-    from theano.scalar import upcast
-    from theano.tensor.extra_ops import broadcast_to as at_broadcast_to
+    if not (
+        isinstance(node.op, BroadcastTo)
+        and node.inputs[0].owner
+        and isinstance(node.inputs[0].owner.op, RandomVariable)
+    ):
+        return
 
-import pymc3 as pm
-from pymc3.distributions.distribution import _DrawValuesContext, draw_values
-from pymc3.distributions.mixture import _conversion_map, all_discrete
-
-from pymc3_hmm.utils import tt_broadcast_arrays, tt_expand_dims, vsearchsorted
-
-
-def distribution_subset_args(dist, shape, idx, point=None):
-    """Obtain subsets of a distribution parameters via indexing.
-
-    This is used to effectively "lift" slices/`Subtensor` `Op`s up to a
-    distribution's parameters.  In other words, `pm.Normal(mu, sigma)[idx]`
-    becomes `pm.Normal(mu[idx], sigma[idx])`.  In computations, the former
-    requires the entire evaluation of `pm.Normal(mu, sigma)` (e.g. its `.logp`
-    or a sample from `.random`), which could be very complex, while the latter
-    only evaluates the subset of interest.
-
-    XXX: this lifting isn't appropriate for every distribution.  It's fine for
-    most scalar distributions and even some multivariate distributions, but
-    some required functionality is missing in order to handle even the latter.
-
-    Parameters
-    ----------
-    dist : Distribution
-        The distribution object with the parameters to be indexed.
-    shape : tuple or Shape
-        The shape of the distribution's output/support.  This is used
-        to (naively) determine the parameters' broadcasting pattern.
-    idx : ndarray or TensorVariable
-        The indices applied to the parameters of `dist`.
-    point : dict (optional)
-        A dictionary keyed on the `str` names of each parameter in `dist`,
-        which are mapped to NumPy values for the corresponding parameter.  When
-        this is given, the Theano parameters are replaced by their values in the
-        dictionary.
-
-    Returns
-    -------
-    res: list
-        An ordered set of broadcasted and indexed parameters for `dist`.
+    bcast_shape = node.inputs[1:]
 
+    assert len(bcast_shape) > 0
 
-    """
+    rv_var = node.inputs[0]
+    rv_node = rv_var.owner
 
-    dist_param_names = dist._distr_parameters_for_repr()
+    if hasattr(fgraph, "dont_touch_vars") and rv_var in fgraph.dont_touch_vars:
+        return
 
-    if point:
-        # Try to get a concrete/NumPy value if a `point` parameter was
-        # given.
-        try:
-            idx = get_test_value(idx)
-        except TestValueError:  # pragma: no cover
-            pass
+    size_lift_res = local_rv_size_lift.transform(fgraph, rv_node)
+    if size_lift_res is None:
+        lifted_node = rv_node
+    else:
+        _, lifted_rv = size_lift_res
+        lifted_node = lifted_rv.owner
 
-    res = []
-    for param in dist_param_names:
+    rng, size, dtype, *dist_params = lifted_node.inputs
 
-        # Use the (sampled) point, if present
-        if point is None or param not in point:
-            x = getattr(dist, param, None)
+    new_dist_params = [at.broadcast_to(param, bcast_shape) for param in dist_params]
+    bcasted_node = lifted_node.op.make_node(rng, size, dtype, *new_dist_params)
 
-            if x is None:
-                continue
-        else:
-            x = point[param]
+    if aesara.config.compute_test_value != "off":
+        compute_test_value(bcasted_node)
 
-        bcast_res = at_broadcast_to(x, shape)
+    return [bcasted_node.outputs[1]]
 
-        res.append(bcast_res[idx])
 
-    return res
+def rv_pull_down(x: TensorVariable, dont_touch_vars=None) -> TensorVariable:
+    """Pull a ``RandomVariable`` ``Op`` down through a graph, when possible."""
+    if dont_touch_vars is None:
+        dont_touch_vars = []
+
+    fgraph = FunctionGraph(outputs=dont_touch_vars, clone=False)
 
+    return pre_greedy_local_optimizer(
+        fgraph,
+        [
+            local_dimshuffle_rv_lift,
+            local_subtensor_rv_lift,
+            naive_bcast_rv_lift,
+        ],
+        x,
+    )
 
-def get_and_check_comp_value(x):
-    if isinstance(x, pm.Distribution):
-        try:
-            return x.default()
-        except AttributeError:
-            pass
 
-        return x.random()
+def non_constant(x):
+    x = at.as_tensor_variable(x)
+    if isinstance(x, Constant):
+        # XXX: This isn't good for `size` parameters, because it could result
+        # in `at.get_vector_length` exceptions.
+        res = x.type()
+        res.tag = copy(res.tag)
+        if aesara.config.compute_test_value != "off":
+            res.tag.test_value = x.data
+        res.name = x.name
+        return res
     else:
-        raise TypeError(
-            "Component distributions must be PyMC3 Distributions. "
-            "Got {}".format(type(x))
+        return x
+
+
+class SwitchingProcessFactory(OpFromGraph):
+    default_output = 1
+    # FIXME: This is just to appease `random_make_inplace`
+    inplace = True
+
+    def make_node(self, *inputs):
+        # Make the `make_node` signature consistent with the node inputs
+        # TODO: This is a hack; make it less so.
+        num_expected_inps = len(self.local_inputs) - len(self.shared_inputs)
+        if len(inputs) > num_expected_inps:
+            inputs = inputs[:num_expected_inps]
+        return super().make_node(*inputs)
+
+
+# Allow `SwitchingProcessFactory`s to be typed as `RandomVariable`s
+RandomVariable.register(SwitchingProcessFactory)
+
+
+def create_switching_process_op(size, states, comp_rvs, output_shape=None):
+    # We use `make_vector` to preserve the known/fixed-length of our
+    # `size` parameter.  If we made this a simple `at.vector`, some
+    # shape-related steps in `RandomVariable` would unnecessarily fail.
+    size_param = make_vector(
+        *[non_constant(size[i]) for i in range(at.get_vector_length(size))]
+    )
+    size_param.name = "size"
+
+    # We need to make a copy of the state sequence, because we don't want
+    # or need anything above this part of the graph.
+    states_param = states.type()
+    states_param.name = states.name
+
+    # TODO: We should create shallow copies of the component distributions, as
+    # well.  In other words, the inputs to the `Op` we're constructing should
+    # be the inputs to these component distributions.
+    comp_rv_params = [non_constant(rv) for rv in comp_rvs]
+
+    dtype = upcast(*[rv.type.dtype for rv in comp_rv_params])
+    comp_ndim_supp = comp_rv_params[0].owner.op.ndim_supp
+
+    resized_states_param = at.broadcast_to(
+        states_param, tuple(size_param) + tuple(states_param.shape)
+    )
+
+    def resize_rv(x, size):
+        if at.get_vector_length(size):
+            return change_rv_size(x, size, expand=True)
+        else:
+            return x
+
+    resized_comp_rvs = [
+        # XXX: This will create new component distributions that are
+        # disconnected from the originals!  In other words,
+        # any reference to the old ones will be invalidated.
+        resize_rv(
+            rv_pull_down(at.atleast_1d(comp_rv), comp_rv.owner.inputs), size_param
         )
+        for comp_rv in comp_rv_params
+    ]
+
+    bcast_states, *bcast_comp_rvs = broadcast_params(
+        (resized_states_param,) + tuple(resized_comp_rvs),
+        (0,) + (comp_ndim_supp,) * len(resized_comp_rvs),
+    )
+
+    if output_shape is not None:
+        if comp_ndim_supp > 0 and at.get_vector_length(output_shape) > comp_ndim_supp:
+            bcast_states = at.broadcast_to(
+                bcast_states, tuple(output_shape[:-comp_ndim_supp])
+            )
+        bcast_comp_rvs = [at.broadcast_to(rv, output_shape) for rv in bcast_comp_rvs]
+    else:
+        output_shape = bcast_comp_rvs[0].shape
+
+    assert at.get_vector_length(output_shape) > 0
+
+    res = at.empty(output_shape, dtype=dtype)
+
+    for i, bcasted_comp_rv in enumerate(bcast_comp_rvs):
+        i_idx = at.nonzero(at.eq(bcast_states, i))
+        indexed_comp_rv = bcasted_comp_rv[i_idx]
+
+        lifted_comp_rv = rv_pull_down(indexed_comp_rv, bcasted_comp_rv.owner.inputs)
+
+        res = at.set_subtensor(res[i_idx], lifted_comp_rv)
+
+    new_op = SwitchingProcessFactory(
+        # The first and third parameters are simply placeholders so that the
+        # arguments signature matches `RandomVariable`'s
+        [at.iscalar(), size_param, at.iscalar(), states_param] + list(comp_rv_params),
+        [NoneConst, res],
+        inline=True,
+        on_unused_input="ignore",
+    )
+
+    # Add `RandomVariable`-like "metadata"
+    new_op.ndim_supp = comp_ndim_supp + 1
+    new_op.ndims_params = (1,) + tuple(comp.ndim for comp in comp_rv_params)
+
+    return new_op
 
 
 class SwitchingProcess(pm.Distribution):
@@ -112,149 +217,94 @@ class SwitchingProcess(pm.Distribution):
 
     """  # noqa: E501
 
-    def __init__(self, comp_dists, states, *args, **kwargs):
+    def __new__(cls, *args, **kwargs):
+        obs = kwargs.get("observed", None)
+
+        if obs is not None:
+            # XXX: This is a nasty hack to allow the "size changes" PyMC3
+            # performs on observed `RandomVariable`s.
+            kwargs["out_shape"] = tuple(obs.shape)
+
+        return super().__new__(cls, *args, **kwargs)
+
+    @classmethod
+    def dist(
+        cls,
+        comp_rvs: Sequence[TensorVariable],
+        states: TensorVariable,
+        *args,
+        size=None,
+        out_shape=None,
+        rng=None,
+        **kwargs
+    ):
         """Initialize a `SwitchingProcess` instance.
 
-        Each `Distribution` object in `comp_dists` must have a
-        `Distribution.random_subset` method that takes a list of indices and
-        returns a sample for only that subset.  Unfortunately, since PyMC3
-        doesn't provide such a method, you'll have to implement it yourself and
-        monkey patch a `Distribution` class.
-
         Parameters
         ----------
-        comp_dists : list of Distribution
-            A list containing `Distribution` objects for each mixture component.
-            These are essentially the emissions distributions.
-        states : DiscreteMarkovChain
+        comp_rvs:
+            A list containing `RandomVariable` objects for each mixture component.
+        states:
             The hidden state sequence.  It should have a number of states
             equal to the size of `comp_dists`.
 
         """
-        self.states = at.as_tensor_variable(pm.intX(states))
 
-        if len(comp_dists) > 31:
-            warnings.warn(
-                "There are too many mixture distributions to properly"
-                " determine their combined shape."
-            )
+        size = normalize_size_param(size)
 
-        self.comp_dists = comp_dists
+        out_shape = kwargs.pop("out_shape", None)
 
-        states_tv = get_test_value(self.states)
-        bcast_comps = np.broadcast(
-            states_tv, *[get_and_check_comp_value(x) for x in comp_dists[:31]]
-        )
-        shape = bcast_comps.shape
+        states = at.as_tensor(states)
 
-        defaults = kwargs.pop("defaults", [])
+        new_comp_rvs = []
+        for rv in comp_rvs:
+            new_rv = at.as_tensor(rv)
+            new_rv.tag.value_var = new_rv.type()
+            new_comp_rvs.append(new_rv)
 
-        out_dtype = upcast(*[x.type.dtype for x in comp_dists])
-        dtype = kwargs.pop("dtype", out_dtype)
+        # TODO: Make sure `comp_rvs` are not in the/a model.
+        # This will help reduce any rewrite inconsistencies.
+        SwitchingProcessOp = create_switching_process_op(
+            size,
+            states,
+            new_comp_rvs,
+            output_shape=out_shape,
+        )
 
-        if not all_discrete(comp_dists):
-            try:
-                bcast_means = tt_broadcast_arrays(
-                    *([self.states] + [d.mean.astype(dtype) for d in self.comp_dists])
-                )
-                self.mean = at.choose(self.states, bcast_means[1:])
+        rv_var = SwitchingProcessOp(*([0, size, 0, states] + list(new_comp_rvs)))
 
-                if "mean" not in defaults:
-                    defaults.append("mean")
+        testval = kwargs.pop("testval", None)
 
-            except (AttributeError, ValueError, IndexError):  # pragma: no cover
-                pass
+        if testval is not None:
+            rv_var.tag.test_value = testval
 
-        try:
-            bcast_modes = tt_broadcast_arrays(
-                *([self.states] + [d.mode.astype(dtype) for d in self.comp_dists])
-            )
-            self.mode = at.choose(self.states, bcast_modes[1:])
+        return rv_var
 
-            if "mode" not in defaults:
-                defaults.append("mode")
 
-        except (AttributeError, ValueError, IndexError):  # pragma: no cover
-            pass
+@_logp.register(SwitchingProcessFactory)
+def switching_process_logp(op, var, rvs_to_values, *dist_params, **kwargs):
+    obs = rvs_to_values.get(var, var)
+    states, *comp_rvs = dist_params[: len(op.inputs[3:])]
 
-        super().__init__(shape=shape, dtype=dtype, defaults=defaults, **kwargs)
+    obs_tt = at.as_tensor_variable(obs)
 
-    def logp(self, obs):
-        """Return the scalar Theano log-likelihood at a point."""
+    logp_val = at.alloc(-np.inf, *tuple(obs_tt.shape))
 
-        obs_tt = at.as_tensor_variable(obs)
+    for i, comp_rv in enumerate(comp_rvs):
+        i_idx = at.nonzero(at.eq(states, i))
+        obs_i = obs_tt[i_idx]
 
-        logp_val = at.alloc(-np.inf, *obs.shape)
+        bcasted_comp_rv = at.broadcast_to(comp_rv, obs_tt.shape)
+        indexed_comp_rv = bcasted_comp_rv[i_idx]
 
-        for i, dist in enumerate(self.comp_dists):
-            i_mask = at.eq(self.states, i)
-            obs_i = obs_tt[i_mask]
-            subset_dist = dist.dist(*distribution_subset_args(dist, obs.shape, i_mask))
-            logp_val = at.set_subtensor(logp_val[i_mask], subset_dist.logp(obs_i))
+        lifted_comp_rv = rv_pull_down(indexed_comp_rv, comp_rv.owner.inputs)
 
-        return logp_val
+        logp_val = at.set_subtensor(logp_val[i_idx], logpt(lifted_comp_rv, obs_i))
 
-    def random(self, point=None, size=None):
-        """Sample from this distribution conditional on a given set of values.
+    if kwargs.get("sum", False):
+        logp_val = logp_val.sum()
 
-        Parameters
-        ----------
-        point: dict, optional
-            Dict of variable values on which random values are to be
-            conditioned (uses default point if not specified).
-        size: int, optional
-            Desired size of random sample (returns one sample if not
-            specified).
-
-        Returns
-        -------
-        array
-        """
-        with _DrawValuesContext():
-            (states,) = draw_values([self.states], point=point, size=size)
-
-        # This is a terrible thing to have to do here, but it's better than
-        # having to (know to) update `Distribution.shape` when/if dimensions
-        # change (e.g. when sampling new state sequences).
-        bcast_comps = np.broadcast(
-            states, *[dist.random(point=point) for dist in self.comp_dists]
-        )
-        self_shape = bcast_comps.shape
-
-        if size:
-            # `draw_values` will not honor the `size` parameter if its arguments
-            # don't contain random variables, so, when our `self.states` are
-            # constants, we have to broadcast `states` so that it matches `size +
-            # self.shape`.
-            expanded_states = np.broadcast_to(
-                states, tuple(np.atleast_1d(size)) + self_shape
-            )
-        else:
-            expanded_states = np.broadcast_to(states, self_shape)
-
-        samples = np.empty(expanded_states.shape)
-
-        for i, dist in enumerate(self.comp_dists):
-            # We want to sample from only the parts of our component
-            # distributions that are active given the states.
-            # This is only really relevant when the component distributions
-            # change over the state space (e.g. Poisson means that change
-            # over time).
-            # We could always sample such components over the entire space
-            # (e.g. time), but, for spaces with large dimension, that would
-            # be extremely costly and wasteful.
-            i_idx = np.where(expanded_states == i)
-            i_size = len(i_idx[0])
-            if i_size > 0:
-                subset_args = distribution_subset_args(
-                    dist, expanded_states.shape, i_idx, point=point
-                )
-                state_dist = dist.dist(*subset_args)
-
-                sample = state_dist.random(point=point)
-                samples[i_idx] = sample
-
-        return samples
+    return logp_val
 
 
 class PoissonZeroProcess(SwitchingProcess):
@@ -264,7 +314,8 @@ class PoissonZeroProcess(SwitchingProcess):
     the second mixture component is the Poisson random variable.
     """
 
-    def __init__(self, mu=None, states=None, **kwargs):
+    @classmethod
+    def dist(cls, mu=None, states=None, rng=None, **kwargs):
         """Initialize a `PoissonZeroProcess` object.
 
         Parameters
@@ -275,13 +326,93 @@ def __init__(self, mu=None, states=None, **kwargs):
             A vector of integer 0-1 states that indicate which component of
             the mixture is active at each point/time.
         """
-        self.mu = at.as_tensor_variable(pm.floatX(mu))
-        self.states = at.as_tensor_variable(states)
+        mu = at.as_tensor_variable(mu)
+        states = at.as_tensor_variable(states)
+
+        # NOTE: This creates distributions that are *not* part of a `Model`
+        return super().dist(
+            [pm.Constant.dist(0), pm.Poisson.dist(mu, rng=rng)],
+            states,
+            rng=rng,
+            **kwargs
+        )
+
+
+class DiscreteMarkovChainFactory(OpFromGraph):
+    # Add `RandomVariable`-like "metadata"
+    ndim_supp = 1
+    ndims_params = (3, 1)
+    default_output = 1
+    # FIXME: This is just to appease `random_make_inplace`
+    inplace = True
+
 
-        super().__init__([pm.Constant.dist(0), pm.Poisson.dist(mu)], states, **kwargs)
+# Allow `DiscreteMarkovChainFactory`s to be typed as `RandomVariable`s
+RandomVariable.register(DiscreteMarkovChainFactory)
 
 
-class DiscreteMarkovChain(pm.Discrete):
+def create_discrete_mc_op(rng, size, Gammas, gamma_0):
+
+    # Again, we need to preserve the length of this symbolic vector, so we do
+    # this.
+    size_param = make_vector(
+        *[non_constant(size[i]) for i in range(at.get_vector_length(size))]
+    )
+    size_param.name = "size"
+
+    # We make shallow copies so that unwanted ancestors don't appear in the
+    # graph.
+    Gammas_param = non_constant(Gammas).type()
+    Gammas_param.name = "Gammas_param"
+
+    gamma_0_param = non_constant(gamma_0).type()
+    gamma_0_param.name = "gamma_0_param"
+
+    bcast_Gammas_param, bcast_gamma_0_param = broadcast_params(
+        (Gammas_param, gamma_0_param), (3, 1)
+    )
+
+    # Sample state 0 in each state sequence
+    state_0 = categorical(
+        bcast_gamma_0_param,
+        size=tuple(size_param) + tuple(bcast_gamma_0_param.shape[:-1]),
+        # size=at.join(0, size_param, bcast_gamma_0_param.shape[:-1]),
+        rng=rng,
+    )
+
+    N = bcast_Gammas_param.shape[-3]
+    states_shape = tuple(state_0.shape) + (N,)
+
+    bcast_Gammas_param = at.broadcast_to(
+        bcast_Gammas_param, states_shape + tuple(bcast_Gammas_param.shape[-2:])
+    )
+
+    def loop_fn(n, state_nm1, Gammas_inner, rng):
+        gamma_t = Gammas_inner[..., n, :, :]
+        idx = tuple(at.ogrid[[slice(None, d) for d in tuple(state_0.shape)]]) + (
+            state_nm1.T,
+        )
+        gamma_t = gamma_t[idx]
+        state_n = categorical(gamma_t, rng=rng)
+        return state_n.T
+
+    res, _ = aesara.scan(
+        loop_fn,
+        outputs_info=[{"initial": state_0.T, "taps": [-1]}],
+        sequences=[at.arange(N)],
+        non_sequences=[bcast_Gammas_param, rng],
+        # strict=True,
+    )
+
+    return DiscreteMarkovChainFactory(
+        [at.iscalar(), size_param, at.iscalar(), Gammas_param, gamma_0_param],
+        [rng, res.T],
+        inline=False,
+        on_unused_input="ignore",
+    )
+
+
+class DiscreteMarkovChain(pm.Distribution):
     """A first-order discrete Markov chain distribution.
 
     This class characterizes vector random variables consisting of state
@@ -290,7 +421,8 @@ class DiscreteMarkovChain(pm.Discrete):
 
     """
 
-    def __init__(self, Gammas, gamma_0, shape, **kwargs):
+    @classmethod
+    def dist(cls, Gammas, gamma_0, size=None, rng=None, **kwargs):
         """Initialize an `DiscreteMarkovChain` object.
 
         Parameters
@@ -304,145 +436,106 @@ def __init__(self, Gammas, gamma_0, shape, **kwargs):
         gamma_0: TensorVariable
             The initial state probabilities.  The last dimension should be length `M`,
             i.e. the number of distinct states.
-        shape: tuple of int
-            Shape of the state sequence.  The last dimension is `N`, i.e. the
-            length of the state sequence(s).
         """
-        self.gamma_0 = at.as_tensor_variable(pm.floatX(gamma_0))
+        gamma_0 = at.as_tensor_variable(pm.floatX(gamma_0))
 
         assert Gammas.ndim >= 3
 
-        self.Gammas = at.as_tensor_variable(pm.floatX(Gammas))
+        Gammas = at.as_tensor_variable(pm.floatX(Gammas))
 
-        shape = np.atleast_1d(shape)
+        size = normalize_size_param(size)
 
-        dtype = _conversion_map[aesara.config.floatX]
-        self.mode = np.zeros(tuple(shape), dtype=dtype)
+        if rng is None:
+            rng = aesara.shared(np.random.RandomState(), borrow=True)
 
-        super().__init__(shape=shape, **kwargs)
+        # rv_var = create_discrete_mc_op(size, Gammas, gamma_0)
+        DiscreteMarkovChainOp = create_discrete_mc_op(rng, size, Gammas, gamma_0)
+        rv_var = DiscreteMarkovChainOp(0, size, 0, Gammas, gamma_0)
 
-    def logp(self, states):
-        r"""Create a Theano graph that computes the log-likelihood for a discrete Markov chain.
+        testval = kwargs.pop("testval", None)
 
-        This is the log-likelihood for the joint distribution of states, :math:`S_t`, conditional
-        on state samples, :math:`s_t`, given by the following:
+        if testval is not None:
+            rv_var.tag.test_value = testval
 
-        .. math::
+        return rv_var
 
-            \int_{S_0} P(S_1 = s_1 \mid S_0) dP(S_0) \prod^{T}_{t=2} P(S_t = s_t \mid S_{t-1} = s_{t-1})
 
-        The first term (i.e. the integral) simply computes the marginal :math:`P(S_1 = s_1)`, so
-        another way to express this result is as follows:
+@_logp.register(DiscreteMarkovChainFactory)
+def discrete_mc_logp(op, var, rvs_to_values, *dist_params, **kwargs):
+    r"""Create a Aesara graph that computes the log-likelihood for a discrete Markov chain.
 
-        .. math::
+    This is the log-likelihood for the joint distribution of states, :math:`S_t`, conditional
+    on state samples, :math:`s_t`, given by the following:
 
-            P(S_1 = s_1) \prod^{T}_{t=2} P(S_t = s_t \mid S_{t-1} = s_{t-1})
+    .. math::
 
-        """  # noqa: E501
+        \int_{S_0} P(S_1 = s_1 \mid S_0) dP(S_0) \prod^{T}_{t=2} P(S_t = s_t \mid S_{t-1} = s_{t-1})
 
-        Gammas = at.shape_padleft(self.Gammas, states.ndim - (self.Gammas.ndim - 2))
+    The first term (i.e. the integral) simply computes the marginal :math:`P(S_1 = s_1)`, so
+    another way to express this result is as follows:
 
-        # Multiply the initial state probabilities by the first transition
-        # matrix by to get the marginal probability for state `S_1`.
-        # The integral that produces the marginal is essentially
-        # `gamma_0.dot(Gammas[0])`
-        Gamma_1 = Gammas[..., 0:1, :, :]
-        gamma_0 = tt_expand_dims(self.gamma_0, (-3, -1))
-        P_S_1 = at.sum(gamma_0 * Gamma_1, axis=-2)
+    .. math::
 
-        # The `tt.switch`s allow us to broadcast the indexing operation when
-        # the replication dimensions of `states` and `Gammas` don't match
-        # (e.g. `states.shape[0] > Gammas.shape[0]`)
-        S_1_slices = tuple(
-            slice(
-                at.switch(at.eq(P_S_1.shape[i], 1), 0, 0),
-                at.switch(at.eq(P_S_1.shape[i], 1), 1, d),
-            )
-            for i, d in enumerate(states.shape)
-        )
-        S_1_slices = (tuple(at.ogrid[S_1_slices]) if S_1_slices else tuple()) + (
-            states[..., 0:1],
-        )
-        logp_S_1 = at.log(P_S_1[S_1_slices]).sum(axis=-1)
-
-        # These are slices for the extra dimensions--including the state
-        # sequence dimension (e.g. "time")--along which which we need to index
-        # the transition matrix rows using the "observed" `states`.
-        trans_slices = tuple(
-            slice(
-                at.switch(
-                    at.eq(Gammas.shape[i], 1), 0, 1 if i == states.ndim - 1 else 0
-                ),
-                at.switch(at.eq(Gammas.shape[i], 1), 1, d),
-            )
-            for i, d in enumerate(states.shape)
-        )
-        trans_slices = (tuple(at.ogrid[trans_slices]) if trans_slices else tuple()) + (
-            states[..., :-1],
-        )
+        P(S_1 = s_1) \prod^{T}_{t=2} P(S_t = s_t \mid S_{t-1} = s_{t-1})
 
-        # Select the transition matrix row of each observed state; this yields
-        # `P(S_t | S_{t-1} = s_{t-1})`
-        P_S_2T = Gammas[trans_slices]
+    """  # noqa: E501
 
-        obs_slices = tuple(slice(None, d) for d in P_S_2T.shape[:-1])
-        obs_slices = (tuple(at.ogrid[obs_slices]) if obs_slices else tuple()) + (
-            states[..., 1:],
+    states = rvs_to_values.get(var, var)
+    Gammas, gamma_0 = dist_params[: len(op.inputs[3:])]
+
+    Gammas = at.shape_padleft(Gammas, states.ndim - (Gammas.ndim - 2))
+
+    # Multiply the initial state probabilities by the first transition
+    # matrix by to get the marginal probability for state `S_1`.
+    # The integral that produces the marginal is essentially
+    # `gamma_0.dot(Gammas[0])`
+    Gamma_1 = Gammas[..., 0:1, :, :]
+    gamma_0 = at.expand_dims(gamma_0, (-3, -1))
+    P_S_1 = at.sum(gamma_0 * Gamma_1, axis=-2)
+
+    # The `tt.switch`s allow us to broadcast the indexing operation when
+    # the replication dimensions of `states` and `Gammas` don't match
+    # (e.g. `states.shape[0] > Gammas.shape[0]`)
+    S_1_slices = tuple(
+        slice(
+            at.switch(at.eq(P_S_1.shape[i], 1), 0, 0),
+            at.switch(at.eq(P_S_1.shape[i], 1), 1, d),
         )
-        logp_S_1T = at.log(P_S_2T[obs_slices])
-
-        res = logp_S_1 + at.sum(logp_S_1T, axis=-1)
-        res.name = "DiscreteMarkovChain_logp"
-
-        return res
-
-    def random(self, point=None, size=None):
-        """Sample from a discrete Markov chain conditional on a given set of values.
-
-        Parameters
-        ----------
-        point: dict, optional
-            Dict of variable values on which random values are to be
-            conditioned (uses default point if not specified).
-        size: int, optional
-            Desired size of random sample (returns one sample if not
-            specified).
-
-        Returns
-        -------
-        array
-        """
-        terms = [self.gamma_0, self.Gammas]
-
-        with _DrawValuesContext():
-            gamma_0, Gamma = draw_values(terms, point=point)
-
-        # Sample state 0 in each state sequence
-        state_n = pm.Categorical.dist(gamma_0, shape=self.shape[:-1]).random(
-            point=point, size=size
+        for i, d in enumerate(states.shape)
+    )
+    S_1_slices = (tuple(at.ogrid[S_1_slices]) if S_1_slices else tuple()) + (
+        states[..., 0:1],
+    )
+    logp_S_1 = at.log(P_S_1[S_1_slices]).sum(axis=-1)
+
+    # These are slices for the extra dimensions--including the state
+    # sequence dimension (e.g. "time")--along which which we need to index
+    # the transition matrix rows using the "observed" `states`.
+    trans_slices = tuple(
+        slice(
+            at.switch(at.eq(Gammas.shape[i], 1), 0, 1 if i == states.ndim - 1 else 0),
+            at.switch(at.eq(Gammas.shape[i], 1), 1, d),
         )
-        state_shape = state_n.shape
-
-        N = self.shape[-1]
-
-        states = np.empty(state_shape + (N,), dtype=self.dtype)
+        for i, d in enumerate(states.shape)
+    )
+    trans_slices = (tuple(at.ogrid[trans_slices]) if trans_slices else tuple()) + (
+        states[..., :-1],
+    )
 
-        unif_samples = np.random.uniform(size=states.shape)
+    # Select the transition matrix row of each observed state; this yields
+    # `P(S_t | S_{t-1} = s_{t-1})`
+    P_S_2T = Gammas[trans_slices]
 
-        # Make sure we have a transition matrix for each element in a state
-        # sequence
-        Gamma = np.broadcast_to(Gamma, tuple(states.shape) + Gamma.shape[-2:])
+    obs_slices = tuple(slice(None, d) for d in P_S_2T.shape[:-1])
+    obs_slices = (tuple(at.ogrid[obs_slices]) if obs_slices else tuple()) + (
+        states[..., 1:],
+    )
+    logp_S_1T = at.log(P_S_2T[obs_slices])
 
-        # Slices across each independent/replication dimension
-        slices_tuple = tuple(np.ogrid[[slice(None, d) for d in state_shape]])
+    res = logp_S_1 + at.sum(logp_S_1T, axis=-1)
+    res.name = "DiscreteMarkovChain_logp"
 
-        for n in range(0, N):
-            gamma_t = Gamma[..., n, :, :]
-            gamma_t = gamma_t[slices_tuple + (state_n,)]
-            state_n = vsearchsorted(gamma_t.cumsum(axis=-1), unif_samples[..., n])
-            states[..., n] = state_n
+    if kwargs.get("sum", False):
+        res = res.sum()
 
-        return states
-
-    def _distr_parameters_for_repr(self):
-        return ["Gammas", "gamma_0"]
+    return res
diff --git a/pymc3_hmm/step_methods.py b/pymc3_hmm/step_methods.py
index d7c470c..c3f09c4 100644
--- a/pymc3_hmm/step_methods.py
+++ b/pymc3_hmm/step_methods.py
@@ -1,50 +1,66 @@
-from itertools import chain
+from typing import List
 
+import aesara.scalar as aes
+import aesara.tensor as at
 import numpy as np
-
-try:  # pragma: no cover
-    import aesara.scalar as aes
-    import aesara.tensor as at
-    from aesara.compile import optdb
-    from aesara.graph.basic import Variable, graph_inputs
-    from aesara.graph.fg import FunctionGraph
-    from aesara.graph.op import get_test_value as test_value
-    from aesara.graph.opt import OpRemove, pre_greedy_local_optimizer
-    from aesara.graph.optdb import Query
-    from aesara.tensor.elemwise import DimShuffle, Elemwise
-    from aesara.tensor.subtensor import AdvancedIncSubtensor1
-    from aesara.tensor.var import TensorConstant
-except ImportError:  # pragma: no cover
-    import theano.scalar as aes
-    import theano.tensor as at
-    from theano.compile import optdb
-    from theano.graph.basic import Variable, graph_inputs
-    from theano.graph.fg import FunctionGraph
-    from theano.graph.op import get_test_value as test_value
-    from theano.graph.opt import OpRemove, pre_greedy_local_optimizer
-    from theano.graph.optdb import Query
-    from theano.tensor.elemwise import DimShuffle, Elemwise
-    from theano.tensor.subtensor import AdvancedIncSubtensor1
-    from theano.tensor.var import TensorConstant
-
 import pymc3 as pm
-from pymc3.distributions.distribution import draw_values
+from aesara.compile import optdb
+from aesara.graph.basic import Variable, graph_inputs, vars_between, walk
+from aesara.graph.fg import FunctionGraph
+from aesara.graph.op import get_test_value as test_value
+from aesara.graph.opt import OpRemove, pre_greedy_local_optimizer
+from aesara.graph.optdb import Query
+from aesara.tensor.elemwise import DimShuffle, Elemwise
+from aesara.tensor.random.op import RandomVariable
+from aesara.tensor.random.type import RandomStateType
+from aesara.tensor.subtensor import AdvancedIncSubtensor1
+from aesara.tensor.var import TensorConstant
+from pymc3.aesaraf import change_rv_size
+from pymc3.distributions.logprob import logpt
 from pymc3.step_methods.arraystep import ArrayStep, BlockedStep, Competence
 from pymc3.util import get_untransformed_name
 
-from pymc3_hmm.distributions import DiscreteMarkovChain, SwitchingProcess
+from pymc3_hmm.distributions import DiscreteMarkovChainFactory, SwitchingProcessFactory
 from pymc3_hmm.utils import compute_trans_freqs
 
 big: float = 1e20
 small: float = 1.0 / big
 
 
+def walk_no_states(graphs, expand_fn=lambda x: x.owner.outputs):
+    def expand(var):
+        new_vars: List[Variable] = []
+
+        if var.owner and not isinstance(var.type, RandomStateType):
+            new_vars.extend(reversed(var.owner.inputs))
+
+        return new_vars
+
+    yield from walk(graphs, expand, False)
+
+
+def conform_rv_shape(rv_var, shape):
+    ndim_supp = rv_var.owner.op.ndim_supp
+    if ndim_supp > 0:
+        new_size = shape[:-ndim_supp]
+    else:
+        new_size = shape
+
+    rv_var = change_rv_size(rv_var, new_size)
+
+    if hasattr(rv_var.tag, "value_var"):
+        rv_var.tag.value_var = rv_var.type()
+
+    return rv_var
+
+
 def ffbs_step(
     gamma_0: np.ndarray,
     Gammas: np.ndarray,
     log_lik: np.ndarray,
     alphas: np.ndarray,
     out: np.ndarray,
+    rng: np.random.RandomState,
 ):
     """Sample a forward-filtered backward-sampled (FFBS) state sequence.
 
@@ -66,6 +82,8 @@ def ffbs_step(
     out
         An output array to be updated in-place with the posterior sample
         states.
+    rng
+        A ``RandomState`` object used to sample the states.
 
     """
     # Number of observations
@@ -105,7 +123,7 @@ def ffbs_step(
         alphas[..., n] = alpha_n
 
     # The uniform samples used to sample the categorical states
-    unif_samples: np.ndarray = np.random.uniform(size=out.shape)
+    unif_samples: np.ndarray = rng.uniform(size=out.shape)
 
     alpha_N: np.ndarray = alphas[..., N - 1]
     beta_N: np.ndarray = alpha_N / alpha_N.sum()
@@ -118,7 +136,7 @@ def ffbs_step(
 
     # Backward sampling
     for n in range(N - 2, -1, -1):
-        np.multiply(alphas[..., n], Gamma[n, :, state_np1], out=beta_n)
+        np.multiply(alphas[..., n], Gamma[n + 1, :, state_np1], out=beta_n)
         beta_n /= np.sum(beta_n)
 
         state_np1 = np.searchsorted(beta_n.cumsum(), unif_samples[n])
@@ -144,35 +162,57 @@ class FFBSStep(BlockedStep):
 
     name = "ffbs"
 
-    def __init__(self, vars, values=None, model=None):
+    def __init__(self, vars, values=None, model=None, rng=None):
 
         if len(vars) > 1:
             raise ValueError("This sampler only takes one variable.")
 
-        (var,) = pm.inputvars(vars)
+        (var,) = vars
 
-        if not isinstance(var.distribution, DiscreteMarkovChain):
+        if not var.owner or not isinstance(var.owner.op, DiscreteMarkovChainFactory):
             raise TypeError("This sampler only samples `DiscreteMarkovChain`s.")
 
         model = pm.modelcontext(model)
 
-        self.vars = [var]
+        self.vars = [model.rvs_to_values[var]]
+
+        if rng is None:
+            rng = np.random.RandomState()
+
+        self.rng = rng
+
+        def get_ancestors(v):
+            """This is a ``RandomVariable``-aware means of obtaining ancestors.
+
+            It won't walk a ``RandomVariable``'s RNG object.
+            """
+            if v.owner and isinstance(v.owner.op, RandomVariable):
+                return walk_no_states([v])
+            else:
+                return vars_between(list(graph_inputs([v])), [v])
 
         self.dependent_rvs = [
-            v
-            for v in model.basic_RVs
-            if v is not var and var in graph_inputs([v.logpt])
+            v for v in model.basic_RVs if v is not var and var in get_ancestors(v)
         ]
 
+        if not self.dependent_rvs:
+            raise ValueError(f"Could not find variables that depend on {var}")
+
         dep_comps_logp_stacked = []
         for i, dependent_rv in enumerate(self.dependent_rvs):
-            if isinstance(dependent_rv.distribution, SwitchingProcess):
+            if dependent_rv.owner and isinstance(
+                dependent_rv.owner.op, SwitchingProcessFactory
+            ):
                 comp_logps = []
 
                 # Get the log-likelihoood sequences for each state in this
                 # `SwitchingProcess` observations distribution
-                for comp_dist in dependent_rv.distribution.comp_dists:
-                    comp_logps.append(comp_dist.logp(dependent_rv))
+                for comp_dist in dependent_rv.owner.inputs[
+                    4 : -len(dependent_rv.owner.op.shared_inputs)
+                ]:
+                    new_comp_dist = conform_rv_shape(comp_dist, dependent_rv.shape)
+                    state_logp = logpt(new_comp_dist, dependent_rv.tag.observations)
+                    comp_logps.append(state_logp)
 
                 comp_logp_stacked = at.stack(comp_logps)
             else:
@@ -183,14 +223,18 @@ def __init__(self, vars, values=None, model=None):
             dep_comps_logp_stacked.append(comp_logp_stacked)
 
         comp_logp_stacked = at.sum(dep_comps_logp_stacked, axis=0)
+        self.log_lik_states = model.fn(comp_logp_stacked)
+
+        Gammas_var = var.owner.inputs[3]
+        gamma_0_var = var.owner.inputs[4]
 
-        (M,) = draw_values([var.distribution.gamma_0.shape[-1]], point=model.test_point)
-        N = model.test_point[var.name].shape[-1]
+        Gammas_initial_shape = model.fn(Gammas_var.shape)(model.initial_point)
+        M = Gammas_initial_shape[-1]
+        N = Gammas_initial_shape[-3]
         self.alphas = np.empty((M, N), dtype=float)
 
-        self.log_lik_states = model.fn(comp_logp_stacked)
-        self.gamma_0_fn = model.fn(var.distribution.gamma_0)
-        self.Gammas_fn = model.fn(var.distribution.Gammas)
+        self.gamma_0_fn = model.fn(gamma_0_var)
+        self.Gammas_fn = model.fn(Gammas_var)
 
     def step(self, point):
         gamma_0 = self.gamma_0_fn(point)
@@ -199,15 +243,19 @@ def step(self, point):
         # TODO: Can we update these in-place (e.g. using a shared variable)?
         log_lik_state_vals = self.log_lik_states(point)
         ffbs_step(
-            gamma_0, Gammas_t, log_lik_state_vals, self.alphas, point[self.vars[0].name]
+            gamma_0,
+            Gammas_t,
+            log_lik_state_vals,
+            self.alphas,
+            point[self.vars[0].name],
+            rng=self.rng,
         )
         return point
 
     @staticmethod
     def competence(var):
-        distribution = getattr(var.distribution, "parent_dist", var.distribution)
 
-        if isinstance(distribution, DiscreteMarkovChain):
+        if var.owner and isinstance(var.owner.op, DiscreteMarkovChainFactory):
             return Competence.IDEAL
         # elif isinstance(distribution, pm.Bernoulli) or (var.dtype in pm.bool_types):
         #     return Competence.COMPATIBLE
@@ -257,7 +305,7 @@ def __init__(self, model_vars, values=None, model=None, rng=None):
         if isinstance(model_vars, Variable):
             model_vars = [model_vars]
 
-        model_vars = list(chain.from_iterable([pm.inputvars(v) for v in model_vars]))
+        model_vars = list(model_vars)
 
         # TODO: Are the rows in this matrix our `dir_priors`?
         dir_priors = []
@@ -271,7 +319,7 @@ def __init__(self, model_vars, values=None, model=None, rng=None):
         state_seqs = [
             v
             for v in model.vars + model.observed_RVs
-            if isinstance(v.distribution, DiscreteMarkovChain)
+            if (v.owner.op and isinstance(v.owner.op, DiscreteMarkovChainFactory))
             and all(d in graph_inputs([v.distribution.Gammas]) for d in dir_priors)
         ]
 
@@ -300,7 +348,9 @@ def __init__(self, model_vars, values=None, model=None, rng=None):
         self.dists = list(dir_priors)
         self.state_seq_name = state_seq.name
 
-        super().__init__(dir_priors, [], allvars=True)
+        super().__init__(
+            [model.rvs_to_values[dp] for dp in dir_priors], [], allvars=True
+        )
 
     def _set_row_mappings(self, Gamma, dir_priors, model):
         """Create maps from Dirichlet priors parameters to rows and slices in the transition matrix.
@@ -444,11 +494,7 @@ def astep(self, point, inputs):
     @staticmethod
     def competence(var):
 
-        # TODO: Check that the dependent term is a conjugate type.
-
-        distribution = getattr(var.distribution, "parent_dist", var.distribution)
-
-        if isinstance(distribution, pm.Dirichlet):
+        if var.owner and isinstance(var.owner.op, pm.Dirichlet):
             return Competence.COMPATIBLE
 
         return Competence.INCOMPATIBLE
diff --git a/pymc3_hmm/utils.py b/pymc3_hmm/utils.py
index da0fb16..ce83dc2 100644
--- a/pymc3_hmm/utils.py
+++ b/pymc3_hmm/utils.py
@@ -1,24 +1,13 @@
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
+import aesara.tensor as at
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import scipy.special as sp
 from matplotlib import cm
 from matplotlib.axes import Axes
 from matplotlib.colors import Colormap
-from scipy.special import logsumexp
-
-try:  # pragma: no cover
-    import aesara.tensor as at
-    from aesara.tensor.extra_ops import broadcast_shape
-    from aesara.tensor.extra_ops import broadcast_to as at_broadcast_to
-    from aesara.tensor.var import TensorVariable
-except ImportError:  # pragma: no cover
-    import theano.tensor as at
-    from theano.tensor.extra_ops import broadcast_shape
-    from theano.tensor.extra_ops import broadcast_to as at_broadcast_to
-    from theano.tensor.var import TensorVariable
-
 
 vsearchsorted = np.vectorize(np.searchsorted, otypes=[int], signature="(n),()->()")
 
@@ -87,7 +76,7 @@ def compute_trans_freqs(states, N_states, counts_only=False):
     return res
 
 
-def tt_logsumexp(x, axis=None, keepdims=False):
+def logsumexp(x, axis=None, keepdims=False):
     """Construct a Theano graph for a log-sum-exp calculation."""
     x_max_ = at.max(x, axis=axis, keepdims=True)
 
@@ -114,7 +103,7 @@ def tt_logsumexp(x, axis=None, keepdims=False):
     return res + x_max_
 
 
-def tt_logdotexp(A, b):
+def logdotexp(A, b):
     """Construct a Theano graph for a numerically stable log-scale dot product.
 
     The result is more or less equivalent to `tt.log(tt.exp(A).dot(tt.exp(b)))`
@@ -131,11 +120,11 @@ def tt_logdotexp(A, b):
         sqz = True
 
     b_bcast = b.dimshuffle(shape_b)
-    res = tt_logsumexp(A_bcast + b_bcast, axis=1)
+    res = logsumexp(A_bcast + b_bcast, axis=1)
     return res.squeeze() if sqz else res
 
 
-def logdotexp(A, b):
+def np_logdotexp(A, b):
     """Compute a numerically stable log-scale dot product of NumPy values.
 
     The result is more or less equivalent to `np.log(np.exp(A).dot(np.exp(b)))`
@@ -149,7 +138,7 @@ def logdotexp(A, b):
 
     A_bcast = np.expand_dims(A, -1)
 
-    res = logsumexp(A_bcast + b_bcast, axis=1)
+    res = sp.logsumexp(A_bcast + b_bcast, axis=1)
     return res.squeeze() if sqz else res
 
 
@@ -177,19 +166,6 @@ def tt_expand_dims(x, dims):
     return x.dimshuffle(dim_range)
 
 
-def tt_broadcast_arrays(*args: TensorVariable):
-    """Broadcast any number of arrays against each other.
-
-    Parameters
-    ----------
-    `*args` : array_likes
-        The arrays to broadcast.
-
-    """
-    bcast_shape = broadcast_shape(*args)
-    return tuple(at_broadcast_to(a, bcast_shape) for a in args)
-
-
 def multilogit_inv(ys):
     """Compute the multilogit-inverse function for both NumPy and Theano arrays.
 
@@ -208,10 +184,10 @@ def multilogit_inv(ys):
     """
     if isinstance(ys, np.ndarray):
         lib = np
-        lib_logsumexp = logsumexp
+        lib_logsumexp = sp.logsumexp
     else:
         lib = at
-        lib_logsumexp = tt_logsumexp
+        lib_logsumexp = logsumexp
 
     # exp_ys = lib.exp(ys)
     # res = lib.concatenate([exp_ys, lib.ones(tuple(ys.shape)[:-1] + (1,))], axis=-1)
diff --git a/setup.py b/setup.py
index 01306ad..cfd6474 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,10 @@
     install_requires=[
         "numpy>=1.18.1",
         "scipy>=1.4.0",
-        "pymc3>=3.11.1,<4.0.0",
+        # XXX TODO: These are temporary and only for testing.
+        # "pymc3>=4.0.0",
+        "pymc3 @ git+https://github.com/brandonwillard/pymc3.git@main#egg=pymc3-4.0.0",  # noqa: E501
+        "aesara>=2.0.10",
     ],
     tests_require=["pytest"],
     long_description=open("README.md").read() if exists("README.md") else "",
diff --git a/tests/test_distributions.py b/tests/test_distributions.py
index d7551fe..934a69c 100644
--- a/tests/test_distributions.py
+++ b/tests/test_distributions.py
@@ -1,79 +1,76 @@
+import aesara
+import aesara.tensor as at
 import numpy as np
-
-try:
-    import aesara
-    import aesara.tensor as at
-except ImportError:
-    import theano as aesara
-    import theano.tensor as at
-
 import pymc3 as pm
-import pytest
+from aesara.compile.mode import get_mode
+from pymc3.distributions.logprob import logpt, logpt_sum
 
 from pymc3_hmm.distributions import (
     DiscreteMarkovChain,
     PoissonZeroProcess,
     SwitchingProcess,
-    distribution_subset_args,
+    discrete_mc_logp,
+    switching_process_logp,
 )
-from tests.utils import simulate_poiszero_hmm
+from tests.utils import assert_no_rvs, simulate_poiszero_hmm
 
 
-def test_DiscreteMarkovChain_str():
-    Gammas = at.as_tensor(np.eye(2)[None, ...], name="Gammas")
-    gamma_0 = at.as_tensor(np.r_[0, 1], name="gamma_0")
+def dmc_logp(rv_var, obs):
+    value_var = rv_var.tag.value_var
+    return discrete_mc_logp(
+        rv_var.owner.op, value_var, {value_var: obs}, *rv_var.owner.inputs[3:]
+    )
 
-    with pm.Model():
-        test_dist = DiscreteMarkovChain("P_rv", Gammas, gamma_0, shape=(2,))
 
-    assert str(test_dist) == "P_rv ~ DiscreteMarkovChain"
+def sp_logp(rv_var, obs):
+    value_var = rv_var.tag.value_var
+    return switching_process_logp(
+        rv_var.owner.op, value_var, {value_var: obs}, *rv_var.owner.inputs[3:]
+    )
 
 
 def test_DiscreteMarkovChain_random():
     # A single transition matrix and initial probabilities vector for each
     # element in the state sequence
-    test_Gamma = np.array([[[1.0, 0.0], [0.0, 1.0]]])
+    test_Gamma_base = np.array([[[1.0, 0.0], [0.0, 1.0]]])
+    test_Gamma = np.broadcast_to(test_Gamma_base, (10, 2, 2))
     test_gamma_0 = np.r_[0.0, 1.0]
-    test_sample = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=10).random()
+
+    test_sample = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0).eval()
     assert np.all(test_sample == 1)
 
     test_sample = DiscreteMarkovChain.dist(
-        test_Gamma, 1.0 - test_gamma_0, shape=10
-    ).random()
+        test_Gamma, 1.0 - test_gamma_0, size=10
+    ).eval()
     assert np.all(test_sample == 0)
-    test_sample = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=10).random(
-        size=12
-    )
-    assert test_sample.shape == (
-        12,
-        10,
-    )
 
-    test_sample = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=10).random(
-        size=2
-    )
+    test_sample = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, size=12).eval()
+    assert test_sample.shape == (12, 10)
+
+    test_sample = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, size=2).eval()
     assert np.array_equal(
         test_sample, np.stack([np.ones(10), np.ones(10)], 0).astype(int)
     )
 
     # Now, the same set-up, but--this time--generate two state sequences
     # samples
-    test_Gamma = np.array([[[0.8, 0.2], [0.2, 0.8]]])
+    test_Gamma_base = np.array([[[0.8, 0.2], [0.2, 0.8]]])
+    test_Gamma = np.broadcast_to(test_Gamma_base, (10, 2, 2))
     test_gamma_0 = np.r_[0.2, 0.8]
-    test_sample = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=10).random(
-        size=2
-    )
+    test_sample = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, size=2).eval()
     # TODO: Fix the seed, and make sure there's at least one 0 and 1?
     assert test_sample.shape == (2, 10)
 
     # Two transition matrices--for two distinct state sequences--and one vector
     # of initial probs.
-    test_Gamma = np.stack(
+    test_Gamma_base = np.stack(
         [np.array([[[1.0, 0.0], [0.0, 1.0]]]), np.array([[[1.0, 0.0], [0.0, 1.0]]])]
     )
+    test_Gamma = np.broadcast_to(test_Gamma_base, (2, 10, 2, 2))
     test_gamma_0 = np.r_[0.0, 1.0]
-    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=(2, 10))
-    test_sample = test_dist.random()
+
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0)
+    test_sample = test_dist.eval()
     assert np.array_equal(
         test_sample, np.stack([np.ones(10), np.ones(10)], 0).astype(int)
     )
@@ -81,7 +78,8 @@ def test_DiscreteMarkovChain_random():
 
     # Now, the same set-up, but--this time--generate three state sequence
     # samples
-    test_sample = test_dist.random(size=3)
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, size=3)
+    test_sample = test_dist.eval()
     assert np.array_equal(
         test_sample,
         np.tile(np.stack([np.ones(10), np.ones(10)], 0).astype(int), (3, 1, 1)),
@@ -90,12 +88,13 @@ def test_DiscreteMarkovChain_random():
 
     # Two transition matrices and initial probs. for two distinct state
     # sequences
-    test_Gamma = np.stack(
+    test_Gamma_base = np.stack(
         [np.array([[[1.0, 0.0], [0.0, 1.0]]]), np.array([[[1.0, 0.0], [0.0, 1.0]]])]
     )
+    test_Gamma = np.broadcast_to(test_Gamma_base, (2, 10, 2, 2))
     test_gamma_0 = np.stack([np.r_[0.0, 1.0], np.r_[1.0, 0.0]])
-    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=(2, 10))
-    test_sample = test_dist.random()
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0)
+    test_sample = test_dist.eval()
     assert np.array_equal(
         test_sample, np.stack([np.ones(10), np.zeros(10)], 0).astype(int)
     )
@@ -103,7 +102,8 @@ def test_DiscreteMarkovChain_random():
 
     # Now, the same set-up, but--this time--generate three state sequence
     # samples
-    test_sample = test_dist.random(size=3)
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, size=3)
+    test_sample = test_dist.eval()
     assert np.array_equal(
         test_sample,
         np.tile(np.stack([np.ones(10), np.zeros(10)], 0).astype(int), (3, 1, 1)),
@@ -122,13 +122,14 @@ def test_DiscreteMarkovChain_random():
     )
     test_gamma_0 = np.r_[1, 0]
 
-    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=3)
-    test_sample = test_dist.random()
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0)
+    test_sample = test_dist.eval()
     assert np.array_equal(test_sample, np.r_[1, 0, 0])
 
     # Now, the same set-up, but--this time--generate three state sequence
     # samples
-    test_sample = test_dist.random(size=3)
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, size=3)
+    test_sample = test_dist.eval()
     assert np.array_equal(test_sample, np.tile(np.r_[1, 0, 0].astype(int), (3, 1)))
 
     # "Time"-varying transition matrices with two initial
@@ -143,13 +144,14 @@ def test_DiscreteMarkovChain_random():
     )
     test_gamma_0 = np.array([[1, 0], [0, 1]])
 
-    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=(2, 3))
-    test_sample = test_dist.random()
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0)
+    test_sample = test_dist.eval()
     assert np.array_equal(test_sample, np.array([[1, 0, 0], [0, 1, 1]]))
 
     # Now, the same set-up, but--this time--generate three state sequence
     # samples
-    test_sample = test_dist.random(size=3)
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, size=3)
+    test_sample = test_dist.eval()
     assert np.array_equal(
         test_sample, np.tile(np.array([[1, 0, 0], [0, 1, 1]]).astype(int), (3, 1, 1))
     )
@@ -173,53 +175,52 @@ def test_DiscreteMarkovChain_random():
     )
     test_gamma_0 = np.array([[1, 0], [0, 1]])
 
-    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, shape=(2, 3))
-    test_sample = test_dist.random()
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0)
+    test_sample = test_dist.eval()
     assert np.array_equal(test_sample, np.array([[1, 0, 0], [1, 1, 0]]))
 
     # Now, the same set-up, but--this time--generate three state sequence
     # samples
-    test_sample = test_dist.random(size=3)
+    test_dist = DiscreteMarkovChain.dist(test_Gamma, test_gamma_0, size=3)
+    test_sample = test_dist.eval()
     assert np.array_equal(
         test_sample, np.tile(np.array([[1, 0, 0], [1, 1, 0]]).astype(int), (3, 1, 1))
     )
 
 
-def test_DiscreteMarkovChain_point():
-    test_Gammas = at.as_tensor_variable(np.array([[[1.0, 0.0], [0.0, 1.0]]]))
-
-    with pm.Model():
-        # XXX: `draw_values` won't use the `Deterministic`s values in the `point` map!
-        # Also, `Constant` is only for integer types (?!), so we can't use that.
-        test_gamma_0 = pm.Dirichlet("gamma_0", np.r_[1.0, 1000.0], shape=2)
-        test_point = {"gamma_0": np.r_[1.0, 0.0]}
-        assert np.all(
-            DiscreteMarkovChain.dist(test_Gammas, test_gamma_0, shape=10).random(
-                point=test_point
-            )
-            == 0
-        )
-        assert np.all(
-            DiscreteMarkovChain.dist(test_Gammas, 1.0 - test_gamma_0, shape=10).random(
-                point=test_point
-            )
-            == 1
-        )
+def test_DiscreteMarkovChain_model():
+    N = 50
 
+    with pm.Model(rng_seeder=np.random.RandomState(202353)):
+        p_0_rv = pm.Dirichlet("p_0", np.ones(2))
+        p_1_rv = pm.Dirichlet("p_1", np.ones(2))
 
-def test_DiscreteMarkovChain_logp():
-    aesara.config.compute_test_value = "warn"
+        P_tt = at.stack([p_0_rv, p_1_rv])
+        P_tt = at.broadcast_to(P_tt, (N,) + tuple(P_tt.shape))
+        P_rv = pm.Deterministic("P_tt", P_tt)
 
+        pi_0_tt = pm.Dirichlet("pi_0", np.ones(2))
+
+        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt)
+
+    mode = get_mode(None).excluding("random_make_inplace")
+    test_sample_fn = aesara.function([], [S_rv], mode=mode)
+
+    test_sample = test_sample_fn()
+
+    assert len(np.unique(test_sample)) > 1
+
+
+def test_DiscreteMarkovChain_logp():
     # A single transition matrix and initial probabilities vector for each
     # element in the state sequence
-    test_Gammas = np.array([[[0.0, 1.0], [1.0, 0.0]]])
+    test_Gammas_base = np.array([[[0.0, 1.0], [1.0, 0.0]]])
     test_gamma_0 = np.r_[1.0, 0.0]
     test_obs = np.r_[1, 0, 1, 0]
+    test_Gammas = np.broadcast_to(test_Gammas_base, (test_obs.shape[-1], 2, 2))
 
-    test_dist = DiscreteMarkovChain.dist(
-        test_Gammas, test_gamma_0, shape=test_obs.shape[-1]
-    )
-    test_logp_tt = test_dist.logp(test_obs)
+    test_dist = DiscreteMarkovChain.dist(test_Gammas, test_gamma_0)
+    test_logp_tt = assert_no_rvs(logpt(test_dist, test_obs))
     assert test_logp_tt.eval() == 0
 
     # "Time"-varying transition matrices with a single vector of initial
@@ -233,37 +234,30 @@ def test_DiscreteMarkovChain_logp():
         ],
         axis=0,
     )
-
     test_gamma_0 = np.r_[1.0, 0.0]
-
     test_obs = np.r_[1, 0, 1, 0]
 
-    test_dist = DiscreteMarkovChain.dist(
-        test_Gammas, test_gamma_0, shape=test_obs.shape[-1]
-    )
-
-    test_logp_tt = test_dist.logp(test_obs)
+    test_dist = DiscreteMarkovChain.dist(test_Gammas, test_gamma_0)
+    test_logp_tt = assert_no_rvs(logpt(test_dist, test_obs))
 
     assert test_logp_tt.eval() == 0
 
     # Static transition matrix and two state sequences
-    test_Gammas = np.array([[[0.0, 1.0], [1.0, 0.0]]])
-
+    test_Gammas_base = np.array([[[0.0, 1.0], [1.0, 0.0]]])
     test_obs = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
-
     test_gamma_0 = np.r_[0.5, 0.5]
+    test_Gammas = np.broadcast_to(test_Gammas_base, (test_obs.shape[-1], 2, 2))
 
-    test_dist = DiscreteMarkovChain.dist(
-        test_Gammas, test_gamma_0, shape=test_obs.shape[-1]
-    )
+    test_dist = DiscreteMarkovChain.dist(test_Gammas, test_gamma_0)
+    test_dist.tag.value_var = test_dist.clone()
 
-    test_logp_tt = test_dist.logp(test_obs)
+    test_logp_tt = assert_no_rvs(dmc_logp(test_dist, test_obs))
 
     test_logp = test_logp_tt.eval()
     assert test_logp[0] == test_logp[1]
 
     # Time-varying transition matrices and two state sequences
-    test_Gammas = np.stack(
+    test_Gammas_base = np.stack(
         [
             np.array([[0.0, 1.0], [1.0, 0.0]]),
             np.array([[0.0, 1.0], [1.0, 0.0]]),
@@ -272,16 +266,13 @@ def test_DiscreteMarkovChain_logp():
         ],
         axis=0,
     )
-
     test_obs = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
-
     test_gamma_0 = np.r_[0.5, 0.5]
 
-    test_dist = DiscreteMarkovChain.dist(
-        test_Gammas, test_gamma_0, shape=test_obs.shape[-1]
-    )
+    test_dist = DiscreteMarkovChain.dist(test_Gammas, test_gamma_0)
+    test_dist.tag.value_var = test_dist.clone()
 
-    test_logp_tt = test_dist.logp(test_obs)
+    test_logp_tt = assert_no_rvs(dmc_logp(test_dist, test_obs))
 
     test_logp = test_logp_tt.eval()
     assert test_logp[0] == test_logp[1]
@@ -304,16 +295,13 @@ def test_DiscreteMarkovChain_logp():
         ],
         axis=0,
     )
-
     test_obs = np.array([[1, 0, 1, 0], [0, 0, 0, 0]])
-
     test_gamma_0 = np.r_[0.5, 0.5]
 
-    test_dist = DiscreteMarkovChain.dist(
-        test_Gammas, test_gamma_0, shape=test_obs.shape[-1]
-    )
+    test_dist = DiscreteMarkovChain.dist(test_Gammas, test_gamma_0)
+    test_dist.tag.value_var = test_dist.clone()
 
-    test_logp_tt = test_dist.logp(test_obs)
+    test_logp_tt = assert_no_rvs(dmc_logp(test_dist, test_obs))
 
     test_logp = test_logp_tt.eval()
     assert test_logp[0] == test_logp[1]
@@ -322,11 +310,10 @@ def test_DiscreteMarkovChain_logp():
     # broadcasting--and two state sequences
     test_gamma_0 = np.array([[0.5, 0.5], [0.5, 0.5]])
 
-    test_dist = DiscreteMarkovChain.dist(
-        test_Gammas, test_gamma_0, shape=test_obs.shape[-1]
-    )
+    test_dist = DiscreteMarkovChain.dist(test_Gammas, test_gamma_0)
+    test_dist.tag.value_var = test_dist.clone()
 
-    test_logp_tt = test_dist.logp(test_obs)
+    test_logp_tt = assert_no_rvs(dmc_logp(test_dist, test_obs))
 
     test_logp = test_logp_tt.eval()
     assert test_logp[0] == test_logp[1]
@@ -342,16 +329,13 @@ def test_DiscreteMarkovChain_logp():
         ],
         axis=0,
     )
-
     test_gamma_0 = np.r_[0.3, 0.7]
-
     test_obs = np.r_[1, 0, 1, 0]
 
-    test_dist = DiscreteMarkovChain.dist(
-        test_Gammas, test_gamma_0, shape=test_obs.shape[-1]
-    )
+    test_dist = DiscreteMarkovChain.dist(test_Gammas, test_gamma_0)
+    test_dist.tag.value_var = test_dist.clone()
 
-    test_logp_tt = test_dist.logp(test_obs)
+    test_logp_tt = assert_no_rvs(dmc_logp(test_dist, test_obs))
 
     logp_res = test_logp_tt.eval()
 
@@ -366,37 +350,47 @@ def test_DiscreteMarkovChain_logp():
     logp_exp = np.log(logp_exp).sum()
     assert np.allclose(logp_res, logp_exp)
 
+    test_logp = assert_no_rvs(logpt_sum(test_dist, test_obs))
+    test_logp_val = test_logp.eval()
+    assert test_logp_val.shape == ()
+
 
 def test_SwitchingProcess_random():
     test_states = np.r_[0, 0, 1, 1, 0, 1]
     mu_zero_nonzero = [pm.Constant.dist(0), pm.Constant.dist(1)]
     test_dist = SwitchingProcess.dist(mu_zero_nonzero, test_states)
-    assert np.array_equal(test_dist.shape, test_states.shape)
-    test_sample = test_dist.random()
+    assert np.array_equal(test_dist.shape.eval(), test_states.shape)
+    test_sample = test_dist.eval()
     assert test_sample.shape == (test_states.shape[0],)
     assert np.all(test_sample[test_states > 0] > 0)
 
-    test_sample = test_dist.random(size=5)
+    test_sample = SwitchingProcess.dist(mu_zero_nonzero, test_states, size=5).eval()
     assert np.array_equal(test_sample.shape, (5,) + test_states.shape)
     assert np.all(test_sample[..., test_states > 0] > 0)
 
-    test_states = np.r_[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0]
+    test_states = at.lvector("states")
+    test_states.tag.test_value = np.r_[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0]
     test_dist = SwitchingProcess.dist(mu_zero_nonzero, test_states)
-    assert np.array_equal(test_dist.shape, test_states.shape)
-    test_sample = test_dist.random(size=1)
-    assert np.array_equal(test_sample.shape, (1,) + test_states.shape)
-    assert np.all(test_sample[..., test_states > 0] > 0)
+    assert np.array_equal(
+        test_dist.shape.eval({test_states: test_states.tag.test_value}),
+        test_states.tag.test_value.shape,
+    )
+    test_sample = SwitchingProcess.dist(mu_zero_nonzero, test_states, size=1).eval(
+        {test_states: test_states.tag.test_value}
+    )
+    assert np.array_equal(test_sample.shape, (1,) + test_states.tag.test_value.shape)
+    assert np.all(test_sample[..., test_states.tag.test_value > 0] > 0)
 
     test_states = np.r_[0, 0, 1, 1, 0, 1]
     test_mus = [pm.Constant.dist(i) for i in range(6)]
     test_dist = SwitchingProcess.dist(test_mus, test_states)
-    assert np.array_equal(test_dist.shape, test_states.shape)
-    test_sample = test_dist.random()
+    assert np.array_equal(test_dist.shape.eval(), test_states.shape)
+    test_sample = test_dist.eval()
     assert np.array_equal(test_sample.shape, test_states.shape)
     assert np.all(test_sample[..., test_states > 0] > 0)
 
     test_states = np.c_[0, 0, 1, 1, 0, 1].T
-    test_mus = np.arange(1, 6).astype(float)
+    test_mus = np.arange(1, 6).astype(np.float64)
     # One of the states has emissions that are a sequence of five Dirac delta
     # distributions on the values 1 to 5 (i.e. the one with values
     # `test_mus`), and the other is just a single delta at 0.  A single state
@@ -408,60 +402,39 @@ def test_SwitchingProcess_random():
     test_dist = SwitchingProcess.dist(
         [pm.Constant.dist(0), pm.Constant.dist(test_mus)], test_states
     )
-    assert np.array_equal(test_dist.shape, (6, 5))
-    test_sample = test_dist.random()
-    assert np.array_equal(test_sample.shape, test_dist.shape)
-    assert np.all(test_sample[np.where(test_states > 0)[0]] == test_mus)
+    assert np.array_equal(test_dist.shape.eval(), (6, 5))
+    test_sample = test_dist.eval()
+    assert np.array_equal(test_sample.shape, test_dist.shape.eval())
+    sample_mus = test_sample[np.where(test_states > 0)[0]]
+    assert np.all(sample_mus == test_mus)
 
     test_states = np.c_[0, 0, 1, 1, 0, 1]
-    test_mus = np.arange(1, 7).astype(float)
+    test_mus = np.arange(1, 7).astype(np.float64)
     test_dist = SwitchingProcess.dist(
         [pm.Constant.dist(0), pm.Constant.dist(test_mus)], test_states
     )
-    assert np.array_equal(test_dist.shape, test_states.shape)
+    assert np.array_equal(test_dist.shape.eval(), test_states.shape)
 
     test_states = np.r_[0, 0, 1, 1, 0, 1]
     test_sample = SwitchingProcess.dist(
-        [pm.Constant.dist(0), pm.Constant.dist(test_mus)], test_states
-    ).random(size=3)
+        [pm.Constant.dist(0), pm.Constant.dist(test_mus)], test_states, size=3
+    ).eval()
     assert np.array_equal(test_sample.shape, (3,) + test_mus.shape)
     assert np.all(test_sample.sum(0)[..., test_states > 0] > 0)
 
-
-def test_PoissonZeroProcess_point():
-    test_states = np.r_[0, 0, 1, 1, 0, 1]
-
-    with pm.Model():
-        test_mean = pm.Constant("c", 1000.0)
-        test_point = {"c": 100.0}
-        test_sample = PoissonZeroProcess.dist(test_mean, test_states).random(
-            point=test_point
-        )
-
-    assert np.all(0 < test_sample[..., test_states > 0])
-    assert np.all(test_sample[..., test_states > 0] < 200)
-
-
-def test_random_PoissonZeroProcess_DiscreteMarkovChain():
-    poiszero_sim, test_model = simulate_poiszero_hmm(30, 5000)
-
-    y_test = poiszero_sim["Y_t"].squeeze()
-    nonzeros_idx = poiszero_sim["S_t"] > 0
-
-    assert np.all(y_test[nonzeros_idx] > 0)
-    assert np.all(y_test[~nonzeros_idx] == 0)
-
-
-def test_SwitchingProcess():
-
-    np.random.seed(2023532)
+    # Some misc. tests
+    rng = aesara.shared(np.random.RandomState(2023532), borrow=True)
 
     test_states = np.r_[2, 0, 1, 2, 0, 1]
-    test_dists = [pm.Constant.dist(0), pm.Poisson.dist(100.0), pm.Poisson.dist(1000.0)]
+    test_dists = [
+        pm.Constant.dist(0),
+        pm.Poisson.dist(100.0, rng=rng),
+        pm.Poisson.dist(1000.0, rng=rng),
+    ]
     test_dist = SwitchingProcess.dist(test_dists, test_states)
-    assert np.array_equal(test_dist.shape, test_states.shape)
+    assert np.array_equal(test_dist.shape.eval(), test_states.shape)
 
-    test_sample = test_dist.random()
+    test_sample = test_dist.eval()
     assert test_sample.shape == (test_states.shape[0],)
     assert np.all(test_sample[test_states == 0] == 0)
     assert np.all(0 < test_sample[test_states == 1])
@@ -471,37 +444,29 @@ def test_SwitchingProcess():
     test_mus = np.r_[100, 100, 500, 100, 100, 100]
     test_dists = [
         pm.Constant.dist(0),
-        pm.Poisson.dist(test_mus),
-        pm.Poisson.dist(10000.0),
+        pm.Poisson.dist(test_mus, rng=rng),
+        pm.Poisson.dist(10000.0, rng=rng),
     ]
     test_dist = SwitchingProcess.dist(test_dists, test_states)
-    assert np.array_equal(test_dist.shape, test_states.shape)
+    assert np.array_equal(test_dist.shape.eval(), test_states.shape)
 
-    test_sample = test_dist.random()
+    test_sample = test_dist.eval()
     assert test_sample.shape == (test_states.shape[0],)
     assert np.all(200 < test_sample[2] < 600)
     assert np.all(0 < test_sample[5] < 200)
     assert np.all(5000 < test_sample[test_states == 2])
 
-    test_dists = [pm.Constant.dist(0), pm.Poisson.dist(100.0), pm.Poisson.dist(1000.0)]
-    test_dist = SwitchingProcess.dist(test_dists, test_states)
-    for i in range(len(test_dists)):
-        test_logp = test_dist.logp(
-            np.tile(test_dists[i].mode.eval(), test_states.shape)
-        ).eval()
-        assert test_logp[test_states != i].max() < test_logp[test_states == i].min()
-
     # Try a continuous mixture
     test_states = np.r_[2, 0, 1, 2, 0, 1]
     test_dists = [
-        pm.Normal.dist(0.0, 1.0),
-        pm.Normal.dist(100.0, 1.0),
-        pm.Normal.dist(1000.0, 1.0),
+        pm.Normal.dist(0.0, 1.0, rng=rng),
+        pm.Normal.dist(100.0, 1.0, rng=rng),
+        pm.Normal.dist(1000.0, 1.0, rng=rng),
     ]
     test_dist = SwitchingProcess.dist(test_dists, test_states)
-    assert np.array_equal(test_dist.shape, test_states.shape)
+    assert np.array_equal(test_dist.shape.eval(), test_states.shape)
 
-    test_sample = test_dist.random()
+    test_sample = test_dist.eval()
     assert test_sample.shape == (test_states.shape[0],)
     assert np.all(test_sample[test_states == 0] < 10)
     assert np.all(50 < test_sample[test_states == 1])
@@ -512,79 +477,106 @@ def test_SwitchingProcess():
     test_states = np.ones(50)
     test_dists = [pm.Constant.dist(i) for i in range(50)]
     test_dist = SwitchingProcess.dist(test_dists, test_states)
-    assert np.array_equal(test_dist.shape, test_states.shape)
+    assert np.array_equal(test_dist.shape.eval(), test_states.shape)
+
 
-    with pytest.raises(TypeError):
-        SwitchingProcess.dist([1], test_states)
+def test_SwitchingProcess_logp():
 
-    with aesara.change_flags(compute_test_value="off"):
-        # Test for the case when a default can't be computed
-        test_dist = pm.Poisson.dist(at.scalar())
+    rng = aesara.shared(np.random.RandomState(2023532), borrow=True)
 
-        # Confirm that there's no default
-        with pytest.raises(AttributeError):
-            test_dist.default()
+    test_states = np.r_[2, 0, 1, 2, 0, 1]
+    test_comp_dists = [
+        pm.Constant.dist(0),
+        pm.Poisson.dist(100.0, rng=rng),
+        pm.Poisson.dist(1000.0, rng=rng),
+    ]
 
-        # Let it try to sample using `Distribution.random` and fail
-        with pytest.raises(ValueError):
-            SwitchingProcess.dist([test_dist], test_states)
+    test_dist = SwitchingProcess.dist(test_comp_dists, test_states)
+    test_dist.tag.value_var = test_dist.clone()
+
+    for i in range(len(test_comp_dists)):
+        obs = np.tile(test_comp_dists[i].owner.inputs[3].eval(), test_states.shape)
+        test_logp = assert_no_rvs(sp_logp(test_dist, obs)).eval()
+        assert test_logp[test_states != i].max() < test_logp[test_states == i].min()
 
     # Evaluate multiple observed state sequences in an extreme case
     test_states = at.imatrix("states")
     test_states.tag.test_value = np.zeros((10, 4)).astype("int32")
+
     test_dist = SwitchingProcess.dist(
         [pm.Constant.dist(0), pm.Constant.dist(1)], test_states
     )
+    test_dist.tag.value_var = test_dist.clone()
+
     test_obs = np.tile(np.arange(4), (10, 1)).astype("int32")
-    test_logp = test_dist.logp(test_obs)
+    test_logp = assert_no_rvs(sp_logp(test_dist, test_obs))
     exp_logp = np.tile(
         np.array([0.0] + [-np.inf] * 3, dtype=aesara.config.floatX), (10, 1)
     )
-    assert np.array_equal(test_logp.tag.test_value, exp_logp)
-
-
-def test_subset_args():
-
-    test_dist = pm.Constant.dist(c=np.r_[0.1, 1.2, 2.3])
-    test_idx = np.r_[0, 2]
-    res = distribution_subset_args(test_dist, shape=[3], idx=test_idx)
-    assert np.array_equal(res[0].eval(), np.r_[0.1, 2.3])
-
-    test_point = {"c": np.r_[2.0, 3.0, 4.0]}
-    test_idx = np.r_[0, 2]
-    res = distribution_subset_args(test_dist, shape=[3], idx=test_idx, point=test_point)
-    assert np.array_equal(res[0].eval(), np.r_[2.0, 4.0])
-
-    test_dist = pm.Normal.dist(mu=np.r_[0.1, 1.2, 2.3], sigma=np.r_[10.0])
-    test_idx = np.r_[0, 2]
-    res = distribution_subset_args(test_dist, shape=[3], idx=test_idx)
-    assert np.array_equal(res[0].eval(), np.r_[0.1, 2.3])
-    assert np.array_equal(res[1].eval(), np.r_[10.0, 10.0])
-
-    test_point = {"mu": np.r_[2.0, 3.0, 4.0], "sigma": np.r_[20.0, 30.0, 40.0]}
-    test_idx = np.r_[0, 2]
-    res = distribution_subset_args(test_dist, shape=[3], idx=test_idx, point=test_point)
-    assert np.array_equal(res[0].eval(), np.r_[2.0, 4.0])
-    assert np.array_equal(res[1].eval(), np.r_[20.0, 40.0])
-
-    test_dist = pm.Poisson.dist(mu=np.r_[0.1, 1.2, 2.3])
-    test_idx = np.r_[0, 2]
-    res = distribution_subset_args(test_dist, shape=[3], idx=test_idx)
-    assert np.array_equal(res[0].eval(), np.r_[0.1, 2.3])
-
-    test_point = {"mu": np.r_[2.0, 3.0, 4.0]}
-    test_idx = np.r_[0, 2]
-    res = distribution_subset_args(test_dist, shape=[3], idx=test_idx, point=test_point)
-    assert np.array_equal(res[0].eval(), np.r_[2.0, 4.0])
-
-    test_dist = pm.NegativeBinomial.dist(mu=np.r_[0.1, 1.2, 2.3], alpha=2)
-    test_idx = np.r_[0, 2]
-    res = distribution_subset_args(test_dist, shape=[3], idx=test_idx)
-    assert np.array_equal(res[0].eval(), np.r_[0.1, 2.3])
-    assert np.array_equal(res[1].eval(), np.r_[2.0, 2.0])
-
-    test_point = {"mu": np.r_[2.0, 3.0, 4.0], "alpha": np.r_[10, 11, 12]}
-    test_idx = np.r_[0, 2]
-    res = distribution_subset_args(test_dist, shape=[3], idx=test_idx, point=test_point)
-    assert np.array_equal(res[0].eval(), np.r_[2.0, 4.0])
-    assert np.array_equal(res[1].eval(), np.r_[10, 12])
+    assert np.array_equal(
+        test_logp.eval({test_states: test_states.tag.test_value}), exp_logp
+    )
+
+    np.random.seed(4343)
+    test_states = at.lvector("states")
+    test_states.tag.test_value = np.random.randint(0, 2, size=10, dtype=np.int64)
+    test_dist = SwitchingProcess.dist(
+        [
+            pm.Constant.dist(0),
+            pm.Poisson.dist(at.arange(test_states.shape[0]), rng=rng),
+        ],
+        test_states,
+    )
+    test_dist.tag.value_var = test_dist.clone()
+
+    test_obs = np.stack([np.zeros(10), np.random.poisson(np.arange(10))])
+    test_obs = test_obs[
+        (test_states.tag.test_value,) + tuple(np.ogrid[:2, :10])[1:]
+    ].squeeze()
+
+    test_logp = assert_no_rvs(sp_logp(test_dist, test_obs))
+    test_logp_val = test_logp.eval({test_states: test_states.tag.test_value})
+    assert test_logp_val.shape == (10,)
+
+    test_logp = assert_no_rvs(logpt_sum(test_dist, test_obs))
+    test_logp_val = test_logp.eval({test_states: test_states.tag.test_value})
+    assert test_logp_val.shape == ()
+
+
+def test_PoissonZeroProcess_model():
+    with pm.Model(rng_seeder=np.random.RandomState(2023532)):
+        test_mean = pm.Constant("c", 1000.0)
+        states = pm.Bernoulli("states", 0.5, size=10)
+        Y = PoissonZeroProcess.dist(test_mean, states)
+
+    # We want to make sure that the sampled states and observations correspond,
+    # because, if there are any zero states with non-zero observations, we know
+    # that the sampled states weren't actually used to draw the observations,
+    # and that's a big problem
+    sample_fn = aesara.function([], [states, Y])
+
+    fgraph = sample_fn.maker.fgraph
+    nodes = list(fgraph.apply_nodes)
+    bernoulli_nodes = set(
+        n for n in nodes if isinstance(n.op, type(at.random.bernoulli))
+    )
+    assert len(bernoulli_nodes) == 1
+
+    for i in range(100):
+        test_states, test_Y = sample_fn()
+        assert np.all(0 < test_Y[..., test_states > 0])
+        assert np.all(test_Y[..., test_states > 0] < 10000)
+
+
+def test_random_PoissonZeroProcess_DiscreteMarkovChain():
+    rng = np.random.RandomState(230)
+
+    poiszero_sim, test_model = simulate_poiszero_hmm(30, 5000, rng=rng)
+
+    assert poiszero_sim.keys() == {"P_tt", "S_t", "p_1", "p_0", "Y_t", "pi_0"}
+
+    y_test = poiszero_sim["Y_t"].squeeze()
+    nonzeros_idx = poiszero_sim["S_t"] > 0
+
+    assert np.all(y_test[nonzeros_idx] > 0)
+    assert np.all(y_test[~nonzeros_idx] == 0)
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 4441089..1674e73 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1,24 +1,13 @@
 from datetime import date, timedelta
 
+import aesara
+import aesara.tensor as at
 import arviz as az
 import numpy as np
 import pandas as pd
 import patsy
-
-try:
-    import aesara
-    import aesara.tensor as at
-    from aesara import shared
-
-    TV_CONFIG = {"aesara_config": {"compute_test_value": "ignore"}}
-except ImportError:
-    import theano as aesara
-    import theano.tensor as at
-    from theano import shared
-
-    TV_CONFIG = {"theano_config": {"compute_test_value": "ignore"}}
-
 import pymc3 as pm
+from aesara import shared
 
 from pymc3_hmm.distributions import DiscreteMarkovChain, SwitchingProcess
 from pymc3_hmm.step_methods import FFBSStep
@@ -40,20 +29,20 @@ def gen_toy_data(days=-7 * 10):
 
 def create_dirac_zero_hmm(X, mu, xis, observed):
     S = 2
-    z_tt = at.stack([at.dot(X, xis[..., s, :]) for s in range(S)], axis=1)
-    Gammas_tt = pm.Deterministic("Gamma", multilogit_inv(z_tt))
-    gamma_0_rv = pm.Dirichlet("gamma_0", np.ones((S,)), shape=S)
+    # z_at = at.stack([at.dot(X, xis[..., s, :]) for s in range(S)], axis=1)
+    z_at = at.tensordot(X, xis, axes=((1,), (0,)))
+    z_at.name = "z"
 
-    if type(observed) == np.ndarray:
-        T = X.shape[0]
-    else:
-        T = X.get_value().shape[0]
+    Gammas_at = pm.Deterministic("Gamma", multilogit_inv(z_at))
+    gamma_0_rv = pm.Dirichlet("gamma_0", np.ones((S,)))
 
-    V_rv = DiscreteMarkovChain("V_t", Gammas_tt, gamma_0_rv, shape=T)
     if type(observed) == np.ndarray:
-        V_rv.tag.test_value = (observed > 0) * 1
+        V_initval = (observed > 0) * 1
     else:
-        V_rv.tag.test_value = (observed.get_value() > 0) * 1
+        V_initval = (observed.get_value() > 0) * 1
+
+    V_rv = DiscreteMarkovChain("V_t", Gammas_at, gamma_0_rv, initval=V_initval)
+
     Y_rv = SwitchingProcess(
         "Y_t",
         [pm.Constant.dist(0), pm.Constant.dist(mu)],
@@ -64,22 +53,25 @@ def create_dirac_zero_hmm(X, mu, xis, observed):
 
 
 def test_only_positive_state():
+    rng = np.random.RandomState(4284)
+
     number_of_draws = 50
     S = 2
     mu = 10
     y_t = np.repeat(0, 100)
 
-    with pm.Model():
-        p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
-        p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
+    with pm.Model(rng_seeder=rng):
+        p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
+        p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])
 
-        P_tt = at.stack([p_0_rv, p_1_rv])
-        Gammas_tt = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_at = at.stack([p_0_rv, p_1_rv])
+        Gammas_at = pm.Deterministic(
+            "P", at.broadcast_to(P_at, (y_t.shape[0],) + tuple(P_at.shape))
+        )
 
-        gamma_0_rv = pm.Dirichlet("gamma_0", np.ones((S,)), shape=S)
+        gamma_0_rv = pm.Dirichlet("gamma_0", np.ones((S,)))
 
-        V_rv = DiscreteMarkovChain("V_t", Gammas_tt, gamma_0_rv, shape=y_t.shape[0])
-        V_rv.tag.test_value = (y_t > 0) * 1
+        V_rv = DiscreteMarkovChain("V_t", Gammas_at, gamma_0_rv, initval=(y_t > 0) * 1)
 
         _ = SwitchingProcess(
             "Y_t",
@@ -92,7 +84,7 @@ def test_only_positive_state():
             chains=1,
             draws=number_of_draws,
             return_inferencedata=True,
-            step=FFBSStep([V_rv]),
+            step=FFBSStep([V_rv], rng=rng),
         )
 
         posterior_pred_trace = pm.sample_posterior_predictive(
@@ -102,8 +94,7 @@ def test_only_positive_state():
 
 
 def test_time_varying_model():
-
-    np.random.seed(1039)
+    rng = np.random.RandomState(1039)
 
     data = gen_toy_data()
 
@@ -118,7 +109,7 @@ def test_time_varying_model():
 
     xis_rv_true = np.stack([xi_0_true, xi_1_true], axis=1)
 
-    with pm.Model(**TV_CONFIG) as sim_model:
+    with pm.Model(rng_seeder=rng) as sim_model:
         _ = create_dirac_zero_hmm(
             X_np, mu=1000, xis=xis_rv_true, observed=np.zeros(X_np.shape[0])
         )
@@ -136,18 +127,21 @@ def test_time_varying_model():
     Y = shared(train_y, name="y_t", borrow=True)
 
     with pm.Model() as model:
-        xis_rv = pm.Normal("xis", 0, 10, shape=xis_rv_true.shape)
+        xis_rv = pm.Normal("xis", 0, 10, size=xis_rv_true.shape)
         _ = create_dirac_zero_hmm(X, 1000, xis_rv, Y)
 
     number_of_draws = 500
 
     with model:
         steps = [
-            FFBSStep([model.V_t]),
+            FFBSStep([model.V_t], rng=rng),
             pm.NUTS(
                 vars=[
                     model.gamma_0,
-                    model.Gamma,
+                    # TODO FIXME: Using `model.Gamma` here fails.  This looks
+                    # like a v4 bug.  It should provide a better error than
+                    # the one it gave, at the very least.
+                    model.xis,
                 ],
                 target_accept=0.90,
             ),
@@ -157,7 +151,7 @@ def test_time_varying_model():
         posterior_trace = pm.sample(
             draws=number_of_draws,
             step=steps,
-            random_seed=100,
+            random_seed=1030,
             return_inferencedata=True,
             chains=1,
             cores=1,
@@ -165,28 +159,23 @@ def test_time_varying_model():
             idata_kwargs={"dims": {"Y_t": ["date"], "V_t": ["date"]}},
         )
 
-    # Update the shared variable values
-    Y.set_value(np.ones(test_X.shape[0], dtype=Y.dtype))
-    X.set_value(test_X)
-
-    model.V_t.distribution.shape = (test_X.shape[0],)
-
     hdi_data = az.hdi(posterior_trace, hdi_prob=0.95, var_names=["xis"]).to_dataframe()
     hdi_data = hdi_data.unstack(level="hdi")
 
     xis_true_flat = xis_rv_true.squeeze().flatten()
-    check_idx = ~np.in1d(
-        np.arange(len(xis_true_flat)), np.arange(3, len(xis_true_flat), step=4)
-    )
-    assert np.all(
-        xis_true_flat[check_idx] <= hdi_data["xis", "higher"].values[check_idx]
-    )
-    assert np.all(
-        xis_true_flat[check_idx] >= hdi_data["xis", "lower"].values[check_idx]
+    true_xis_under = xis_true_flat <= hdi_data["xis", "higher"].values
+    true_xis_above = xis_true_flat >= hdi_data["xis", "lower"].values
+
+    assert np.sum(~(true_xis_under ^ true_xis_above)) > int(
+        len(true_xis_under) * 2 / 3.0
     )
 
     trace = posterior_trace.posterior.drop_vars(["Gamma", "V_t"])
 
+    # Update the shared variable values for out-of-sample predictions
+    Y.set_value(np.ones(test_X.shape[0], dtype=Y.dtype))
+    X.set_value(test_X)
+
     with aesara.config.change_flags(compute_test_value="off"):
         adds_pois_ppc = pm.sample_posterior_predictive(
             trace, var_names=["V_t", "Y_t", "Gamma"], model=model
diff --git a/tests/test_step_methods.py b/tests/test_step_methods.py
index bdf05fa..93bd3ef 100644
--- a/tests/test_step_methods.py
+++ b/tests/test_step_methods.py
@@ -1,38 +1,47 @@
 import warnings
 
+import aesara
+import aesara.tensor as at
 import numpy as np
-
-try:
-    import aesara.tensor as at
-    from aesara.graph.op import get_test_value
-except ImportError:
-    import theano.tensor as at
-    from theano.graph.op import get_test_value
-
 import pymc3 as pm
 import pytest
 import scipy as sp
+from aesara.graph.op import get_test_value
 
 from pymc3_hmm.distributions import DiscreteMarkovChain, PoissonZeroProcess
 from pymc3_hmm.step_methods import FFBSStep, TransMatConjugateStep, ffbs_step
 from pymc3_hmm.utils import compute_steady_state, compute_trans_freqs
 from tests.utils import simulate_poiszero_hmm
 
+# @pytest.fixture()
+# def raise_under_overflow():
+#     with np.errstate(over="raise", under="raise"):
+#         yield
 
-@pytest.fixture()
-def raise_under_overflow():
-    with np.errstate(over="raise", under="raise"):
+@pytest.fixture(scope="module", autouse=True)
+def set_aesara_flags():
+    with aesara.config.change_flags(on_opt_error="raise", on_shape_error="raise"):
         yield
 
 
 # All tests in this module will raise on over- and under-flows (unless local
 # settings dictate otherwise)
-pytestmark = pytest.mark.usefixtures("raise_under_overflow")
+# pytestmark = pytest.mark.usefixtures("raise_under_overflow")
 
 
-def test_ffbs_step():
+def transform_var(model, rv_var):
+    value_var = model.rvs_to_values[rv_var]
+    transform = getattr(value_var.tag, "transform", None)
+    if transform is not None:
+        untrans_value_var = transform.forward(rv_var, value_var)
+        untrans_value_var.name = rv_var.name
+        return untrans_value_var
+    else:
+        return value_var
 
-    np.random.seed(2032)
+
+def test_ffbs_step():
+    rng = np.random.RandomState(2032)
 
     # A single transition matrix and initial probabilities vector for each
     # element in the state sequence
@@ -44,7 +53,7 @@ def test_ffbs_step():
     )
     alphas = np.empty(test_log_lik_0.shape)
     res = np.empty(test_log_lik_0.shape[-1])
-    ffbs_step(test_gamma_0, test_Gammas, test_log_lik_0, alphas, res)
+    ffbs_step(test_gamma_0, test_Gammas, test_log_lik_0, alphas, res, rng)
     assert np.all(res == 0)
 
     test_log_lik_1 = np.stack(
@@ -52,15 +61,15 @@ def test_ffbs_step():
     )
     alphas = np.empty(test_log_lik_1.shape)
     res = np.empty(test_log_lik_1.shape[-1])
-    ffbs_step(test_gamma_0, test_Gammas, test_log_lik_1, alphas, res)
+    ffbs_step(test_gamma_0, test_Gammas, test_log_lik_1, alphas, res, rng)
     assert np.all(res == 1)
 
     # A well-separated mixture with non-degenerate likelihoods
-    test_seq = np.random.choice(2, size=10000)
+    test_seq = rng.choice(2, size=10000)
     test_obs = np.where(
         np.logical_not(test_seq),
-        np.random.poisson(10, 10000),
-        np.random.poisson(50, 10000),
+        rng.poisson(10, 10000),
+        rng.poisson(50, 10000),
     )
     test_log_lik_p = np.stack(
         [sp.stats.poisson.logpmf(test_obs, 10), sp.stats.poisson.logpmf(test_obs, 50)],
@@ -71,7 +80,7 @@ def test_ffbs_step():
 
     alphas = np.empty(test_log_lik_p.shape)
     res = np.empty(test_log_lik_p.shape[-1])
-    ffbs_step(test_gamma_0, test_Gammas, test_log_lik_p, alphas, res)
+    ffbs_step(test_gamma_0, test_Gammas, test_log_lik_p, alphas, res, rng)
     # TODO FIXME: This is a statistically unsound/unstable check.
     assert np.mean(np.abs(res - test_seq)) < 1e-2
 
@@ -95,61 +104,73 @@ def test_ffbs_step():
 
     alphas = np.empty(test_log_lik.shape)
     res = np.empty(test_log_lik.shape[-1])
-    ffbs_step(test_gamma_0, test_Gammas, test_log_lik, alphas, res)
+    ffbs_step(test_gamma_0, test_Gammas, test_log_lik, alphas, res, rng)
     assert np.array_equal(res, np.r_[1, 0, 0, 1])
 
 
-def test_FFBSStep():
+def test_FFBSStep_errors():
 
     with pm.Model(), pytest.raises(ValueError):
-        P_rv = np.eye(2)[None, ...]
-        S_rv = DiscreteMarkovChain("S_t", P_rv, np.r_[1.0, 0.0], shape=10)
-        S_2_rv = DiscreteMarkovChain("S_2_t", P_rv, np.r_[0.0, 1.0], shape=10)
+        P_rv = np.broadcast_to(np.eye(2), (10, 2, 2))
+        S_rv = DiscreteMarkovChain("S_t", P_rv, np.r_[1.0, 0.0])
+        S_2_rv = DiscreteMarkovChain("S_2_t", P_rv, np.r_[0.0, 1.0])
         PoissonZeroProcess(
             "Y_t", 9.0, S_rv + S_2_rv, observed=np.random.poisson(9.0, size=10)
         )
         # Only one variable can be sampled by this step method
-        ffbs = FFBSStep([S_rv, S_2_rv])
+        FFBSStep([S_rv, S_2_rv])
 
     with pm.Model(), pytest.raises(TypeError):
-        S_rv = pm.Categorical("S_t", np.r_[1.0, 0.0], shape=10)
+        S_rv = pm.Categorical("S_t", np.r_[1.0, 0.0], size=10)
         PoissonZeroProcess("Y_t", 9.0, S_rv, observed=np.random.poisson(9.0, size=10))
         # Only `DiscreteMarkovChains` can be sampled with this step method
-        ffbs = FFBSStep([S_rv])
+        FFBSStep([S_rv])
 
     with pm.Model(), pytest.raises(TypeError):
-        P_rv = np.eye(2)[None, ...]
-        S_rv = DiscreteMarkovChain("S_t", P_rv, np.r_[1.0, 0.0], shape=10)
+        P_rv = np.broadcast_to(np.eye(2), (10, 2, 2))
+        S_rv = DiscreteMarkovChain("S_t", P_rv, np.r_[1.0, 0.0])
         pm.Poisson("Y_t", S_rv, observed=np.random.poisson(9.0, size=10))
         # Only `SwitchingProcess`es can used as dependent variables
-        ffbs = FFBSStep([S_rv])
+        FFBSStep([S_rv])
+
 
-    np.random.seed(2032)
+def test_FFBSStep_sim():
+    rng = np.random.RandomState(2031)
 
-    poiszero_sim, _ = simulate_poiszero_hmm(30, 150)
+    poiszero_sim, _ = simulate_poiszero_hmm(30, 150, rng=rng)
     y_test = poiszero_sim["Y_t"]
 
-    with pm.Model() as test_model:
-        p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
-        p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
+    with pm.Model(rng_seeder=rng) as test_model:
+        p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
+        p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])
 
         P_tt = at.stack([p_0_rv, p_1_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
 
-        pi_0_tt = compute_steady_state(P_rv)
+        P_rv = pm.Deterministic(
+            "P_tt", at.broadcast_to(P_tt, (y_test.shape[0],) + tuple(P_tt.shape))
+        )
+
+        pi_0_tt = compute_steady_state(P_tt)
 
-        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt, shape=y_test.shape[0])
+        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt)
 
         PoissonZeroProcess("Y_t", 9.0, S_rv, observed=y_test)
 
     with test_model:
-        ffbs = FFBSStep([S_rv])
+        ffbs = FFBSStep([S_rv], rng=rng)
 
-    test_point = test_model.test_point.copy()
-    test_point["p_0_stickbreaking__"] = poiszero_sim["p_0_stickbreaking__"]
-    test_point["p_1_stickbreaking__"] = poiszero_sim["p_1_stickbreaking__"]
+    p_0_stickbreaking__fn = aesara.function(
+        [test_model.rvs_to_values[p_0_rv]], transform_var(test_model, p_0_rv)
+    )
+    p_1_stickbreaking__fn = aesara.function(
+        [test_model.rvs_to_values[p_1_rv]], transform_var(test_model, p_1_rv)
+    )
 
-    res = ffbs.step(test_point)
+    initial_point = test_model.initial_point.copy()
+    initial_point["p_0_stickbreaking__"] = p_0_stickbreaking__fn(poiszero_sim["p_0"])
+    initial_point["p_1_stickbreaking__"] = p_1_stickbreaking__fn(poiszero_sim["p_1"])
+
+    res = ffbs.step(initial_point)
 
     assert np.array_equal(res["S_t"], poiszero_sim["S_t"])
 
@@ -157,39 +178,37 @@ def test_FFBSStep():
 def test_FFBSStep_extreme():
     """Test a long series with extremely large mixture separation (and, thus, very small likelihoods)."""  # noqa: E501
 
-    np.random.seed(2032)
+    rng = np.random.RandomState(222)
 
     mu_true = 5000
-    poiszero_sim, _ = simulate_poiszero_hmm(9000, mu_true)
+    poiszero_sim, _ = simulate_poiszero_hmm(9000, mu_true, rng=rng)
     y_test = poiszero_sim["Y_t"]
 
-    with pm.Model() as test_model:
+    with pm.Model(rng_seeder=rng) as test_model:
         p_0_rv = poiszero_sim["p_0"]
         p_1_rv = poiszero_sim["p_1"]
 
         P_tt = at.stack([p_0_rv, p_1_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_rv = pm.Deterministic(
+            "P_tt", at.broadcast_to(P_tt, (y_test.shape[0],) + tuple(P_tt.shape))
+        )
 
         pi_0_tt = poiszero_sim["pi_0"]
 
-        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt, shape=y_test.shape[0])
+        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt)
         S_rv.tag.test_value = (y_test > 0).astype(int)
 
         # This prior is very far from the true value...
-        E_mu, Var_mu = 100.0, 10000.0
+        E_mu, Var_mu = 500.0, 10000.0
         mu_rv = pm.Gamma("mu", E_mu ** 2 / Var_mu, E_mu / Var_mu)
 
         PoissonZeroProcess("Y_t", mu_rv, S_rv, observed=y_test)
 
     with test_model:
-        ffbs = FFBSStep([S_rv])
-
-    test_point = test_model.test_point.copy()
-    test_point["p_0_stickbreaking__"] = poiszero_sim["p_0_stickbreaking__"]
-    test_point["p_1_stickbreaking__"] = poiszero_sim["p_1_stickbreaking__"]
+        ffbs = FFBSStep([S_rv], rng=rng)
 
     with np.errstate(over="ignore", under="ignore"):
-        res = ffbs.step(test_point)
+        res = ffbs.step(test_model.initial_point)
 
     assert np.array_equal(res["S_t"], poiszero_sim["S_t"])
 
@@ -203,50 +222,53 @@ def test_FFBSStep_extreme():
         ffbs = FFBSStep([S_rv])
         steps = [ffbs, mu_step]
         trace = pm.sample(
-            20,
+            50,
             step=steps,
             cores=1,
             chains=1,
             tune=100,
             n_init=100,
-            progressbar=False,
+            progressbar=True,
         )
 
-        assert not trace.get_sampler_stats("diverging").all()
-        assert trace["mu"].mean() > 1000.0
+        assert not trace.sample_stats.diverging.all()
+        assert trace.posterior.mu.mean() > 1000.0
 
 
+@pytest.mark.xfail(reason="Not refactored for v4, yet.")
 def test_TransMatConjugateStep():
 
     with pm.Model() as test_model, pytest.raises(ValueError):
-        p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
+        p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
         transmat = TransMatConjugateStep(p_0_rv)
 
-    np.random.seed(2032)
+    rng = aesara.shared(np.random.RandomState(2032), borrow=True)
 
-    poiszero_sim, _ = simulate_poiszero_hmm(30, 150)
+    poiszero_sim, _ = simulate_poiszero_hmm(30, 150, rng=rng)
     y_test = poiszero_sim["Y_t"]
 
-    with pm.Model() as test_model:
-        p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
-        p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
+    with pm.Model(default_rng=rng) as test_model:
+        p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
+        p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])
 
         P_tt = at.stack([p_0_rv, p_1_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_rv = pm.Deterministic(
+            "P_tt", at.broadcast_to(P_tt, (y_test.shape[0],) + tuple(P_tt.shape))
+        )
 
-        pi_0_tt = compute_steady_state(P_rv)
+        pi_0_tt = compute_steady_state(P_tt)
 
-        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt, shape=y_test.shape[0])
+        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt)
 
         PoissonZeroProcess("Y_t", 9.0, S_rv, observed=y_test)
 
     with test_model:
         transmat = TransMatConjugateStep(P_rv)
 
-    test_point = test_model.test_point.copy()
-    test_point["S_t"] = (y_test > 0).astype(int)
+    initial_point = test_model.initial_point.copy()
+    initial_point["S_t"] = (y_test > 0).astype(int)
 
-    res = transmat.step(test_point)
+    res = transmat.step(initial_point)
 
     p_0_smpl = get_test_value(
         p_0_rv.distribution.transform.backward(res[p_0_rv.transformed.name])
@@ -266,13 +288,14 @@ def test_TransMatConjugateStep():
     assert np.allclose(sampled_trans_mat, true_trans_mat, atol=0.3)
 
 
+@pytest.mark.xfail(reason="Not refactored for v4, yet.")
 def test_TransMatConjugateStep_subtensors():
 
     # Confirm that Dirichlet/non-Dirichlet mixed rows can be
     # parsed
     with pm.Model():
-        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
-        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
+        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
+        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])
 
         p_0_rv = at.as_tensor([0, 0, 1])
         p_1_rv = at.zeros(3)
@@ -281,8 +304,10 @@ def test_TransMatConjugateStep_subtensors():
         p_2_rv = at.set_subtensor(p_1_rv[[1, 2]], d_1_rv)
 
         P_tt = at.stack([p_0_rv, p_1_rv, p_2_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
-        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(10,))
+        P_rv = pm.Deterministic(
+            "P_tt", at.broadcast_to(P_tt, (10,) + tuple(P_tt.shape))
+        )
+        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0])
 
         transmat = TransMatConjugateStep(P_rv)
 
@@ -295,8 +320,8 @@ def test_TransMatConjugateStep_subtensors():
 
     # Same thing, just with some manipulations of the transition matrix
     with pm.Model():
-        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
-        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
+        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
+        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])
 
         p_0_rv = at.as_tensor([0, 0, 1])
         p_1_rv = at.zeros(3)
@@ -307,8 +332,10 @@ def test_TransMatConjugateStep_subtensors():
         P_tt = at.horizontal_stack(
             p_0_rv[..., None], p_1_rv[..., None], p_2_rv[..., None]
         )
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt.T))
-        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(10,))
+        P_rv = pm.Deterministic(
+            "P_tt", at.broadcast_to(P_tt.T, (10,) + tuple(P_tt.T.shape))
+        )
+        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0])
 
         transmat = TransMatConjugateStep(P_rv)
 
@@ -321,8 +348,8 @@ def test_TransMatConjugateStep_subtensors():
 
     # Use an observed `DiscreteMarkovChain` and check the conjugate results
     with pm.Model():
-        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
-        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
+        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
+        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])
 
         p_0_rv = at.as_tensor([0, 0, 1])
         p_1_rv = at.zeros(3)
@@ -333,9 +360,9 @@ def test_TransMatConjugateStep_subtensors():
         P_tt = at.horizontal_stack(
             p_0_rv[..., None], p_1_rv[..., None], p_2_rv[..., None]
         )
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt.T))
-        DiscreteMarkovChain(
-            "S_t", P_rv, np.r_[1, 0, 0], shape=(4,), observed=np.r_[0, 1, 0, 2]
+        P_rv = pm.Deterministic(
+            "P_tt", at.broadcast_to(P_tt.T, (4,) + tuple(P_tt.T.shape))
         )
+        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], observed=np.r_[0, 1, 0, 2])
 
         transmat = TransMatConjugateStep(P_rv)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d9b62dd..ef721a8 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,20 +1,14 @@
+import aesara.tensor as at
 import numpy as np
 import pytest
 import scipy as sp
 
-try:
-    import aesara
-    import aesara.tensor as at
-except ImportError:
-    import theano as aesara
-    import theano.tensor as at
-
 from pymc3_hmm.utils import (
     compute_trans_freqs,
     logdotexp,
+    logsumexp,
     multilogit_inv,
-    tt_logdotexp,
-    tt_logsumexp,
+    np_logdotexp,
 )
 
 
@@ -40,62 +34,60 @@ def test_compute_trans_freqs():
 )
 def test_logsumexp(test_input):
     np_res = sp.special.logsumexp(test_input)
-    tt_res = tt_logsumexp(at.as_tensor_variable(test_input)).eval()
-    assert np.array_equal(np_res, tt_res)
+    at_res = logsumexp(at.as_tensor_variable(test_input)).eval()
+    assert np.array_equal(np_res, at_res)
 
 
-def test_logdotexp():
+def test_np_logdotexp():
     A = np.c_[[1.0, 2.0], [3.0, 4.0], [10.0, 20.0]]
     b = np.c_[[0.1], [0.2], [30.0]].T
 
-    test_res = logdotexp(np.log(A), np.log(b))
+    test_res = np_logdotexp(np.log(A), np.log(b))
     assert test_res.shape == (2, 1)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     b = np.r_[0.1, 0.2, 30.0]
-    test_res = logdotexp(np.log(A), np.log(b))
+    test_res = np_logdotexp(np.log(A), np.log(b))
     assert test_res.shape == (2,)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     A = np.c_[[1.0, 2.0], [10.0, 20.0]]
     b = np.c_[[0.1], [0.2]].T
-    test_res = logdotexp(np.log(A), np.log(b))
+    test_res = np_logdotexp(np.log(A), np.log(b))
     assert test_res.shape == (2, 1)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     b = np.r_[0.1, 0.2]
-    test_res = logdotexp(np.log(A), np.log(b))
+    test_res = np_logdotexp(np.log(A), np.log(b))
     assert test_res.shape == (2,)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
 
-def test_tt_logdotexp():
+def test_at_logdotexp():
 
     np.seterr(over="ignore", under="ignore")
 
-    aesara.config.compute_test_value = "warn"
-
     A = np.c_[[1.0, 2.0], [3.0, 4.0], [10.0, 20.0]]
     b = np.c_[[0.1], [0.2], [30.0]].T
     A_tt = at.as_tensor_variable(A)
     b_tt = at.as_tensor_variable(b)
-    test_res = tt_logdotexp(at.log(A_tt), at.log(b_tt)).eval()
+    test_res = logdotexp(at.log(A_tt), at.log(b_tt)).eval()
     assert test_res.shape == (2, 1)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     b = np.r_[0.1, 0.2, 30.0]
-    test_res = tt_logdotexp(at.log(A), at.log(b)).eval()
+    test_res = logdotexp(at.log(A), at.log(b)).eval()
     assert test_res.shape == (2,)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     A = np.c_[[1.0, 2.0], [10.0, 20.0]]
     b = np.c_[[0.1], [0.2]].T
-    test_res = tt_logdotexp(at.log(A), at.log(b)).eval()
+    test_res = logdotexp(at.log(A), at.log(b)).eval()
     assert test_res.shape == (2, 1)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     b = np.r_[0.1, 0.2]
-    test_res = tt_logdotexp(at.log(A), at.log(b)).eval()
+    test_res = logdotexp(at.log(A), at.log(b)).eval()
     assert test_res.shape == (2,)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
diff --git a/tests/utils.py b/tests/utils.py
index c339ba1..26d8e20 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,29 +1,34 @@
+import aesara.tensor as at
 import numpy as np
-
-try:
-    import aesara.tensor as at
-except ImportError:
-    import theano.tensor as at
-
 import pymc3 as pm
+from aesara.graph.basic import ancestors
+from aesara.tensor.random.op import RandomVariable
 
 from pymc3_hmm.distributions import DiscreteMarkovChain, PoissonZeroProcess
 
 
+def assert_no_rvs(var):
+    assert not any(
+        isinstance(v.owner.op, RandomVariable) for v in ancestors([var]) if v.owner
+    )
+    return var
+
+
 def simulate_poiszero_hmm(
-    N, mu=10.0, pi_0_a=np.r_[1, 1], p_0_a=np.r_[5, 1], p_1_a=np.r_[1, 1]
+    N, mu=10.0, pi_0_a=np.r_[1, 1], p_0_a=np.r_[5, 1], p_1_a=np.r_[1, 1], rng=None
 ):
 
-    with pm.Model() as test_model:
-        p_0_rv = pm.Dirichlet("p_0", p_0_a, shape=np.shape(pi_0_a))
-        p_1_rv = pm.Dirichlet("p_1", p_1_a, shape=np.shape(pi_0_a))
+    with pm.Model(rng_seeder=rng) as test_model:
+        p_0_rv = pm.Dirichlet("p_0", p_0_a)
+        p_1_rv = pm.Dirichlet("p_1", p_1_a)
 
         P_tt = at.stack([p_0_rv, p_1_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_tt = at.broadcast_to(P_tt, (N,) + tuple(P_tt.shape))
+        P_rv = pm.Deterministic("P_tt", P_tt)
 
-        pi_0_tt = pm.Dirichlet("pi_0", pi_0_a, shape=np.shape(pi_0_a))
+        pi_0_tt = pm.Dirichlet("pi_0", pi_0_a)
 
-        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt, shape=N)
+        S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt)
 
         PoissonZeroProcess("Y_t", mu, S_rv, observed=np.zeros(N))
 
@@ -31,8 +36,5 @@ def simulate_poiszero_hmm(
 
         # Remove the extra "sampling" dimension from the sample results
         sample_point = {k: v.squeeze(0) for k, v in sample_point.items()}
-        # Remove the extra dimension added due to `pm.sample_prior_predictive`
-        # forcing `size=1` in its call to `test_model.Y_t.random`.
-        sample_point["Y_t"] = sample_point["Y_t"].squeeze(0)
 
     return sample_point, test_model