diff --git a/benchmarks/benchmarks/benchmarks.py b/benchmarks/benchmarks/benchmarks.py index 5035d7f08b..0fb1883906 100644 --- a/benchmarks/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks/benchmarks.py @@ -25,23 +25,23 @@ def glm_hierarchical_model(random_seed=123): """Sample glm hierarchical model to use in benchmarks""" np.random.seed(random_seed) - data = pd.read_csv(pm.get_data('radon.csv')) - data['log_radon'] = data['log_radon'].astype(theano.config.floatX) + data = pd.read_csv(pm.get_data("radon.csv")) + data["log_radon"] = data["log_radon"].astype(theano.config.floatX) county_idx = data.county_code.values n_counties = len(data.county.unique()) with pm.Model() as model: - mu_a = pm.Normal('mu_a', mu=0., sd=100**2) - sigma_a = pm.HalfCauchy('sigma_a', 5) - mu_b = pm.Normal('mu_b', mu=0., sd=100**2) - sigma_b = pm.HalfCauchy('sigma_b', 5) - a = pm.Normal('a', mu=0, sd=1, shape=n_counties) - b = pm.Normal('b', mu=0, sd=1, shape=n_counties) + mu_a = pm.Normal("mu_a", mu=0.0, sd=100 ** 2) + sigma_a = pm.HalfCauchy("sigma_a", 5) + mu_b = pm.Normal("mu_b", mu=0.0, sd=100 ** 2) + sigma_b = pm.HalfCauchy("sigma_b", 5) + a = pm.Normal("a", mu=0, sd=1, shape=n_counties) + b = pm.Normal("b", mu=0, sd=1, shape=n_counties) a = mu_a + sigma_a * a b = mu_b + sigma_b * b - eps = pm.HalfCauchy('eps', 5) + eps = pm.HalfCauchy("eps", 5) radon_est = a[county_idx] + b[county_idx] * data.floor.values - pm.Normal('radon_like', mu=radon_est, sd=eps, observed=data.log_radon) + pm.Normal("radon_like", mu=radon_est, sd=eps, observed=data.log_radon) return model @@ -50,24 +50,27 @@ def mixture_model(random_seed=1234): np.random.seed(1234) size = 1000 w_true = np.array([0.35, 0.4, 0.25]) - mu_true = np.array([0., 2., 5.]) - sigma = np.array([0.5, 0.5, 1.]) + mu_true = np.array([0.0, 2.0, 5.0]) + sigma = np.array([0.5, 0.5, 1.0]) component = np.random.choice(mu_true.size, size=size, p=w_true) x = np.random.normal(mu_true[component], sigma[component], size=size) with pm.Model() as model: - w = pm.Dirichlet('w', a=np.ones_like(w_true)) - mu = pm.Normal('mu', mu=0., sd=10., shape=w_true.shape) - enforce_order = pm.Potential('enforce_order', tt.switch(mu[0] - mu[1] <= 0, 0., -np.inf) + - tt.switch(mu[1] - mu[2] <= 0, 0., -np.inf)) - tau = pm.Gamma('tau', alpha=1., beta=1., shape=w_true.shape) - pm.NormalMixture('x_obs', w=w, mu=mu, tau=tau, observed=x) + w = pm.Dirichlet("w", a=np.ones_like(w_true)) + mu = pm.Normal("mu", mu=0.0, sd=10.0, shape=w_true.shape) + enforce_order = pm.Potential( + "enforce_order", + tt.switch(mu[0] - mu[1] <= 0, 0.0, -np.inf) + + tt.switch(mu[1] - mu[2] <= 0, 0.0, -np.inf), + ) + tau = pm.Gamma("tau", alpha=1.0, beta=1.0, shape=w_true.shape) + pm.NormalMixture("x_obs", w=w, mu=mu, tau=tau, observed=x) # Initialization can be poorly specified, this is a hack to make it work start = { - 'mu': mu_true.copy(), - 'tau_log__': np.log(1. 
/ sigma**2), - 'w_stickbreaking__': np.array([-0.03, 0.44]) + "mu": mu_true.copy(), + "tau_log__": np.log(1.0 / sigma ** 2), + "w_stickbreaking__": np.array([-0.03, 0.44]), } return model, start @@ -77,76 +80,175 @@ class OverheadSuite: Just tests how long sampling from a normal distribution takes for various samplers """ + params = [pm.NUTS, pm.HamiltonianMC, pm.Metropolis, pm.Slice] timer = timeit.default_timer def setup(self, step): self.n_steps = 10000 with pm.Model() as self.model: - pm.Normal('x', mu=0, sd=1) + pm.Normal("x", mu=0, sd=1) def time_overhead_sample(self, step): with self.model: - pm.sample(self.n_steps, step=step(), random_seed=1, - progressbar=False, compute_convergence_checks=False) + pm.sample( + self.n_steps, + step=step(), + random_seed=1, + progressbar=False, + compute_convergence_checks=False, + ) class ExampleSuite: """Implements examples to keep up with benchmarking them.""" + timeout = 360.0 # give it a few minutes timer = timeit.default_timer def time_drug_evaluation(self): - drug = np.array([101, 100, 102, 104, 102, 97, 105, 105, 98, 101, - 100, 123, 105, 103, 100, 95, 102, 106, 109, 102, 82, - 102, 100, 102, 102, 101, 102, 102, 103, 103, 97, 97, - 103, 101, 97, 104, 96, 103, 124, 101, 101, 100, 101, - 101, 104, 100, 101]) - placebo = np.array([99, 101, 100, 101, 102, 100, 97, 101, 104, 101, - 102, 102, 100, 105, 88, 101, 100, 104, 100, 100, - 100, 101, 102, 103, 97, 101, 101, 100, 101, 99, - 101, 100, 100, 101, 100, 99, 101, 100, 102, 99, - 100, 99]) - - y = pd.DataFrame({ - 'value': np.r_[drug, placebo], - 'group': np.r_[['drug']*len(drug), ['placebo']*len(placebo)] - }) + drug = np.array( + [ + 101, + 100, + 102, + 104, + 102, + 97, + 105, + 105, + 98, + 101, + 100, + 123, + 105, + 103, + 100, + 95, + 102, + 106, + 109, + 102, + 82, + 102, + 100, + 102, + 102, + 101, + 102, + 102, + 103, + 103, + 97, + 97, + 103, + 101, + 97, + 104, + 96, + 103, + 124, + 101, + 101, + 100, + 101, + 101, + 104, + 100, + 101, + ] + ) + placebo = np.array( + [ + 99, + 101, + 100, + 101, + 102, + 100, + 97, + 101, + 104, + 101, + 102, + 102, + 100, + 105, + 88, + 101, + 100, + 104, + 100, + 100, + 100, + 101, + 102, + 103, + 97, + 101, + 101, + 100, + 101, + 99, + 101, + 100, + 100, + 101, + 100, + 99, + 101, + 100, + 102, + 99, + 100, + 99, + ] + ) + + y = pd.DataFrame( + { + "value": np.r_[drug, placebo], + "group": np.r_[["drug"] * len(drug), ["placebo"] * len(placebo)], + } + ) y_mean = y.value.mean() y_std = y.value.std() * 2 sigma_low = 1 sigma_high = 10 with pm.Model(): - group1_mean = pm.Normal('group1_mean', y_mean, sd=y_std) - group2_mean = pm.Normal('group2_mean', y_mean, sd=y_std) - group1_std = pm.Uniform('group1_std', lower=sigma_low, upper=sigma_high) - group2_std = pm.Uniform('group2_std', lower=sigma_low, upper=sigma_high) - lambda_1 = group1_std**-2 - lambda_2 = group2_std**-2 - - nu = pm.Exponential('ν_minus_one', 1/29.) 
+ 1 - - pm.StudentT('drug', nu=nu, mu=group1_mean, lam=lambda_1, observed=drug) - pm.StudentT('placebo', nu=nu, mu=group2_mean, lam=lambda_2, observed=placebo) - diff_of_means = pm.Deterministic('difference of means', group1_mean - group2_mean) - pm.Deterministic('difference of stds', group1_std - group2_std) + group1_mean = pm.Normal("group1_mean", y_mean, sd=y_std) + group2_mean = pm.Normal("group2_mean", y_mean, sd=y_std) + group1_std = pm.Uniform("group1_std", lower=sigma_low, upper=sigma_high) + group2_std = pm.Uniform("group2_std", lower=sigma_low, upper=sigma_high) + lambda_1 = group1_std ** -2 + lambda_2 = group2_std ** -2 + + nu = pm.Exponential("ν_minus_one", 1 / 29.0) + 1 + + pm.StudentT("drug", nu=nu, mu=group1_mean, lam=lambda_1, observed=drug) + pm.StudentT("placebo", nu=nu, mu=group2_mean, lam=lambda_2, observed=placebo) + diff_of_means = pm.Deterministic("difference of means", group1_mean - group2_mean) + pm.Deterministic("difference of stds", group1_std - group2_std) pm.Deterministic( - 'effect size', diff_of_means / np.sqrt((group1_std**2 + group2_std**2) / 2)) - pm.sample(draws=20000, cores=4, chains=4, - progressbar=False, compute_convergence_checks=False) + "effect size", diff_of_means / np.sqrt((group1_std ** 2 + group2_std ** 2) / 2) + ) + pm.sample( + draws=20000, cores=4, chains=4, progressbar=False, compute_convergence_checks=False + ) def time_glm_hierarchical(self): with glm_hierarchical_model(): - pm.sample(draws=20000, cores=4, chains=4, - progressbar=False, compute_convergence_checks=False) + pm.sample( + draws=20000, cores=4, chains=4, progressbar=False, compute_convergence_checks=False + ) class NUTSInitSuite: - """Tests initializations for NUTS sampler on models - """ + """Tests initializations for NUTS sampler on models""" + timeout = 360.0 - params = ('adapt_diag', 'jitter+adapt_diag', 'jitter+adapt_full', 'adapt_full') + params = ("adapt_diag", "jitter+adapt_diag", "jitter+adapt_full", "adapt_full") number = 1 repeat = 1 draws = 10000 @@ -159,32 +261,49 @@ def time_glm_hierarchical_init(self, init): def track_glm_hierarchical_ess(self, init): with glm_hierarchical_model(): - start, step = pm.init_nuts(init=init, chains=self.chains, progressbar=False, random_seed=123) + start, step = pm.init_nuts( + init=init, chains=self.chains, progressbar=False, random_seed=123 + ) t0 = time.time() - trace = pm.sample(draws=self.draws, step=step, cores=4, chains=self.chains, - start=start, random_seed=100, progressbar=False, - compute_convergence_checks=False) + trace = pm.sample( + draws=self.draws, + step=step, + cores=4, + chains=self.chains, + start=start, + random_seed=100, + progressbar=False, + compute_convergence_checks=False, + ) tot = time.time() - t0 - ess = float(pm.ess(trace, var_names=['mu_a'])['mu_a'].values) + ess = float(pm.ess(trace, var_names=["mu_a"])["mu_a"].values) return ess / tot def track_marginal_mixture_model_ess(self, init): model, start = mixture_model() with model: - _, step = pm.init_nuts(init=init, chains=self.chains, - progressbar=False, random_seed=123) + _, step = pm.init_nuts( + init=init, chains=self.chains, progressbar=False, random_seed=123 + ) start = [{k: v for k, v in start.items()} for _ in range(self.chains)] t0 = time.time() - trace = pm.sample(draws=self.draws, step=step, cores=4, chains=self.chains, - start=start, random_seed=100, progressbar=False, - compute_convergence_checks=False) + trace = pm.sample( + draws=self.draws, + step=step, + cores=4, + chains=self.chains, + start=start, + random_seed=100, + 
progressbar=False, + compute_convergence_checks=False, + ) tot = time.time() - t0 - ess = pm.ess(trace, var_names=['mu'])['mu'].values.min() # worst case + ess = pm.ess(trace, var_names=["mu"])["mu"].values.min() # worst case return ess / tot -NUTSInitSuite.track_glm_hierarchical_ess.unit = 'Effective samples per second' -NUTSInitSuite.track_marginal_mixture_model_ess.unit = 'Effective samples per second' +NUTSInitSuite.track_glm_hierarchical_ess.unit = "Effective samples per second" +NUTSInitSuite.track_marginal_mixture_model_ess.unit = "Effective samples per second" class CompareMetropolisNUTSSuite: @@ -200,15 +319,21 @@ def track_glm_hierarchical_ess(self, step): if step is not None: step = step() t0 = time.time() - trace = pm.sample(draws=self.draws, step=step, cores=4, chains=4, - random_seed=100, progressbar=False, - compute_convergence_checks=False) + trace = pm.sample( + draws=self.draws, + step=step, + cores=4, + chains=4, + random_seed=100, + progressbar=False, + compute_convergence_checks=False, + ) tot = time.time() - t0 - ess = float(pm.ess(trace, var_names=['mu_a'])['mu_a'].values) + ess = float(pm.ess(trace, var_names=["mu_a"])["mu_a"].values) return ess / tot -CompareMetropolisNUTSSuite.track_glm_hierarchical_ess.unit = 'Effective samples per second' +CompareMetropolisNUTSSuite.track_glm_hierarchical_ess.unit = "Effective samples per second" class DifferentialEquationSuite: @@ -223,30 +348,34 @@ def freefall(y, t, p): # Times for observation times = np.arange(0, 10, 0.5) - y = np.array([ - -2.01, - 9.49, - 15.58, - 16.57, - 27.58, - 32.26, - 35.13, - 38.07, - 37.36, - 38.83, - 44.86, - 43.58, - 44.59, - 42.75, - 46.9, - 49.32, - 44.06, - 49.86, - 46.48, - 48.18 - ]).reshape(-1, 1) - - ode_model = pm.ode.DifferentialEquation(func=freefall, times=times, n_states=1, n_theta=2, t0=0) + y = np.array( + [ + -2.01, + 9.49, + 15.58, + 16.57, + 27.58, + 32.26, + 35.13, + 38.07, + 37.36, + 38.83, + 44.86, + 43.58, + 44.59, + 42.75, + 46.9, + 49.32, + 44.06, + 49.86, + 46.48, + 48.18, + ] + ).reshape(-1, 1) + + ode_model = pm.ode.DifferentialEquation( + func=freefall, times=times, n_states=1, n_theta=2, t0=0 + ) with pm.Model() as model: # Specify prior distributions for some of our model parameters sigma = pm.HalfCauchy("sigma", 1) @@ -263,4 +392,4 @@ def freefall(y, t, p): return np.mean([ess.sigma, ess.gamma]) / tot -DifferentialEquationSuite.track_1var_2par_ode_ess.unit = 'Effective samples per second' +DifferentialEquationSuite.track_1var_2par_ode_ess.unit = "Effective samples per second" diff --git a/docs/source/conf.py b/docs/source/conf.py index 67f82eebb0..2d3743c01b 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -155,7 +155,7 @@ ("Books + Videos", "learn"), ("API", "api"), ("Developer Guide", "developer_guide"), - ("About PyMC3", "about") + ("About PyMC3", "about"), ], # "fixed_sidebar": "false", # "description": "Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano" @@ -264,9 +264,7 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, "pymc3.tex", "PyMC3 Documentation", "PyMC developers", "manual") -] +latex_documents = [(master_doc, "pymc3.tex", "PyMC3 Documentation", "PyMC developers", "manual")] # The name of an image file (relative to this directory) to place at the top of # the title page. 
@@ -330,7 +328,5 @@ def setup(app): - app.add_css_file( - "https://cdn.jsdelivr.net/npm/semantic-ui@2.4.2/dist/semantic.min.css" - ) + app.add_css_file("https://cdn.jsdelivr.net/npm/semantic-ui@2.4.2/dist/semantic.min.css") app.add_css_file("default.css") diff --git a/docs/source/sphinxext/gallery_generator.py b/docs/source/sphinxext/gallery_generator.py index 2e9c716a8f..b3fdd23637 100644 --- a/docs/source/sphinxext/gallery_generator.py +++ b/docs/source/sphinxext/gallery_generator.py @@ -18,9 +18,7 @@ from matplotlib import image DOC_SRC = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -DEFAULT_IMG_LOC = os.path.join( - os.path.dirname(DOC_SRC), "logos", "PyMC3.png" - ) +DEFAULT_IMG_LOC = os.path.join(os.path.dirname(DOC_SRC), "logos", "PyMC3.png") TABLE_OF_CONTENTS_FILENAME = "table_of_contents_{}.js" INDEX_TEMPLATE = """ @@ -69,13 +67,9 @@ class NotebookGenerator: def __init__(self, filename, target_dir): self.basename = os.path.basename(filename) self.stripped_name = os.path.splitext(self.basename)[0] - self.output_html = os.path.join( - "..", "notebooks", f"{self.stripped_name}.html" - ) + self.output_html = os.path.join("..", "notebooks", f"{self.stripped_name}.html") self.image_dir = os.path.join(target_dir, "_images") - self.png_path = os.path.join( - self.image_dir, f"{self.stripped_name}.png" - ) + self.png_path = os.path.join(self.image_dir, f"{self.stripped_name}.png") with open(filename) as fid: self.json_source = json.load(fid) self.pagetitle = self.extract_title() @@ -141,8 +135,7 @@ def build_gallery(srcdir, gallery): source_dir = os.path.abspath( os.path.join(os.path.dirname(os.path.dirname(srcdir)), "notebooks") ) - table_of_contents_file = os.path.join( - source_dir, TABLE_OF_CONTENTS_FILENAME.format(gallery)) + table_of_contents_file = os.path.join(source_dir, TABLE_OF_CONTENTS_FILENAME.format(gallery)) tocjs = TableOfContentsJS() tocjs.load(table_of_contents_file) @@ -157,7 +150,7 @@ def build_gallery(srcdir, gallery): if not os.path.exists(source_dir): os.makedirs(source_dir) - + # Create default image default_png_path = os.path.join(os.path.join(target_dir, "_images"), "default.png") shutil.copy(DEFAULT_IMG_LOC, default_png_path) @@ -178,7 +171,7 @@ def build_gallery(srcdir, gallery): filename = basename.split(".")[0] data[basename] = { "title": " ".join(filename.split("_")), - "url": os.path.join(os.sep, gallery, "../"+filename+".html"), + "url": os.path.join(os.sep, gallery, "../" + filename + ".html"), "thumb": os.path.basename(default_png_path), } @@ -186,19 +179,17 @@ def build_gallery(srcdir, gallery): with open(table_of_contents_file) as toc: table_of_contents = toc.read() - js_contents = "Gallery.examples = {}\n{}".format( - json.dumps(data), table_of_contents - ) + js_contents = "Gallery.examples = {}\n{}".format(json.dumps(data), table_of_contents) with open(js_file, "w") as js: js.write(js_contents) with open(os.path.join(target_dir, "index.rst"), "w") as index: - index.write(INDEX_TEMPLATE.format( - sphinx_tag="notebook_gallery", - gallery=gallery, - Gallery=gallery.title().rstrip("s") - )) + index.write( + INDEX_TEMPLATE.format( + sphinx_tag="notebook_gallery", gallery=gallery, Gallery=gallery.title().rstrip("s") + ) + ) os.chdir(working_dir) diff --git a/pymc3/gp/cov.py b/pymc3/gp/cov.py index df8ff24fd4..c23b18817d 100644 --- a/pymc3/gp/cov.py +++ b/pymc3/gp/cov.py @@ -87,10 +87,13 @@ def full(self, X, Xs): def _slice(self, X, Xs): if self.input_dim != X.shape[-1]: - warnings.warn(f"Only {self.input_dim} column(s) out of {X.shape[-1]} 
are" - " being used to compute the covariance function. If this" - " is not intended, increase 'input_dim' parameter to" - " the number of columns to use. Ignore otherwise.", UserWarning) + warnings.warn( + f"Only {self.input_dim} column(s) out of {X.shape[-1]} are" + " being used to compute the covariance function. If this" + " is not intended, increase 'input_dim' parameter to" + " the number of columns to use. Ignore otherwise.", + UserWarning, + ) X = tt.as_tensor_variable(X[:, self.active_dims]) if Xs is not None: Xs = tt.as_tensor_variable(Xs[:, self.active_dims]) @@ -109,9 +112,9 @@ def __rmul__(self, other): return self.__mul__(other) def __pow__(self, other): - if( - isinstance(other, theano.compile.SharedVariable) and - other.get_value().squeeze().shape == () + if ( + isinstance(other, theano.compile.SharedVariable) + and other.get_value().squeeze().shape == () ): other = tt.squeeze(other) return Exponentiated(self, other) @@ -123,7 +126,6 @@ def __pow__(self, other): raise ValueError("A covariance function can only be exponentiated by a scalar value") - def __array_wrap__(self, result): """ Required to allow radd/rmul by numpy arrays. @@ -132,7 +134,9 @@ def __array_wrap__(self, result): if len(result.shape) <= 1: result = result.reshape(1, 1) elif len(result.shape) > 2: - raise ValueError(f"cannot combine a covariance function with array of shape {result.shape}") + raise ValueError( + f"cannot combine a covariance function with array of shape {result.shape}" + ) r, c = result.shape A = np.zeros((r, c)) for i in range(r): @@ -149,11 +153,7 @@ def __array_wrap__(self, result): class Combination(Covariance): def __init__(self, factor_list): input_dim = max( - [ - factor.input_dim - for factor in factor_list - if isinstance(factor, Covariance) - ] + [factor.input_dim for factor in factor_list if isinstance(factor, Covariance)] ) super().__init__(input_dim=input_dim) self.factor_list = [] @@ -205,10 +205,7 @@ class Exponentiated(Covariance): def __init__(self, kernel, power): self.kernel = kernel self.power = power - super().__init__( - input_dim=self.kernel.input_dim, - active_dims=self.kernel.active_dims - ) + super().__init__(input_dim=self.kernel.input_dim, active_dims=self.kernel.active_dims) def __call__(self, X, Xs=None, diag=False): return self.kernel(X, Xs, diag=diag) ** self.power @@ -247,9 +244,7 @@ def _split(self, X, Xs): def __call__(self, X, Xs=None, diag=False): X_split, Xs_split = self._split(X, Xs) - covs = [ - cov(x, xs, diag) for cov, x, xs in zip(self.factor_list, X_split, Xs_split) - ] + covs = [cov(x, xs, diag) for cov, x, xs in zip(self.factor_list, X_split, Xs_split)] return reduce(mul, covs) @@ -431,9 +426,7 @@ class Matern52(Stationary): def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) r = self.euclidean_dist(X, Xs) - return (1.0 + np.sqrt(5.0) * r + 5.0 / 3.0 * tt.square(r)) * tt.exp( - -1.0 * np.sqrt(5.0) * r - ) + return (1.0 + np.sqrt(5.0) * r + 5.0 / 3.0 * tt.square(r)) * tt.exp(-1.0 * np.sqrt(5.0) * r) class Matern32(Stationary): @@ -605,14 +598,10 @@ def __init__(self, input_dim, lengthscale_func, args=None, active_dims=None): super().__init__(input_dim, active_dims) if active_dims is not None: if len(active_dims) > 1: - raise NotImplementedError( - ("Higher dimensional inputs ", "are untested") - ) + raise NotImplementedError(("Higher dimensional inputs ", "are untested")) else: if input_dim != 1: - raise NotImplementedError( - ("Higher dimensional inputs ", "are untested") - ) + raise NotImplementedError(("Higher dimensional inputs ", "are 
untested")) if not callable(lengthscale_func): raise TypeError("lengthscale_func must be callable") self.lfunc = handle_args(lengthscale_func, args) @@ -642,9 +631,7 @@ def full(self, X, Xs=None): r2 = self.square_dist(X, Xs) rx2 = tt.reshape(tt.square(rx), (-1, 1)) rz2 = tt.reshape(tt.square(rz), (1, -1)) - return tt.sqrt((2.0 * tt.outer(rx, rz)) / (rx2 + rz2)) * tt.exp( - -1.0 * r2 / (rx2 + rz2) - ) + return tt.sqrt((2.0 * tt.outer(rx, rz)) / (rx2 + rz2)) * tt.exp(-1.0 * r2 / (rx2 + rz2)) def diag(self, X): return tt.alloc(1.0, X.shape[0]) @@ -734,9 +721,7 @@ def __init__(self, input_dim, W=None, kappa=None, B=None, active_dims=None): raise ValueError("Coregion requires exactly one dimension to be active") make_B = W is not None or kappa is not None if make_B and B is not None: - raise ValueError( - "Exactly one of (W, kappa) and B must be provided to Coregion" - ) + raise ValueError("Exactly one of (W, kappa) and B must be provided to Coregion") if make_B: self.W = tt.as_tensor_variable(W) self.kappa = tt.as_tensor_variable(kappa) @@ -744,9 +729,7 @@ def __init__(self, input_dim, W=None, kappa=None, B=None, active_dims=None): elif B is not None: self.B = tt.as_tensor_variable(B) else: - raise ValueError( - "Exactly one of (W, kappa) and B must be provided to Coregion" - ) + raise ValueError("Exactly one of (W, kappa) and B must be provided to Coregion") def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) diff --git a/pymc3/gp/mean.py b/pymc3/gp/mean.py index 33b1f7acfd..d2e93fdfe5 100644 --- a/pymc3/gp/mean.py +++ b/pymc3/gp/mean.py @@ -14,7 +14,7 @@ import theano.tensor as tt -__all__ = ['Zero', 'Constant', 'Linear'] +__all__ = ["Zero", "Constant", "Linear"] class Mean: @@ -48,6 +48,7 @@ class Zero(Mean): def __call__(self, X): return tt.alloc(0.0, X.shape[0]) + class Constant(Mean): R""" Constant mean function for Gaussian process. diff --git a/pymc3/gp/util.py b/pymc3/gp/util.py index cc2c73d8df..b35ffeac48 100644 --- a/pymc3/gp/util.py +++ b/pymc3/gp/util.py @@ -18,9 +18,9 @@ import warnings cholesky = tt.slinalg.cholesky -solve_lower = tt.slinalg.Solve(A_structure='lower_triangular') -solve_upper = tt.slinalg.Solve(A_structure='upper_triangular') -solve = tt.slinalg.Solve(A_structure='general') +solve_lower = tt.slinalg.Solve(A_structure="lower_triangular") +solve_upper = tt.slinalg.Solve(A_structure="upper_triangular") +solve = tt.slinalg.Solve(A_structure="general") def infer_shape(X, n_points=None): @@ -44,10 +44,12 @@ def kmeans_inducing_points(n_inducing, X): elif isinstance(X, (np.ndarray, tuple, list)): X = np.asarray(X) else: - raise TypeError("To use K-means initialization, " - "please provide X as a type that " - "can be cast to np.ndarray, instead " - "of {}".format(type(X))) + raise TypeError( + "To use K-means initialization, " + "please provide X as a type that " + "can be cast to np.ndarray, instead " + "of {}".format(type(X)) + ) scaling = np.std(X, 0) # if std of a column is very small (zero), don't normalize that column scaling[scaling <= 1e-6] = 1.0 @@ -58,35 +60,51 @@ def kmeans_inducing_points(n_inducing, X): def conditioned_vars(varnames): """ Decorator for validating attrs that are conditioned on. """ + def gp_wrapper(cls): def make_getter(name): def getter(self): value = getattr(self, name, None) if value is None: - raise AttributeError("'{}' not set. Provide as argument " - "to condition, or call 'prior' " - "first".format(name.lstrip("_"))) + raise AttributeError( + "'{}' not set. 
Provide as argument " + "to condition, or call 'prior' " + "first".format(name.lstrip("_")) + ) else: return value return getattr(self, name) + return getter def make_setter(name): def setter(self, val): setattr(self, name, val) + return setter for name in varnames: - getter = make_getter('_' + name) - setter = make_setter('_' + name) + getter = make_getter("_" + name) + setter = make_setter("_" + name) setattr(cls, name, property(getter, setter)) return cls + return gp_wrapper -def plot_gp_dist(ax, samples:np.ndarray, x:np.ndarray, plot_samples=True, palette="Reds", fill_alpha=0.8, samples_alpha=0.1, fill_kwargs=None, samples_kwargs=None): - """ A helper function for plotting 1D GP posteriors from trace - +def plot_gp_dist( + ax, + samples: np.ndarray, + x: np.ndarray, + plot_samples=True, + palette="Reds", + fill_alpha=0.8, + samples_alpha=0.1, + fill_kwargs=None, + samples_kwargs=None, +): + """A helper function for plotting 1D GP posteriors from trace + Parameters ---------- ax: axes @@ -95,7 +113,7 @@ def plot_gp_dist(ax, samples:np.ndarray, x:np.ndarray, plot_samples=True, palett Array of S posterior predictive sample from a GP. Expected shape: (S, X) x: numpy.ndarray - Grid of X values corresponding to the samples. + Grid of X values corresponding to the samples. Expected shape: (X,) or (X, 1), or (1, X) plot_samples: bool Plot the GP samples along with posterior (defaults True). @@ -123,9 +141,9 @@ def plot_gp_dist(ax, samples:np.ndarray, x:np.ndarray, plot_samples=True, palett samples_kwargs = {} if np.any(np.isnan(samples)): warnings.warn( - 'There are `nan` entries in the [samples] arguments. ' - 'The plot will not contain a band!', - UserWarning + "There are `nan` entries in the [samples] arguments. " + "The plot will not contain a band!", + UserWarning, ) cmap = plt.get_cmap(palette) @@ -135,13 +153,12 @@ def plot_gp_dist(ax, samples:np.ndarray, x:np.ndarray, plot_samples=True, palett x = x.flatten() for i, p in enumerate(percs[::-1]): upper = np.percentile(samples, p, axis=1) - lower = np.percentile(samples, 100-p, axis=1) + lower = np.percentile(samples, 100 - p, axis=1) color_val = colors[i] ax.fill_between(x, upper, lower, color=cmap(color_val), alpha=fill_alpha, **fill_kwargs) if plot_samples: # plot a few samples idx = np.random.randint(0, samples.shape[1], 30) - ax.plot(x, samples[:,idx], color=cmap(0.9), lw=1, alpha=samples_alpha, - **samples_kwargs) + ax.plot(x, samples[:, idx], color=cmap(0.9), lw=1, alpha=samples_alpha, **samples_kwargs) return ax diff --git a/pymc3/math.py b/pymc3/math.py index 770dbeebe2..dba3f938a6 100644 --- a/pymc3/math.py +++ b/pymc3/math.py @@ -382,9 +382,7 @@ class BlockDiagonalMatrix(Op): def __init__(self, sparse=False, format="csr"): if format not in ("csr", "csc"): - raise ValueError( - f"format must be one of: 'csr', 'csc', got {format}" - ) + raise ValueError(f"format must be one of: 'csr', 'csc', got {format}") self.sparse = sparse self.format = format @@ -395,9 +393,7 @@ def make_node(self, *matrices): if any(mat.type.ndim != 2 for mat in matrices): raise TypeError("all data arguments must be matrices") if self.sparse: - out_type = theano.sparse.matrix( - self.format, dtype=largest_common_dtype(matrices) - ) + out_type = theano.sparse.matrix(self.format, dtype=largest_common_dtype(matrices)) else: out_type = theano.tensor.matrix(dtype=largest_common_dtype(matrices)) return tt.Apply(self, matrices, [out_type]) diff --git a/pymc3/memoize.py b/pymc3/memoize.py index b43bca5b45..349d03188f 100644 --- a/pymc3/memoize.py +++ 
b/pymc3/memoize.py @@ -16,6 +16,7 @@ import pickle import collections from .util import biwrap + CACHE_REGISTRY = [] @@ -37,14 +38,15 @@ def memoizer(*args, **kwargs): else: # bound methods have self as first argument, remove it to compute key key = (hashable(args[1:]), hashable(kwargs)) - if not hasattr(args[0], '_cache'): - setattr(args[0], '_cache', collections.defaultdict(dict)) + if not hasattr(args[0], "_cache"): + setattr(args[0], "_cache", collections.defaultdict(dict)) # do not add to cache regestry - cache = getattr(args[0], '_cache')[obj.__name__] + cache = getattr(args[0], "_cache")[obj.__name__] if key not in cache: cache[key] = obj(*args, **kwargs) return cache[key] + return memoizer @@ -54,7 +56,7 @@ def clear_cache(obj=None): c.clear() else: if isinstance(obj, WithMemoization): - for v in getattr(obj, '_cache', {}).values(): + for v in getattr(obj, "_cache", {}).values(): v.clear() else: obj.cache.clear() @@ -66,7 +68,7 @@ def __hash__(self): def __getstate__(self): state = self.__dict__.copy() - state.pop('_cache', None) + state.pop("_cache", None) return state def __setstate__(self, state): @@ -87,7 +89,7 @@ def hashable(a): try: return hash(pickle.dumps(a)) except Exception: - if hasattr(a, '__dict__'): + if hasattr(a, "__dict__"): return hashable(a.__dict__) else: return id(a) diff --git a/pymc3/model_graph.py b/pymc3/model_graph.py index 93fc3a8aab..adede230ed 100644 --- a/pymc3/model_graph.py +++ b/pymc3/model_graph.py @@ -31,28 +31,30 @@ def __init__(self, model): self.model = model self.var_names = get_default_varnames(self.model.named_vars, include_transformed=False) self.var_list = self.model.named_vars.values() - self.transform_map = {v.transformed: v.name for v in self.var_list if hasattr(v, 'transformed')} + self.transform_map = { + v.transformed: v.name for v in self.var_list if hasattr(v, "transformed") + } self._deterministics = None def get_deterministics(self, var): """Compute the deterministic nodes of the graph, **not** including var itself.""" deterministics = [] - attrs = ('transformed', 'logpt') + attrs = ("transformed", "logpt") for v in self.var_list: if v != var and all(not hasattr(v, attr) for attr in attrs): deterministics.append(v) return deterministics def _get_ancestors(self, var: Tensor, func) -> Set[Tensor]: - """Get all ancestors of a function, doing some accounting for deterministics. - """ + """Get all ancestors of a function, doing some accounting for deterministics.""" # this contains all of the variables in the model EXCEPT var... 
vars = set(self.var_list) vars.remove(var) - blockers = set() # type: Set[Tensor] + blockers = set() # type: Set[Tensor] retval = set() # type: Set[Tensor] + def _expand(node) -> Optional[Iterator[Tensor]]: if node in blockers: return None @@ -66,14 +68,12 @@ def _expand(node) -> Optional[Iterator[Tensor]]: else: return None - stack_search(start = deque([func]), - expand=_expand, - mode='bfs') + stack_search(start=deque([func]), expand=_expand, mode="bfs") return retval def _filter_parents(self, var, parents) -> Set[VarName]: """Get direct parents of a var, as strings""" - keep = set() # type: Set[VarName] + keep = set() # type: Set[VarName] for p in parents: if p == var: continue @@ -83,14 +83,14 @@ def _filter_parents(self, var, parents) -> Set[VarName]: if self.transform_map[p] != var.name: keep.add(self.transform_map[p]) else: - raise AssertionError('Do not know what to do with {}'.format(get_var_name(p))) + raise AssertionError("Do not know what to do with {}".format(get_var_name(p))) return keep def get_parents(self, var: Tensor) -> Set[VarName]: """Get the named nodes that are direct inputs to the var""" - if hasattr(var, 'transformed'): + if hasattr(var, "transformed"): func = var.transformed.logpt - elif hasattr(var, 'logpt'): + elif hasattr(var, "logpt"): func = var.logpt else: func = var @@ -100,7 +100,8 @@ def get_parents(self, var: Tensor) -> Set[VarName]: def make_compute_graph(self) -> Dict[str, Set[VarName]]: """Get map of var_name -> set(input var names) for the model""" - input_map = {} # type: Dict[str, Set[VarName]] + input_map = {} # type: Dict[str, Set[VarName]] + def update_input_map(key: str, val: Set[VarName]): if key in input_map: input_map[key] = input_map[key].union(val) @@ -127,31 +128,29 @@ def _make_node(self, var_name, graph): # styling for node attrs = {} if isinstance(v, pm.model.ObservedRV): - attrs['style'] = 'filled' + attrs["style"] = "filled" # make Data be roundtangle, instead of rectangle if isinstance(v, SharedVariable): - attrs['style'] = 'rounded, filled' + attrs["style"] = "rounded, filled" # Get name for node if v in self.model.potentials: - distribution = 'Potential' - attrs['shape'] = 'octagon' - elif hasattr(v, 'distribution'): + distribution = "Potential" + attrs["shape"] = "octagon" + elif hasattr(v, "distribution"): distribution = v.distribution.__class__.__name__ elif isinstance(v, SharedVariable): - distribution = 'Data' - attrs['shape'] = 'box' + distribution = "Data" + attrs["shape"] = "box" else: - distribution = 'Deterministic' - attrs['shape'] = 'box' + distribution = "Deterministic" + attrs["shape"] = "box" - graph.node(var_name.replace(':', '&'), - f'{var_name}\n~\n{distribution}', - **attrs) + graph.node(var_name.replace(":", "&"), f"{var_name}\n~\n{distribution}", **attrs) def get_plates(self): - """ Rough but surprisingly accurate plate detection. + """Rough but surprisingly accurate plate detection. Just groups by the shape of the underlying distribution. Will be wrong if there are two plates with the same shape. @@ -163,14 +162,14 @@ def get_plates(self): plates = {} for var_name in self.var_names: v = self.model[var_name] - if hasattr(v, 'observations'): + if hasattr(v, "observations"): try: # To get shape of _observed_ data container `pm.Data` # (wrapper for theano.SharedVariable) we evaluate it. 
shape = tuple(v.observations.shape.eval()) except AttributeError: shape = v.observations.shape - elif hasattr(v, 'dshape'): + elif hasattr(v, "dshape"): shape = v.dshape else: shape = v.tag.test_value.shape @@ -191,28 +190,30 @@ def make_graph(self): try: import graphviz except ImportError: - raise ImportError('This function requires the python library graphviz, along with binaries. ' - 'The easiest way to install all of this is by running\n\n' - '\tconda install -c conda-forge python-graphviz') + raise ImportError( + "This function requires the python library graphviz, along with binaries. " + "The easiest way to install all of this is by running\n\n" + "\tconda install -c conda-forge python-graphviz" + ) graph = graphviz.Digraph(self.model.name) for shape, var_names in self.get_plates().items(): if isinstance(shape, SharedVariable): shape = shape.eval() - label = ' x '.join(map('{:,d}'.format, shape)) + label = " x ".join(map("{:,d}".format, shape)) if label: # must be preceded by 'cluster' to get a box around it - with graph.subgraph(name='cluster' + label) as sub: + with graph.subgraph(name="cluster" + label) as sub: for var_name in var_names: self._make_node(var_name, sub) # plate label goes bottom right - sub.attr(label=label, labeljust='r', labelloc='b', style='rounded') + sub.attr(label=label, labeljust="r", labelloc="b", style="rounded") else: for var_name in var_names: self._make_node(var_name, graph) for key, values in self.make_compute_graph().items(): for value in values: - graph.edge(value.replace(':', '&'), key.replace(':', '&')) + graph.edge(value.replace(":", "&"), key.replace(":", "&")) return graph diff --git a/pymc3/ode/ode.py b/pymc3/ode/ode.py index 21074e4822..da283ff39c 100644 --- a/pymc3/ode/ode.py +++ b/pymc3/ode/ode.py @@ -20,7 +20,7 @@ from ..ode import utils from ..exceptions import ShapeError, DtypeError -_log = logging.getLogger('pymc3') +_log = logging.getLogger("pymc3") floatX = theano.config.floatX @@ -48,24 +48,26 @@ class DifferentialEquation(theano.Op): Examples -------- - + .. 
code-block:: python def odefunc(y, t, p): #Logistic differential equation return p[0] * y[0] * (1 - y[0]) - + times = np.arange(0.5, 5, 0.5) ode_model = DifferentialEquation(func=odefunc, times=times, n_states=1, n_theta=1, t0=0) """ _itypes = [ - tt.TensorType(floatX, (False,)), # y0 as 1D floatX vector - tt.TensorType(floatX, (False,)) # theta as 1D floatX vector + tt.TensorType(floatX, (False,)), # y0 as 1D floatX vector + tt.TensorType(floatX, (False,)), # theta as 1D floatX vector ] _otypes = [ - tt.TensorType(floatX, (False, False)), # model states as floatX of shape (T, S) - tt.TensorType(floatX, (False, False, False)), # sensitivities as floatX of shape (T, S, len(y0) + len(theta)) + tt.TensorType(floatX, (False, False)), # model states as floatX of shape (T, S) + tt.TensorType( + floatX, (False, False, False) + ), # sensitivities as floatX of shape (T, S, len(y0) + len(theta)) ] __props__ = ("func", "times", "n_states", "n_theta", "t0") @@ -107,7 +109,7 @@ def _system(self, Y, t, p): p : array parameter vector (y0, theta) """ - dydt, ddt_dydp = self._augmented_func(Y[:self.n_states], t, p, Y[self.n_states:]) + dydt, ddt_dydp = self._augmented_func(Y[: self.n_states], t, p, Y[self.n_states :]) derivatives = np.concatenate([dydt, ddt_dydp]) return derivatives @@ -120,16 +122,16 @@ def _simulate(self, y0, theta): func=self._system, y0=s0, t=self._augmented_times, args=(np.concatenate([y0, theta]),) ).astype(floatX) # The solution - y = sol[1:, :self.n_states] + y = sol[1:, : self.n_states] # The sensitivities, reshaped to be a sequence of matrices - sens = sol[1:, self.n_states:].reshape(self.n_times, self.n_states, self.n_p) + sens = sol[1:, self.n_states :].reshape(self.n_times, self.n_states, self.n_p) return y, sens def make_node(self, y0, theta): inputs = (y0, theta) - _log.debug('make_node for inputs {}'.format(hash(inputs))) + _log.debug("make_node for inputs {}".format(hash(inputs))) states = self._otypes[0]() sens = self._otypes[1]() @@ -139,46 +141,65 @@ def make_node(self, y0, theta): def __call__(self, y0, theta, return_sens=False, **kwargs): if isinstance(y0, (list, tuple)) and not len(y0) == self.n_states: - raise ShapeError('Length of y0 is wrong.', actual=(len(y0),), expected=(self.n_states,)) + raise ShapeError("Length of y0 is wrong.", actual=(len(y0),), expected=(self.n_states,)) if isinstance(theta, (list, tuple)) and not len(theta) == self.n_theta: - raise ShapeError('Length of theta is wrong.', actual=(len(theta),), expected=(self.n_theta,)) - + raise ShapeError( + "Length of theta is wrong.", actual=(len(theta),), expected=(self.n_theta,) + ) + # convert inputs to tensors (and check their types) y0 = tt.cast(tt.unbroadcast(tt.as_tensor_variable(y0), 0), floatX) theta = tt.cast(tt.unbroadcast(tt.as_tensor_variable(theta), 0), floatX) inputs = [y0, theta] for i, (input_val, itype) in enumerate(zip(inputs, self._itypes)): if not input_val.type == itype: - raise ValueError(f'Input {i} of type {input_val.type} does not have the expected type of {itype}') - + raise ValueError( + f"Input {i} of type {input_val.type} does not have the expected type of {itype}" + ) + # use default implementation to prepare symbolic outputs (via make_node) states, sens = super(theano.Op, self).__call__(y0, theta, **kwargs) - if theano.config.compute_test_value != 'off': + if theano.config.compute_test_value != "off": # compute test values from input test values test_states, test_sens = self._simulate( - y0=self._get_test_value(y0), - theta=self._get_test_value(theta) + 
y0=self._get_test_value(y0), theta=self._get_test_value(theta) ) # check types of simulation result if not test_states.dtype == self._otypes[0].dtype: - raise DtypeError('Simulated states have the wrong type.', actual=test_states.dtype, expected=self._otypes[0].dtype) + raise DtypeError( + "Simulated states have the wrong type.", + actual=test_states.dtype, + expected=self._otypes[0].dtype, + ) if not test_sens.dtype == self._otypes[1].dtype: - raise DtypeError('Simulated sensitivities have the wrong type.', actual=test_sens.dtype, expected=self._otypes[1].dtype) + raise DtypeError( + "Simulated sensitivities have the wrong type.", + actual=test_sens.dtype, + expected=self._otypes[1].dtype, + ) # check shapes of simulation result expected_states_shape = (self.n_times, self.n_states) expected_sens_shape = (self.n_times, self.n_states, self.n_p) if not test_states.shape == expected_states_shape: - raise ShapeError('Simulated states have the wrong shape.', test_states.shape, expected_states_shape) + raise ShapeError( + "Simulated states have the wrong shape.", + test_states.shape, + expected_states_shape, + ) if not test_sens.shape == expected_sens_shape: - raise ShapeError('Simulated sensitivities have the wrong shape.', test_sens.shape, expected_sens_shape) + raise ShapeError( + "Simulated sensitivities have the wrong shape.", + test_sens.shape, + expected_sens_shape, + ) # attach results as test values to the outputs states.tag.test_value = test_states sens.tag.test_value = test_sens - + if return_sens: return states, sens return states @@ -194,25 +215,22 @@ def infer_shape(self, node, input_shapes): return output_shapes def grad(self, inputs, output_grads): - _log.debug('grad w.r.t. inputs {}'.format(hash(tuple(inputs)))) - + _log.debug("grad w.r.t. inputs {}".format(hash(tuple(inputs)))) + # fetch symbolic sensitivity output node from cache ihash = hash(tuple(inputs)) if ihash in self._output_sensitivities: sens = self._output_sensitivities[ihash] else: - _log.debug('No cached sensitivities found!') + _log.debug("No cached sensitivities found!") _, sens = self.__call__(*inputs, return_sens=True) ograds = output_grads[0] - + # for each parameter, multiply sensitivities with the output gradient and sum the result # sens is (n_times, n_states, n_p) # ograds is (n_times, n_states) - grads = [ - tt.sum(sens[:,:,p] * ograds) - for p in range(self.n_p) - ] - + grads = [tt.sum(sens[:, :, p] * ograds) for p in range(self.n_p)] + # return separate gradient tensors for y0 and theta inputs - result = tt.stack(grads[:self.n_states]), tt.stack(grads[self.n_states:]) + result = tt.stack(grads[: self.n_states]), tt.stack(grads[self.n_states :]) return result diff --git a/pymc3/ode/utils.py b/pymc3/ode/utils.py index 5e9e69d475..141c5503f1 100644 --- a/pymc3/ode/utils.py +++ b/pymc3/ode/utils.py @@ -18,43 +18,43 @@ def make_sens_ic(n_states, n_theta, floatX): - r""" - The sensitivity matrix will always have consistent form. (n_states, n_states + n_theta) + r""" + The sensitivity matrix will always have consistent form. (n_states, n_states + n_theta) - If the first n_states entries of the parameters vector in the simulate call - correspond to initial conditions of the system, - then the first n_states columns of the sensitivity matrix should form - an identity matrix. + If the first n_states entries of the parameters vector in the simulate call + correspond to initial conditions of the system, + then the first n_states columns of the sensitivity matrix should form + an identity matrix. 
- If the last n_theta entries of the parameters vector in the simulate call - correspond to ode paramaters, then the last n_theta columns in - the sensitivity matrix will be 0. + If the last n_theta entries of the parameters vector in the simulate call + correspond to ode paramaters, then the last n_theta columns in + the sensitivity matrix will be 0. - Parameters - ---------- - n_states : int - Number of state variables in the ODE - n_theta : int - Number of ODE parameters - floatX : str - dtype to be used for the array + Parameters + ---------- + n_states : int + Number of state variables in the ODE + n_theta : int + Number of ODE parameters + floatX : str + dtype to be used for the array - Returns - ------- - dydp : array - 1D-array of shape (n_states * (n_states + n_theta),), representing the initial condition of the sensitivities - """ + Returns + ------- + dydp : array + 1D-array of shape (n_states * (n_states + n_theta),), representing the initial condition of the sensitivities + """ - # Initialize the sensitivity matrix to be 0 everywhere - sens_matrix = np.zeros((n_states, n_states + n_theta), dtype=floatX) + # Initialize the sensitivity matrix to be 0 everywhere + sens_matrix = np.zeros((n_states, n_states + n_theta), dtype=floatX) - # Slip in the identity matrix in the appropirate place - sens_matrix[:,:n_states] = np.eye(n_states, dtype=floatX) + # Slip in the identity matrix in the appropirate place + sens_matrix[:, :n_states] = np.eye(n_states, dtype=floatX) - # We need the sensitivity matrix to be a vector (see augmented_function) - # Ravel and return - dydp = sens_matrix.ravel() - return dydp + # We need the sensitivity matrix to be a vector (see augmented_function) + # Ravel and return + dydp = sens_matrix.ravel() + return dydp def augment_system(ode_func, n_states, n_theta): @@ -83,21 +83,21 @@ def augment_system(ode_func, n_states, n_theta): """ # Present state of the system - t_y = tt.vector("y", dtype='float64') - t_y.tag.test_value = np.ones((n_states,), dtype='float64') + t_y = tt.vector("y", dtype="float64") + t_y.tag.test_value = np.ones((n_states,), dtype="float64") # Parameter(s). Should be vector to allow for generaliztion to multiparameter # systems of ODEs. Is m dimensional because it includes all initial conditions as well as ode parameters - t_p = tt.vector("p", dtype='float64') - t_p.tag.test_value = np.ones((n_states + n_theta,), dtype='float64') + t_p = tt.vector("p", dtype="float64") + t_p.tag.test_value = np.ones((n_states + n_theta,), dtype="float64") # Time. 
Allow for non-automonous systems of ODEs to be analyzed - t_t = tt.scalar("t", dtype='float64') + t_t = tt.scalar("t", dtype="float64") t_t.tag.test_value = 2.459 # Present state of the gradients: # Will always be 0 unless the parameter is the inital condition # Entry i,j is partial of y[i] wrt to p[j] - dydp_vec = tt.vector("dydp", dtype='float64') - dydp_vec.tag.test_value = make_sens_ic(n_states, n_theta, 'float64') + dydp_vec = tt.vector("dydp", dtype="float64") + dydp_vec.tag.test_value = make_sens_ic(n_states, n_theta, "float64") dydp = dydp_vec.reshape((n_states, n_states + n_theta)) @@ -119,9 +119,7 @@ def augment_system(ode_func, n_states, n_theta): ddt_dydp = (Jdfdy + grad_f).flatten() system = theano.function( - inputs=[t_y, t_t, t_p, dydp_vec], - outputs=[t_yhat, ddt_dydp], - on_unused_input="ignore" + inputs=[t_y, t_t, t_p, dydp_vec], outputs=[t_yhat, ddt_dydp], on_unused_input="ignore" ) return system diff --git a/pymc3/parallel_sampling.py b/pymc3/parallel_sampling.py index fb4b464b01..c1f13f9bbd 100644 --- a/pymc3/parallel_sampling.py +++ b/pymc3/parallel_sampling.py @@ -109,18 +109,16 @@ def _unpickle_step_method(self): "or forkserver." ) if self._step_method_is_pickled: - if self._pickle_backend == 'pickle': + if self._pickle_backend == "pickle": try: self._step_method = pickle.loads(self._step_method) except Exception: raise ValueError(unpickle_error) - elif self._pickle_backend == 'dill': + elif self._pickle_backend == "dill": try: import dill except ImportError: - raise ValueError( - "dill must be installed for pickle_backend='dill'." - ) + raise ValueError("dill must be installed for pickle_backend='dill'.") try: self._step_method = dill.loads(self._step_method) except Exception: @@ -206,9 +204,7 @@ def _start_loop(self): warns = self._collect_warnings() else: warns = None - self._msg_pipe.send( - ("writing_done", is_last, draw, tuning, stats, warns) - ) + self._msg_pipe.send(("writing_done", is_last, draw, tuning, stats, warns)) draw += 1 else: raise ValueError("Unknown message " + msg[0]) @@ -289,7 +285,7 @@ def __init__( tune, seed, pickle_backend, - ) + ), ) self._process.start() # Close the remote pipe, so that we get notified if the other @@ -318,11 +314,7 @@ def _send(self, msg, *args): if message is not None and message[0] == "error": warns, old_error = message[1:] if warns is not None: - error = ParallelSamplingError( - str(old_error), - self.chain, - warns - ) + error = ParallelSamplingError(str(old_error), self.chain, warns) else: error = RuntimeError("Chain %s failed." % self.chain) raise error from old_error @@ -387,8 +379,7 @@ def terminate_all(processes, patience=2): process.join(timeout) except multiprocessing.TimeoutError: logger.warn( - "Chain processes did not terminate as expected. " - "Terminating forcefully..." + "Chain processes did not terminate as expected. " "Terminating forcefully..." 
) for process in processes: process.terminate() @@ -396,9 +387,7 @@ def terminate_all(processes, patience=2): process.join() -Draw = namedtuple( - "Draw", ["chain", "is_last", "draw_idx", "tuning", "stats", "point", "warnings"] -) +Draw = namedtuple("Draw", ["chain", "is_last", "draw_idx", "tuning", "stats", "point", "warnings"]) class ParallelSampler: @@ -414,7 +403,7 @@ def __init__( start_chain_num: int = 0, progressbar: bool = True, mp_ctx=None, - pickle_backend: str = 'pickle', + pickle_backend: str = "pickle", ): if any(len(arg) != chains for arg in [seeds, start_points]): @@ -422,21 +411,19 @@ def __init__( if mp_ctx is None or isinstance(mp_ctx, str): # Closes issue https://github.com/pymc-devs/pymc3/issues/3849 - if platform.system() == 'Darwin': + if platform.system() == "Darwin": mp_ctx = "forkserver" mp_ctx = multiprocessing.get_context(mp_ctx) step_method_pickled = None - if mp_ctx.get_start_method() != 'fork': - if pickle_backend == 'pickle': + if mp_ctx.get_start_method() != "fork": + if pickle_backend == "pickle": step_method_pickled = pickle.dumps(step_method, protocol=-1) - elif pickle_backend == 'dill': + elif pickle_backend == "dill": try: import dill except ImportError: - raise ValueError( - "dill must be installed for pickle_backend='dill'." - ) + raise ValueError("dill must be installed for pickle_backend='dill'.") step_method_pickled = dill.dumps(step_method, protocol=-1) self._samplers = [ @@ -449,7 +436,7 @@ def __init__( seed, start, mp_ctx, - pickle_backend + pickle_backend, ) for chain, seed, start in zip(range(chains), seeds, start_points) ] @@ -468,9 +455,7 @@ def __init__( self._desc = "Sampling {0._chains:d} chains, {0._divergences:,d} divergences" self._chains = chains if progressbar: - self._progress = progress_bar( - range(chains * (draws + tune)), display=progressbar - ) + self._progress = progress_bar(range(chains * (draws + tune)), display=progressbar) self._progress.comment = self._desc.format(self) def _make_active(self): diff --git a/pymc3/plots/posteriorplot.py b/pymc3/plots/posteriorplot.py index 76e3b3743a..fc44e914c8 100644 --- a/pymc3/plots/posteriorplot.py +++ b/pymc3/plots/posteriorplot.py @@ -18,6 +18,7 @@ pass import numpy as np + def plot_posterior_predictive_glm(trace, eval=None, lm=None, samples=30, **kwargs): """Plot posterior predictive of a linear model. :Arguments: @@ -35,21 +36,21 @@ def plot_posterior_predictive_glm(trace, eval=None, lm=None, samples=30, **kwarg Additional keyword arguments are passed to pylab.plot(). 
""" if lm is None: - lm = lambda x, sample: sample['Intercept'] + sample['x'] * x + lm = lambda x, sample: sample["Intercept"] + sample["x"] * x if eval is None: eval = np.linspace(0, 1, 100) # Set default plotting arguments - if 'lw' not in kwargs and 'linewidth' not in kwargs: - kwargs['lw'] = .2 - if 'c' not in kwargs and 'color' not in kwargs: - kwargs['c'] = 'k' + if "lw" not in kwargs and "linewidth" not in kwargs: + kwargs["lw"] = 0.2 + if "c" not in kwargs and "color" not in kwargs: + kwargs["c"] = "k" for rand_loc in np.random.randint(0, len(trace), samples): rand_sample = trace[rand_loc] plt.plot(eval, lm(eval, rand_sample), **kwargs) - # Make sure to not plot label multiple times - kwargs.pop('label', None) + # Make sure to not plot label multiple times + kwargs.pop("label", None) - plt.title('Posterior predictive') + plt.title("Posterior predictive") diff --git a/pymc3/tests/test_mixture.py b/pymc3/tests/test_mixture.py index 308a1aa2e5..4dfd9fb2ca 100644 --- a/pymc3/tests/test_mixture.py +++ b/pymc3/tests/test_mixture.py @@ -18,8 +18,20 @@ from .helpers import SeededTest import pymc3 as pm -from pymc3 import Dirichlet, Gamma, Normal, Lognormal, Poisson, Exponential, \ - Mixture, NormalMixture, MvNormal, sample, Metropolis, Model +from pymc3 import ( + Dirichlet, + Gamma, + Normal, + Lognormal, + Poisson, + Exponential, + Mixture, + NormalMixture, + MvNormal, + sample, + Metropolis, + Model, +) import scipy.stats as st from scipy.special import logsumexp from pymc3.theanof import floatX @@ -58,12 +70,12 @@ def setup_class(cls): super().setup_class() cls.norm_w = np.array([0.75, 0.25]) - cls.norm_mu = np.array([0., 5.]) + cls.norm_mu = np.array([0.0, 5.0]) cls.norm_sd = np.ones_like(cls.norm_mu) cls.norm_x = generate_normal_mixture_data(cls.norm_w, cls.norm_mu, cls.norm_sd, size=1000) cls.pois_w = np.array([0.4, 0.6]) - cls.pois_mu = np.array([5., 20.]) + cls.pois_mu = np.array([5.0, 20.0]) cls.pois_x = generate_poisson_mixture_data(cls.pois_w, cls.pois_mu, size=1000) def test_dimensions(self): @@ -79,79 +91,68 @@ def test_dimensions(self): def test_mixture_list_of_normals(self): with Model() as model: - w = Dirichlet('w', floatX(np.ones_like(self.norm_w)), shape=self.norm_w.size) - mu = Normal('mu', 0., 10., shape=self.norm_w.size) - tau = Gamma('tau', 1., 1., shape=self.norm_w.size) - Mixture('x_obs', w, - [Normal.dist(mu[0], tau=tau[0]), Normal.dist(mu[1], tau=tau[1])], - observed=self.norm_x) + w = Dirichlet("w", floatX(np.ones_like(self.norm_w)), shape=self.norm_w.size) + mu = Normal("mu", 0.0, 10.0, shape=self.norm_w.size) + tau = Gamma("tau", 1.0, 1.0, shape=self.norm_w.size) + Mixture( + "x_obs", + w, + [Normal.dist(mu[0], tau=tau[0]), Normal.dist(mu[1], tau=tau[1])], + observed=self.norm_x, + ) step = Metropolis() - trace = sample(5000, step, random_seed=self.random_seed, - progressbar=False, chains=1) + trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1) - assert_allclose(np.sort(trace['w'].mean(axis=0)), - np.sort(self.norm_w), - rtol=0.1, atol=0.1) - assert_allclose(np.sort(trace['mu'].mean(axis=0)), - np.sort(self.norm_mu), - rtol=0.1, atol=0.1) + assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.norm_w), rtol=0.1, atol=0.1) + assert_allclose( + np.sort(trace["mu"].mean(axis=0)), np.sort(self.norm_mu), rtol=0.1, atol=0.1 + ) def test_normal_mixture(self): with Model() as model: - w = Dirichlet('w', floatX(np.ones_like(self.norm_w)), shape=self.norm_w.size) - mu = Normal('mu', 0., 10., shape=self.norm_w.size) - tau = 
Gamma('tau', 1., 1., shape=self.norm_w.size) - NormalMixture('x_obs', w, mu, tau=tau, observed=self.norm_x) + w = Dirichlet("w", floatX(np.ones_like(self.norm_w)), shape=self.norm_w.size) + mu = Normal("mu", 0.0, 10.0, shape=self.norm_w.size) + tau = Gamma("tau", 1.0, 1.0, shape=self.norm_w.size) + NormalMixture("x_obs", w, mu, tau=tau, observed=self.norm_x) step = Metropolis() - trace = sample(5000, step, random_seed=self.random_seed, - progressbar=False, chains=1) - - assert_allclose(np.sort(trace['w'].mean(axis=0)), - np.sort(self.norm_w), - rtol=0.1, atol=0.1) - assert_allclose(np.sort(trace['mu'].mean(axis=0)), - np.sort(self.norm_mu), - rtol=0.1, atol=0.1) - - @pytest.mark.parametrize('nd,ncomp', - [(tuple(), 5), - (1, 5), - (3, 5), - ((3, 3), 5), - (3, 3), - ((3, 3), 3)], - ids=str) + trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1) + + assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.norm_w), rtol=0.1, atol=0.1) + assert_allclose( + np.sort(trace["mu"].mean(axis=0)), np.sort(self.norm_mu), rtol=0.1, atol=0.1 + ) + + @pytest.mark.parametrize( + "nd,ncomp", [(tuple(), 5), (1, 5), (3, 5), ((3, 3), 5), (3, 3), ((3, 3), 3)], ids=str + ) def test_normal_mixture_nd(self, nd, ncomp): nd = to_tuple(nd) ncomp = int(ncomp) comp_shape = nd + (ncomp,) test_mus = np.random.randn(*comp_shape) test_taus = np.random.gamma(1, 1, size=comp_shape) - observed = generate_normal_mixture_data(w=np.ones(ncomp)/ncomp, - mu=test_mus, - sd=1/np.sqrt(test_taus), - size=10) + observed = generate_normal_mixture_data( + w=np.ones(ncomp) / ncomp, mu=test_mus, sd=1 / np.sqrt(test_taus), size=10 + ) with Model() as model0: - mus = Normal('mus', shape=comp_shape) - taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape) - ws = Dirichlet('ws', np.ones(ncomp), shape=(ncomp,)) - mixture0 = NormalMixture('m', w=ws, mu=mus, tau=taus, shape=nd, - comp_shape=comp_shape) - obs0 = NormalMixture('obs', w=ws, mu=mus, tau=taus, shape=nd, - comp_shape=comp_shape, - observed=observed) + mus = Normal("mus", shape=comp_shape) + taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) + ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) + mixture0 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd, comp_shape=comp_shape) + obs0 = NormalMixture( + "obs", w=ws, mu=mus, tau=taus, shape=nd, comp_shape=comp_shape, observed=observed + ) with Model() as model1: - mus = Normal('mus', shape=comp_shape) - taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape) - ws = Dirichlet('ws', np.ones(ncomp), shape=(ncomp,)) - comp_dist = [Normal.dist(mu=mus[..., i], tau=taus[..., i], - shape=nd) - for i in range(ncomp)] - mixture1 = Mixture('m', w=ws, comp_dists=comp_dist, shape=nd) - obs1 = Mixture('obs', w=ws, comp_dists=comp_dist, shape=nd, - observed=observed) + mus = Normal("mus", shape=comp_shape) + taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) + ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) + comp_dist = [ + Normal.dist(mu=mus[..., i], tau=taus[..., i], shape=nd) for i in range(ncomp) + ] + mixture1 = Mixture("m", w=ws, comp_dists=comp_dist, shape=nd) + obs1 = Mixture("obs", w=ws, comp_dists=comp_dist, shape=nd, observed=observed) with Model() as model2: # Expected to fail if comp_shape is not provided, @@ -161,21 +162,18 @@ def test_normal_mixture_nd(self, nd, ncomp): # Furthermore, the Mixture will also raise errors when the observed # data is multidimensional but it does not broadcast well with # comp_dists. 
-            mus = Normal('mus', shape=comp_shape)
-            taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape)
-            ws = Dirichlet('ws', np.ones(ncomp), shape=(ncomp,))
+            mus = Normal("mus", shape=comp_shape)
+            taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape)
+            ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,))
             if len(nd) > 1:
                 if nd[-1] != ncomp:
                     with pytest.raises(ValueError):
-                        NormalMixture('m', w=ws, mu=mus, tau=taus,
-                                      shape=nd)
+                        NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd)
                     mixture2 = None
                 else:
-                    mixture2 = NormalMixture('m', w=ws, mu=mus, tau=taus,
-                                             shape=nd)
+                    mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd)
             else:
-                mixture2 = NormalMixture('m', w=ws, mu=mus, tau=taus,
-                                         shape=nd)
+                mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd)
             observed_fails = False
             if len(nd) >= 1 and nd != (1,):
                 try:
@@ -184,18 +182,14 @@ def test_normal_mixture_nd(self, nd, ncomp):
                     observed_fails = True
             if observed_fails:
                 with pytest.raises(ValueError):
-                    NormalMixture('obs', w=ws, mu=mus, tau=taus,
-                                  shape=nd,
-                                  observed=observed)
+                    NormalMixture("obs", w=ws, mu=mus, tau=taus, shape=nd, observed=observed)
                 obs2 = None
             else:
-                obs2 = NormalMixture('obs', w=ws, mu=mus, tau=taus,
-                                     shape=nd,
-                                     observed=observed)
+                obs2 = NormalMixture("obs", w=ws, mu=mus, tau=taus, shape=nd, observed=observed)

         testpoint = model0.test_point
-        testpoint['mus'] = test_mus
-        testpoint['taus'] = test_taus
+        testpoint["mus"] = test_mus
+        testpoint["taus"] = test_taus
         assert_allclose(model0.logp(testpoint), model1.logp(testpoint))
         assert_allclose(mixture0.logp(testpoint), mixture1.logp(testpoint))
         assert_allclose(obs0.logp(testpoint), obs1.logp(testpoint))
@@ -208,74 +202,66 @@ def test_normal_mixture_nd(self, nd, ncomp):

     def test_poisson_mixture(self):
         with Model() as model:
-            w = Dirichlet('w', floatX(np.ones_like(self.pois_w)), shape=self.pois_w.shape)
-            mu = Gamma('mu', 1., 1., shape=self.pois_w.size)
-            Mixture('x_obs', w, Poisson.dist(mu), observed=self.pois_x)
+            w = Dirichlet("w", floatX(np.ones_like(self.pois_w)), shape=self.pois_w.shape)
+            mu = Gamma("mu", 1.0, 1.0, shape=self.pois_w.size)
+            Mixture("x_obs", w, Poisson.dist(mu), observed=self.pois_x)
             step = Metropolis()
-            trace = sample(5000, step, random_seed=self.random_seed,
-                           progressbar=False, chains=1)
+            trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1)

-        assert_allclose(np.sort(trace['w'].mean(axis=0)),
-                        np.sort(self.pois_w),
-                        rtol=0.1, atol=0.1)
-        assert_allclose(np.sort(trace['mu'].mean(axis=0)),
-                        np.sort(self.pois_mu),
-                        rtol=0.1, atol=0.1)
+        assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.pois_w), rtol=0.1, atol=0.1)
+        assert_allclose(
+            np.sort(trace["mu"].mean(axis=0)), np.sort(self.pois_mu), rtol=0.1, atol=0.1
+        )
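The Poisson test draws its data from the generate_poisson_mixture_data helper defined elsewhere in this file; its exact implementation is not shown here, but a plausible stand-in (assumed signature, illustration only) would be:

import numpy as np

def _poisson_mixture_data(w, mu, size=1000):
    # Pick a mixture component for every observation, then draw from that component's rate.
    component = np.random.choice(w.size, size=size, p=w)
    return np.random.poisson(mu[component], size=size)

pois_x = _poisson_mixture_data(np.array([0.4, 0.6]), np.array([5.0, 20.0]))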

     def test_mixture_list_of_poissons(self):
         with Model() as model:
-            w = Dirichlet('w', floatX(np.ones_like(self.pois_w)), shape=self.pois_w.shape)
-            mu = Gamma('mu', 1., 1., shape=self.pois_w.size)
-            Mixture('x_obs', w,
-                    [Poisson.dist(mu[0]), Poisson.dist(mu[1])],
-                    observed=self.pois_x)
+            w = Dirichlet("w", floatX(np.ones_like(self.pois_w)), shape=self.pois_w.shape)
+            mu = Gamma("mu", 1.0, 1.0, shape=self.pois_w.size)
+            Mixture("x_obs", w, [Poisson.dist(mu[0]), Poisson.dist(mu[1])], observed=self.pois_x)
             step = Metropolis()
-            trace = sample(5000, step, random_seed=self.random_seed,
-                           progressbar=False, chains=1)
+            trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1)

-        assert_allclose(np.sort(trace['w'].mean(axis=0)),
-                        np.sort(self.pois_w),
-                        rtol=0.1, atol=0.1)
-        assert_allclose(np.sort(trace['mu'].mean(axis=0)),
-                        np.sort(self.pois_mu),
-                        rtol=0.1, atol=0.1)
+        assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.pois_w), rtol=0.1, atol=0.1)
+        assert_allclose(
+            np.sort(trace["mu"].mean(axis=0)), np.sort(self.pois_mu), rtol=0.1, atol=0.1
+        )

     def test_mixture_of_mvn(self):
-        mu1 = np.asarray([0., 1.])
+        mu1 = np.asarray([0.0, 1.0])
         cov1 = np.diag([1.5, 2.5])
-        mu2 = np.asarray([1., 0.])
+        mu2 = np.asarray([1.0, 0.0])
         cov2 = np.diag([2.5, 3.5])
-        obs = np.asarray([[.5, .5], mu1, mu2])
+        obs = np.asarray([[0.5, 0.5], mu1, mu2])
         with Model() as model:
-            w = Dirichlet('w', floatX(np.ones(2)), transform=None, shape=(2,))
+            w = Dirichlet("w", floatX(np.ones(2)), transform=None, shape=(2,))
             mvncomp1 = MvNormal.dist(mu=mu1, cov=cov1)
             mvncomp2 = MvNormal.dist(mu=mu2, cov=cov2)
-            y = Mixture('x_obs', w, [mvncomp1, mvncomp2],
-                        observed=obs)
+            y = Mixture("x_obs", w, [mvncomp1, mvncomp2], observed=obs)

         # check logp of each component
-        complogp_st = np.vstack((st.multivariate_normal.logpdf(obs, mu1, cov1),
-                                 st.multivariate_normal.logpdf(obs, mu2, cov2))
-                                ).T
+        complogp_st = np.vstack(
+            (
+                st.multivariate_normal.logpdf(obs, mu1, cov1),
+                st.multivariate_normal.logpdf(obs, mu2, cov2),
+            )
+        ).T
         complogp = y.distribution._comp_logp(theano.shared(obs)).eval()
         assert_allclose(complogp, complogp_st)

         # check logp of mixture
         testpoint = model.test_point
-        mixlogp_st = logsumexp(np.log(testpoint['w']) + complogp_st,
-                               axis=-1, keepdims=False)
-        assert_allclose(y.logp_elemwise(testpoint),
-                        mixlogp_st)
+        mixlogp_st = logsumexp(np.log(testpoint["w"]) + complogp_st, axis=-1, keepdims=False)
+        assert_allclose(y.logp_elemwise(testpoint), mixlogp_st)

         # check logp of model
-        priorlogp = st.dirichlet.logpdf(x=testpoint['w'],
-                                        alpha=np.ones(2),
-                                        )
-        assert_allclose(model.logp(testpoint),
-                        mixlogp_st.sum() + priorlogp)
+        priorlogp = st.dirichlet.logpdf(
+            x=testpoint["w"],
+            alpha=np.ones(2),
+        )
+        assert_allclose(model.logp(testpoint), mixlogp_st.sum() + priorlogp)
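The reference value that test_mixture_of_mvn checks against is the usual mixture identity, log p(x) = logsumexp_k(log w_k + log p_k(x)). A stand-alone version of that computation (illustrative weights; the test takes w from model.test_point) might look roughly like:

import numpy as np
import scipy.stats as st
from scipy.special import logsumexp

obs = np.array([[0.5, 0.5], [0.0, 1.0], [1.0, 0.0]])
w = np.array([0.5, 0.5])  # illustrative mixture weights

# Per-observation log-density under each multivariate normal component.
complogp = np.vstack(
    (
        st.multivariate_normal.logpdf(obs, [0.0, 1.0], np.diag([1.5, 2.5])),
        st.multivariate_normal.logpdf(obs, [1.0, 0.0], np.diag([2.5, 3.5])),
    )
).T  # shape (n_obs, n_components)

# Mixture log-density per observation.
mixlogp = logsumexp(np.log(w) + complogp, axis=-1)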

     def test_mixture_of_mixture(self):
-        if theano.config.floatX == 'float32':
+        if theano.config.floatX == "float32":
             rtol = 1e-4
         else:
             rtol = 1e-7
@@ -283,79 +269,70 @@ def test_mixture_of_mixture(self):
         with Model() as model:
             # mixtures components
             g_comp = Normal.dist(
-                mu=Exponential('mu_g', lam=1.0, shape=nbr, transform=None),
-                sigma=1,
-                shape=nbr)
+                mu=Exponential("mu_g", lam=1.0, shape=nbr, transform=None), sigma=1, shape=nbr
+            )
             l_comp = Lognormal.dist(
-                mu=Exponential('mu_l', lam=1.0, shape=nbr, transform=None),
-                sigma=1,
-                shape=nbr)
+                mu=Exponential("mu_l", lam=1.0, shape=nbr, transform=None), sigma=1, shape=nbr
+            )
             # weight vector for the mixtures
-            g_w = Dirichlet('g_w', a=floatX(np.ones(nbr)*0.0000001), transform=None, shape=(nbr,))
-            l_w = Dirichlet('l_w', a=floatX(np.ones(nbr)*0.0000001), transform=None, shape=(nbr,))
+            g_w = Dirichlet("g_w", a=floatX(np.ones(nbr) * 0.0000001), transform=None, shape=(nbr,))
+            l_w = Dirichlet("l_w", a=floatX(np.ones(nbr) * 0.0000001), transform=None, shape=(nbr,))
             # mixture components
             g_mix = Mixture.dist(w=g_w, comp_dists=g_comp)
             l_mix = Mixture.dist(w=l_w, comp_dists=l_comp)
             # mixture of mixtures
-            mix_w = Dirichlet('mix_w', a=floatX(np.ones(2)), transform=None, shape=(2,))
-            mix = Mixture('mix', w=mix_w,
-                          comp_dists=[g_mix, l_mix],
-                          observed=np.exp(self.norm_x))
+            mix_w = Dirichlet("mix_w", a=floatX(np.ones(2)), transform=None, shape=(2,))
+            mix = Mixture("mix", w=mix_w, comp_dists=[g_mix, l_mix], observed=np.exp(self.norm_x))
         test_point = model.test_point

         def mixmixlogp(value, point):
             floatX = theano.config.floatX
-            priorlogp = st.dirichlet.logpdf(x=point['g_w'],
-                                            alpha=np.ones(nbr)*0.0000001,
-                                            ).astype(floatX) + \
-                        st.expon.logpdf(x=point['mu_g']).sum(dtype=floatX) + \
-                        st.dirichlet.logpdf(x=point['l_w'],
-                                            alpha=np.ones(nbr)*0.0000001,
-                                            ).astype(floatX) + \
-                        st.expon.logpdf(x=point['mu_l']).sum(dtype=floatX) + \
-                        st.dirichlet.logpdf(x=point['mix_w'],
-                                            alpha=np.ones(2),
-                                            ).astype(floatX)
-            complogp1 = st.norm.logpdf(x=value,
-                                       loc=point['mu_g']).astype(floatX)
-            mixlogp1 = logsumexp(np.log(point['g_w']).astype(floatX) +
-                                 complogp1,
-                                 axis=-1, keepdims=True)
-            complogp2 = st.lognorm.logpdf(value,
-                                          1.,
-                                          0.,
-                                          np.exp(point['mu_l'])).astype(floatX)
-            mixlogp2 = logsumexp(np.log(point['l_w']).astype(floatX) +
-                                 complogp2,
-                                 axis=-1, keepdims=True)
+            priorlogp = (
+                st.dirichlet.logpdf(
+                    x=point["g_w"],
+                    alpha=np.ones(nbr) * 0.0000001,
+                ).astype(floatX)
+                + st.expon.logpdf(x=point["mu_g"]).sum(dtype=floatX)
+                + st.dirichlet.logpdf(
+                    x=point["l_w"],
+                    alpha=np.ones(nbr) * 0.0000001,
+                ).astype(floatX)
+                + st.expon.logpdf(x=point["mu_l"]).sum(dtype=floatX)
+                + st.dirichlet.logpdf(
+                    x=point["mix_w"],
+                    alpha=np.ones(2),
+                ).astype(floatX)
+            )
+            complogp1 = st.norm.logpdf(x=value, loc=point["mu_g"]).astype(floatX)
+            mixlogp1 = logsumexp(
+                np.log(point["g_w"]).astype(floatX) + complogp1, axis=-1, keepdims=True
+            )
+            complogp2 = st.lognorm.logpdf(value, 1.0, 0.0, np.exp(point["mu_l"])).astype(floatX)
+            mixlogp2 = logsumexp(
+                np.log(point["l_w"]).astype(floatX) + complogp2, axis=-1, keepdims=True
+            )
             complogp_mix = np.concatenate((mixlogp1, mixlogp2), axis=1)
-            mixmixlogpg = logsumexp(np.log(point['mix_w']).astype(floatX) +
-                                    complogp_mix,
-                                    axis=-1, keepdims=False)
+            mixmixlogpg = logsumexp(
+                np.log(point["mix_w"]).astype(floatX) + complogp_mix, axis=-1, keepdims=False
+            )
             return priorlogp, mixmixlogpg

         value = np.exp(self.norm_x)[:, None]
         priorlogp, mixmixlogpg = mixmixlogp(value, test_point)

         # check logp of mixture
-        assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point),
-                        rtol=rtol)
+        assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point), rtol=rtol)

         # check model logp
-        assert_allclose(priorlogp + mixmixlogpg.sum(),
-                        model.logp(test_point),
-                        rtol=rtol)
+        assert_allclose(priorlogp + mixmixlogpg.sum(), model.logp(test_point), rtol=rtol)

         # check input and check logp again
-        test_point['g_w'] = np.asarray([.1, .1, .2, .6])
-        test_point['mu_g'] = np.exp(np.random.randn(nbr))
+        test_point["g_w"] = np.asarray([0.1, 0.1, 0.2, 0.6])
+        test_point["mu_g"] = np.exp(np.random.randn(nbr))
         priorlogp, mixmixlogpg = mixmixlogp(value, test_point)
-        assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point),
-                        rtol=rtol)
-        assert_allclose(priorlogp + mixmixlogpg.sum(),
-                        model.logp(test_point),
-                        rtol=rtol)
+        assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point), rtol=rtol)
+        assert_allclose(priorlogp + mixmixlogpg.sum(), model.logp(test_point), rtol=rtol)

     def test_sample_prior_and_posterior(self):
         def build_toy_dataset(N, K):
@@ -366,8 +343,7 @@ def build_toy_dataset(N, K):
             y = np.zeros((N,), dtype=np.int)
             for n in range(N):
                 k = np.argmax(np.random.multinomial(1, pi))
-                x[n, :] = np.random.multivariate_normal(mus[k],
-                                                        np.diag(stds[k]))
+                x[n, :] = np.random.multivariate_normal(mus[k], np.diag(stds[k]))
                 y[n] = k
             return x, y
@@ -378,25 +354,23 @@ def build_toy_dataset(N, K):
         X, y = build_toy_dataset(N, K)

         with pm.Model() as model:
-            pi = pm.Dirichlet('pi', np.ones(K), shape=(K,))
+            pi = pm.Dirichlet("pi", np.ones(K), shape=(K,))
             comp_dist = []
             mu = []
             packed_chol = []
             chol = []
             for i in range(K):
-                mu.append(pm.Normal('mu%i' % i, 0, 10, shape=D))
+                mu.append(pm.Normal("mu%i" % i, 0, 10, shape=D))
                 packed_chol.append(
-                    pm.LKJCholeskyCov('chol_cov_%i' % i,
-                                      eta=2,
-                                      n=D,
-                                      sd_dist=pm.HalfNormal.dist(2.5))
+                    pm.LKJCholeskyCov(
+                        "chol_cov_%i" % i, eta=2, n=D, sd_dist=pm.HalfNormal.dist(2.5)
+                    )
                 )
-                chol.append(pm.expand_packed_triangular(D, packed_chol[i],
-                                                        lower=True))
+                chol.append(pm.expand_packed_triangular(D, packed_chol[i], lower=True))
                 comp_dist.append(pm.MvNormal.dist(mu=mu[i], chol=chol[i]))

-            pm.Mixture('x_obs', pi, comp_dist, observed=X)
+            pm.Mixture("x_obs", pi, comp_dist, observed=X)
         with model:
             trace = pm.sample(30, tune=10, chains=1)

@@ -404,10 +378,10 @@ def build_toy_dataset(N, K):
         with model:
             ppc = pm.sample_posterior_predictive(trace, n_samples)
             prior = pm.sample_prior_predictive(samples=n_samples)
-        assert ppc['x_obs'].shape == (n_samples,) + X.shape
-        assert prior['x_obs'].shape == (n_samples,) + X.shape
-        assert prior['mu0'].shape == (n_samples, D)
-        assert prior['chol_cov_0'].shape == (n_samples, D * (D + 1) // 2)
+        assert ppc["x_obs"].shape == (n_samples,) + X.shape
+        assert prior["x_obs"].shape == (n_samples,) + X.shape
+        assert prior["mu0"].shape == (n_samples, D)
+        assert prior["chol_cov_0"].shape == (n_samples, D * (D + 1) // 2)


 class TestMixtureVsLatent(SeededTest):
@@ -419,9 +393,15 @@ def setup_method(self, *args, **kwargs):
             np.tile(
                 np.reshape(
                     np.arange(self.npop),
-                    (1, -1,)
+                    (
+                        1,
+                        -1,
+                    ),
+                ),
+                (
+                    self.nd,
+                    1,
                 ),
-                (self.nd, 1,)
             )
         )
@@ -431,17 +411,11 @@ def test_1d_w(self):
         mus = self.mus
         size = 100
         with pm.Model() as model:
-            m = pm.NormalMixture('m',
-                                 w=np.ones(npop) / npop,
-                                 mu=mus,
-                                 sigma=1e-5,
-                                 comp_shape=(nd, npop),
-                                 shape=nd)
-            z = pm.Categorical('z', p=np.ones(npop) / npop)
-            latent_m = pm.Normal('latent_m',
-                                 mu=mus[..., z],
-                                 sigma=1e-5,
-                                 shape=nd)
+            m = pm.NormalMixture(
+                "m", w=np.ones(npop) / npop, mu=mus, sigma=1e-5, comp_shape=(nd, npop), shape=nd
+            )
+            z = pm.Categorical("z", p=np.ones(npop) / npop)
+            latent_m = pm.Normal("latent_m", mu=mus[..., z], sigma=1e-5, shape=nd)

         m_val = m.random(size=size)
         latent_m_val = latent_m.random(size=size)
@@ -460,18 +434,17 @@ def test_2d_w(self):
         mus = self.mus
         size = 100
         with pm.Model() as model:
-            m = pm.NormalMixture('m',
-                                 w=np.ones((nd, npop)) / npop,
-                                 mu=mus,
-                                 sigma=1e-5,
-                                 comp_shape=(nd, npop),
-                                 shape=nd)
-            z = pm.Categorical('z', p=np.ones(npop) / npop, shape=nd)
+            m = pm.NormalMixture(
+                "m",
+                w=np.ones((nd, npop)) / npop,
+                mu=mus,
+                sigma=1e-5,
+                comp_shape=(nd, npop),
+                shape=nd,
+            )
+            z = pm.Categorical("z", p=np.ones(npop) / npop, shape=nd)
             mu = tt.as_tensor_variable([mus[i, z[i]] for i in range(nd)])
-            latent_m = pm.Normal('latent_m',
-                                 mu=mu,
-                                 sigma=1e-5,
-                                 shape=nd)
+            latent_m = pm.Normal("latent_m", mu=mu, sigma=1e-5, shape=nd)

         m_val = m.random(size=size)
         latent_m_val = latent_m.random(size=size)
@@ -489,32 +462,30 @@ def samples_from_same_distribution(self, *args):
         _, p_marginal = st.ks_2samp(*[s.flatten() for s in args])
         # Test if correlations within non independent draws match
         _, p_correlation = st.ks_2samp(
-            *[np.array([np.corrcoef(ss) for ss in s]).flatten()
-              for s in args]
+            *[np.array([np.corrcoef(ss) for ss in s]).flatten() for s in args]
         )
         assert p_marginal >= 0.05 and p_correlation >= 0.05
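The samples_from_same_distribution helper boils down to two two-sample Kolmogorov-Smirnov tests, one on the marginal draws and one on the within-draw correlations. Roughly, with made-up arrays standing in for the mixture and latent-variable samples:

import numpy as np
import scipy.stats as st

# Stand-ins for draws from the Mixture and from the equivalent latent-index model
# (100 draws, each of shape (3, 50)); real draws come from .random(size=...).
a = np.random.normal(size=(100, 3, 50))
b = np.random.normal(size=(100, 3, 50))

_, p_marginal = st.ks_2samp(a.flatten(), b.flatten())
_, p_correlation = st.ks_2samp(
    np.array([np.corrcoef(s) for s in a]).flatten(),
    np.array([np.corrcoef(s) for s in b]).flatten(),
)
# When both sets really come from the same distribution, both p-values
# should clear the 0.05 threshold most of the time.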
"float32": rtol = 1e-4 else: rtol = 1e-7 test_point = model.test_point - test_point['latent_m'] = test_point['m'] + test_point["latent_m"] = test_point["m"] mix_logp = mixture.logp(test_point) logps = [] for component in range(npop): - test_point['z'] = component * np.ones(z.distribution.shape) + test_point["z"] = component * np.ones(z.distribution.shape) # Count the number of axes that should be broadcasted from z to # modify the logp - sh1 = test_point['z'].shape - sh2 = test_point['latent_m'].shape + sh1 = test_point["z"].shape + sh2 = test_point["latent_m"].shape if len(sh1) > len(sh2): sh2 = (1,) * (len(sh1) - len(sh2)) + sh2 elif len(sh2) > len(sh1): sh1 = (1,) * (len(sh2) - len(sh1)) + sh1 - reps = np.prod([s2 if s1 != s2 else 1 for s1, s2 in - zip(sh1, sh2)]) + reps = np.prod([s2 if s1 != s2 else 1 for s1, s2 in zip(sh1, sh2)]) z_logp = z.logp(test_point) * reps logps.append(z_logp + latent_mix.logp(test_point)) latent_mix_logp = logsumexp(np.array(logps), axis=0)