From 47021d4c096cc7896379667c6534c9f09d0a141b Mon Sep 17 00:00:00 2001 From: Ricardo Date: Thu, 17 Dec 2020 19:42:36 +0100 Subject: [PATCH 1/9] Implement logcdf method for discrete distributions --- pymc3/distributions/discrete.py | 364 +++++++++++++++++++++++++++--- pymc3/tests/test_distributions.py | 211 ++++++++++++++++- 2 files changed, 532 insertions(+), 43 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index e639ba6684..786a55b27d 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -24,6 +24,7 @@ binomln, bound, factln, + incomplete_beta, log_diff_normal_cdf, logpow, normal_lccdf, @@ -32,7 +33,7 @@ ) from pymc3.distributions.distribution import Discrete, draw_values, generate_samples from pymc3.distributions.shape_utils import broadcast_distribution_samples -from pymc3.math import log1pexp, logaddexp, logit, sigmoid, tround +from pymc3.math import log1mexp, log1pexp, logaddexp, logit, logsumexp, sigmoid, tround from pymc3.theanof import floatX, intX, take_along_axis __all__ = [ @@ -148,6 +149,36 @@ def logp(self, value): p <= 1, ) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for Binomial distribution + at the specified value. + + Parameters + ---------- + value: numeric + Value for which log CDF is calculated. + + Returns + ------- + TensorVariable + """ + # TODO: work with multiple values (see #4342) + n = self.n + p = self.p + value = tt.floor(value) + + return tt.switch( + tt.lt(value, n), + bound( + tt.log(incomplete_beta(n - value, value + 1, 1 - p)), + n > 0, + 0 <= p, + p <= 1, + ), + 0, + ) + class BetaBinomial(Discrete): R""" @@ -271,16 +302,40 @@ def logp(self, value): """ alpha = self.alpha beta = self.beta + n = self.n return bound( - binomln(self.n, value) - + betaln(value + alpha, self.n - value + beta) - - betaln(alpha, beta), + binomln(n, value) + betaln(value + alpha, n - value + beta) - betaln(alpha, beta), value >= 0, - value <= self.n, + value <= n, alpha > 0, beta > 0, ) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for BetaBinomial distribution + at the specified value. + + Parameters + ---------- + value: numeric + Value for which log CDF is calculated. + + Returns + ------- + TensorVariable + """ + # TODO: fix for multiple values + n = self.n + + return tt.switch( + tt.lt(value, 0), + -np.inf, + tt.switch( + tt.lt(value, n), logsumexp(self.logp(tt.arange(0, value + 1)), keepdims=False), 0 + ), + ) + class Bernoulli(Discrete): R"""Bernoulli log-likelihood @@ -380,6 +435,31 @@ def logp(self, value): tt.switch(value, tt.log(p), tt.log(1 - p)), value >= 0, value <= 1, p >= 0, p <= 1 ) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for Bernoulli distribution + at the specified value. + Parameters + ---------- + value: numeric + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or theano tensor. + Returns + ------- + TensorVariable + """ + p = self.p + + return tt.switch( + tt.lt(value, 0), + -np.inf, + tt.switch( + tt.lt(value, 1), + tt.log(1 - p), + 0, + ), + ) + def _distr_parameters_for_repr(self): return ["p"] @@ -426,36 +506,11 @@ def DiscreteWeibull(q, b, x): def __init__(self, q, beta, *args, **kwargs): super().__init__(*args, defaults=("median",), **kwargs) - self.q = q = tt.as_tensor_variable(floatX(q)) - self.beta = beta = tt.as_tensor_variable(floatX(beta)) + self.q = tt.as_tensor_variable(floatX(q)) + self.beta = tt.as_tensor_variable(floatX(beta)) self.median = self._ppf(0.5) - def logp(self, value): - r""" - Calculate log-probability of DiscreteWeibull distribution at specified value. - - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor - - Returns - ------- - TensorVariable - """ - q = self.q - beta = self.beta - - return bound( - tt.log(tt.power(q, tt.power(value, beta)) - tt.power(q, tt.power(value + 1, beta))), - 0 <= value, - 0 < q, - q < 1, - 0 < beta, - ) - def _ppf(self, p): r""" The percentile point function (the inverse of the cumulative @@ -492,6 +547,57 @@ def random(self, point=None, size=None): return generate_samples(self._random, q, beta, dist_shape=self.shape, size=size) + def logp(self, value): + r""" + Calculate log-probability of DiscreteWeibull distribution at specified value. + + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or theano tensor + + Returns + ------- + TensorVariable + """ + q = self.q + beta = self.beta + return bound( + tt.log(tt.power(q, tt.power(value, beta)) - tt.power(q, tt.power(value + 1, beta))), + 0 <= value, + 0 < q, + q < 1, + 0 < beta, + ) + + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for Discrete Weibull distribution + at the specified value. + Parameters + ---------- + value: numeric + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or theano tensor. + Returns + ------- + TensorVariable + """ + q = self.q + beta = self.beta + + return tt.switch( + tt.lt(value, 0), + -np.inf, + bound( + tt.log(1 - tt.power(q, tt.power(value + 1, beta))), + 0 < q, + q < 1, + 0 < beta, + ), + ) + class Poisson(Discrete): R""" @@ -581,6 +687,28 @@ def logp(self, value): # Return zero when mu and value are both zero return tt.switch(tt.eq(mu, 0) * tt.eq(value, 0), 0, log_prob) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for Poisson distribution + at the specified value. + Parameters + ---------- + value: numeric + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or theano tensor. + Returns + ------- + TensorVariable + """ + mu = self.mu + value = tt.floor(value) + + return bound( + tt.log(tt.gammaincc(value + 1, mu)), + value >= 0, + mu >= 0, + ) + class NegativeBinomial(Discrete): R""" @@ -737,6 +865,32 @@ def logp(self, value): # Return Poisson when alpha gets very large. return tt.switch(tt.gt(alpha, 1e10), Poisson.dist(self.mu).logp(value), negbinom) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for NegativeBinomial distribution + at the specified value. + + Parameters + ---------- + value: numeric + Value for which log CDF is calculated. + + Returns + ------- + TensorVariable + """ + # TODO: work with multiple values (see #4342) + # TODO: avoid `p` recomputation if distribution was defined in terms of `p` + alpha = self.alpha + p = alpha / (self.mu + alpha) + + return bound( + tt.log(incomplete_beta(alpha, tt.floor(value) + 1, p)), + alpha > 0, + 0 <= p, + p <= 1, + ) + def _distr_parameters_for_repr(self): return self._param_type @@ -820,6 +974,28 @@ def logp(self, value): p = self.p return bound(tt.log(p) + logpow(1 - p, value - 1), 0 <= p, p <= 1, value >= 1) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for Geometric distribution + at the specified value. + Parameters + ---------- + value: numeric + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or theano tensor. + Returns + ------- + TensorVariable + """ + p = self.p + + return bound( + log1mexp(-tt.log1p(-p) * value), + value >= 0, + 0 <= p, + p <= 1, + ) + class HyperGeometric(Discrete): R""" @@ -935,6 +1111,40 @@ def logp(self, value): upper = tt.switch(tt.lt(k, n), k, n) return bound(result, lower <= value, value <= upper) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for HyperGeometric distribution + at the specified value. + Parameters + ---------- + value: numeric + Value for which log CDF is calculated. + Returns + ------- + TensorVariable + """ + # TODO: fix for multiple values + N = self.N + n = self.n + k = self.k + + return tt.switch( + tt.lt(value, 0), + -np.inf, + tt.switch( + tt.lt(value, n), + logsumexp(self.logp(tt.arange(0, value + 1)), keepdims=False), + bound( + 0, + N > 0, + k >= 0, + n >= 0, + k <= N, + n <= N, + ), + ), + ) + class DiscreteUniform(Discrete): R""" @@ -1025,6 +1235,24 @@ def logp(self, value): lower = self.lower return bound(-tt.log(upper - lower + 1), lower <= value, value <= upper) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for Discrete uniform distribution + at the specified value. + Parameters + ---------- + value: numeric + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or theano tensor. + Returns + ------- + TensorVariable + """ + upper = self.upper + lower = self.lower + + return tt.log(tt.minimum(tt.floor(value), upper) - lower + 1) - tt.log(upper - lower + 1) + class Categorical(Discrete): R""" @@ -1263,7 +1491,7 @@ class ZeroInflatedPoisson(Discrete): def __init__(self, psi, theta, *args, **kwargs): super().__init__(*args, **kwargs) self.theta = theta = tt.as_tensor_variable(floatX(theta)) - self.psi = psi = tt.as_tensor_variable(floatX(psi)) + self.psi = tt.as_tensor_variable(floatX(psi)) self.pois = Poisson.dist(theta) self.mode = self.pois.mode @@ -1314,6 +1542,28 @@ def logp(self, value): return bound(logp_val, 0 <= value, 0 <= psi, psi <= 1, 0 <= theta) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for ZeroInflatedPoisson distribution + at the specified value. + Parameters + ---------- + value: numeric + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or theano tensor. + Returns + ------- + TensorVariable + """ + psi = self.psi + + return bound( + tt.log(1 - psi + psi * tt.exp(self.pois.logcdf(value))), + 0 <= value, + 0 <= psi, + psi <= 1, + ) + class ZeroInflatedBinomial(Discrete): R""" @@ -1422,6 +1672,30 @@ def logp(self, value): return bound(logp_val, 0 <= value, value <= n, 0 <= psi, psi <= 1, 0 <= p, p <= 1) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for ZeroInflatedBinomial distribution + at the specified value. + + Parameters + ---------- + value: numeric + Value for which log CDF is calculated. + + Returns + ------- + TensorVariable + """ + # TODO: work with multiple values (see #4342) + psi = self.psi + + return bound( + tt.log(1 - psi + psi * tt.exp(self.bin.logcdf(value))), + 0 <= value, + 0 <= psi, + psi <= 1, + ) + class ZeroInflatedNegativeBinomial(Discrete): R""" @@ -1561,6 +1835,30 @@ def logp(self, value): return bound(logp_val, 0 <= value, 0 <= psi, psi <= 1, mu > 0, alpha > 0) + def logcdf(self, value): + """ + Compute the log of the cumulative distribution function for ZeroInflatedNegativeBinomial distribution + at the specified value. + + Parameters + ---------- + value: numeric + Value for which log CDF is calculated. + + Returns + ------- + TensorVariable + """ + # TODO: work with multiple values (see #4342) + psi = self.psi + + return bound( + tt.log(1 - psi + psi * tt.exp(self.nb.logcdf(value))), + 0 <= value, + 0 <= psi, + psi <= 1, + ) + class OrderedLogistic(Categorical): R""" diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 16ed273488..1005b63c80 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -91,7 +91,7 @@ ZeroInflatedPoisson, continuous, ) -from pymc3.math import kronecker +from pymc3.math import kronecker, logsumexp from pymc3.model import Deterministic, Model, Point from pymc3.tests.helpers import SeededTest, select_by_precision from pymc3.theanof import floatX @@ -575,6 +575,28 @@ def check_logcdf( err_msg=str(pt), ) + def check_selfconsistency_discrete_logcdf( + self, distribution, domain, paramdomains, decimal=None, n_samples=100 + ): + """ + Check that logcdf of discrete distributions matches sum of logps up to value + """ + domains = paramdomains.copy() + domains["value"] = domain + if decimal is None: + decimal = select_by_precision(float64=6, float32=3) + for pt in product(domains, n_samples=n_samples): + params = dict(pt) + value = params.pop("value") + values = np.arange(0, value + 1) + dist = distribution.dist(**params) + assert_almost_equal( + dist.logcdf(value).tag.test_value, + logsumexp(dist.logp(values), keepdims=False).tag.test_value, + decimal=decimal, + err_msg=str(pt), + ) + def check_int_to_1(self, model, value, domain, paramdomains): pdf = model.fastfn(exp(model.logpt)) for pt in product(paramdomains, n_samples=10): @@ -642,6 +664,18 @@ def test_discrete_unif(self): {"lower": -Rplusdunif, "upper": Rplusdunif}, lambda value, lower, upper: sp.randint.logpmf(value, lower, upper + 1), ) + self.check_logcdf( + DiscreteUniform, + Rdunif, + {"lower": -Rplusdunif, "upper": Rplusdunif}, + lambda value, lower, upper: sp.randint.logcdf(value, lower, upper + 1), + ) + self.check_selfconsistency_discrete_logcdf( + DiscreteUniform, + Rdunif, + # Using lower = Bool, as this unittest assumes the distribution domain starts at zero. + {"lower": Bool, "upper": Rplusdunif}, + ) def test_flat(self): self.pymc3_matches_scipy(Flat, Runif, {}, lambda value: 0) @@ -801,32 +835,95 @@ def test_exponential(self): def test_geometric(self): self.pymc3_matches_scipy( - Geometric, Nat, {"p": Unit}, lambda value, p: np.log(sp.geom.pmf(value, p)) + Geometric, + Nat, + {"p": Unit}, + lambda value, p: np.log(sp.geom.pmf(value, p)), + ) + self.check_logcdf( + Geometric, + Nat, + {"p": Unit}, + lambda value, p: sp.geom.logcdf(value, p), + ) + self.check_selfconsistency_discrete_logcdf( + Geometric, + Nat, + {"p": Unit}, ) def test_hypergeometric(self): def modified_scipy_hypergeom_logpmf(value, N, k, n): + # Convert nan to -np.inf original_res = sp.hypergeom.logpmf(value, N, k, n) return original_res if not np.isnan(original_res) else -np.inf + def modified_scipy_hypergeom_logcdf(value, N, k, n): + # Convert nan to -np.inf + original_res = sp.hypergeom.logcdf(value, N, k, n) + + # Correct for scipy bug in logcdf method (see https://github.com/scipy/scipy/issues/13280) + if not np.isnan(original_res): + pmfs = sp.hypergeom.logpmf(np.arange(value + 1), N, k, n) + if np.all(np.isnan(pmfs)): + original_res = np.nan + + return original_res if not np.isnan(original_res) else -np.inf + self.pymc3_matches_scipy( HyperGeometric, Nat, {"N": NatSmall, "k": NatSmall, "n": NatSmall}, - lambda value, N, k, n: modified_scipy_hypergeom_logpmf(value, N, k, n), + modified_scipy_hypergeom_logpmf, + ) + self.check_logcdf( + HyperGeometric, + Nat, + {"N": NatSmall, "k": NatSmall, "n": NatSmall}, + modified_scipy_hypergeom_logcdf, + ) + self.check_selfconsistency_discrete_logcdf( + HyperGeometric, + Nat, + {"N": NatSmall, "k": NatSmall, "n": NatSmall}, ) def test_negative_binomial(self): - def test_fun(value, mu, alpha): + def scipy_mu_alpha_logpmf(value, mu, alpha): return sp.nbinom.logpmf(value, alpha, 1 - mu / (mu + alpha)) - self.pymc3_matches_scipy(NegativeBinomial, Nat, {"mu": Rplus, "alpha": Rplus}, test_fun) + def scipy_mu_alpha_logcdf(value, mu, alpha): + return sp.nbinom.logcdf(value, alpha, 1 - mu / (mu + alpha)) + + self.pymc3_matches_scipy( + NegativeBinomial, + Nat, + {"mu": Rplus, "alpha": Rplus}, + scipy_mu_alpha_logpmf, + ) self.pymc3_matches_scipy( NegativeBinomial, Nat, {"p": Unit, "n": Rplus}, lambda value, p, n: sp.nbinom.logpmf(value, n, p), ) + self.check_logcdf( + NegativeBinomial, + Nat, + {"mu": Rplus, "alpha": Rplus}, + scipy_mu_alpha_logcdf, + ) + self.check_logcdf( + NegativeBinomial, + Nat, + {"p": Unit, "n": Rplus}, + lambda value, p, n: sp.nbinom.logcdf(value, n, p), + ) + self.check_selfconsistency_discrete_logcdf( + NegativeBinomial, + Nat, + {"mu": Rplus, "alpha": Rplus}, + ) @pytest.mark.parametrize( "mu, p, alpha, n, expected", @@ -1024,11 +1121,43 @@ def test_binomial(self): {"n": NatSmall, "p": Unit}, lambda value, n, p: sp.binom.logpmf(value, n, p), ) + self.check_logcdf( + Binomial, + Nat, + {"n": NatSmall, "p": Unit}, + lambda value, n, p: sp.binom.logcdf(value, n, p), + ) + self.check_selfconsistency_discrete_logcdf( + Binomial, + Nat, + {"n": NatSmall, "p": Unit}, + ) # Too lazy to propagate decimal parameter through the whole chain of deps @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") def test_beta_binomial(self): - self.checkd(BetaBinomial, Nat, {"alpha": Rplus, "beta": Rplus, "n": NatSmall}) + self.checkd( + BetaBinomial, + Nat, + {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, + ) + self.pymc3_matches_scipy( + BetaBinomial, + Nat, + {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, + lambda value, alpha, beta, n: sp.betabinom.logpmf(value, a=alpha, b=beta, n=n), + ) + self.check_logcdf( + BetaBinomial, + Nat, + {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, + lambda value, alpha, beta, n: sp.betabinom.logcdf(value, a=alpha, b=beta, n=n), + ) + self.check_selfconsistency_discrete_logcdf( + BetaBinomial, + Nat, + {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, + ) def test_bernoulli(self): self.pymc3_matches_scipy( @@ -1038,7 +1167,27 @@ def test_bernoulli(self): lambda value, logit_p: sp.bernoulli.logpmf(value, scipy.special.expit(logit_p)), ) self.pymc3_matches_scipy( - Bernoulli, Bool, {"p": Unit}, lambda value, p: sp.bernoulli.logpmf(value, p) + Bernoulli, + Bool, + {"p": Unit}, + lambda value, p: sp.bernoulli.logpmf(value, p), + ) + self.check_logcdf( + Bernoulli, + Bool, + {"p": Unit}, + lambda value, p: sp.bernoulli.logcdf(value, p), + ) + self.check_logcdf( + Bernoulli, + Bool, + {"logit_p": R}, + lambda value, logit_p: sp.bernoulli.logcdf(value, scipy.special.expit(logit_p)), + ) + self.check_selfconsistency_discrete_logcdf( + Bernoulli, + Bool, + {"p": Unit}, ) def test_discrete_weibull(self): @@ -1048,10 +1197,29 @@ def test_discrete_weibull(self): {"q": Unit, "beta": Rplusdunif}, discrete_weibull_logpmf, ) + self.check_selfconsistency_discrete_logcdf( + DiscreteWeibull, + Nat, + {"q": Unit, "beta": Rplusdunif}, + ) def test_poisson(self): self.pymc3_matches_scipy( - Poisson, Nat, {"mu": Rplus}, lambda value, mu: sp.poisson.logpmf(value, mu) + Poisson, + Nat, + {"mu": Rplus}, + lambda value, mu: sp.poisson.logpmf(value, mu), + ) + self.check_logcdf( + Poisson, + Nat, + {"mu": Rplus}, + lambda value, mu: sp.poisson.logcdf(value, mu), + ) + self.check_selfconsistency_discrete_logcdf( + Poisson, + Nat, + {"mu": Rplus}, ) def test_bound_poisson(self): @@ -1073,7 +1241,16 @@ def test_constantdist(self): # Too lazy to propagate decimal parameter through the whole chain of deps @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") def test_zeroinflatedpoisson(self): - self.checkd(ZeroInflatedPoisson, Nat, {"theta": Rplus, "psi": Unit}) + self.checkd( + ZeroInflatedPoisson, + Nat, + {"theta": Rplus, "psi": Unit}, + ) + self.check_selfconsistency_discrete_logcdf( + ZeroInflatedPoisson, + Nat, + {"theta": Rplus, "psi": Unit}, + ) # Too lazy to propagate decimal parameter through the whole chain of deps @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") @@ -1083,11 +1260,25 @@ def test_zeroinflatednegativebinomial(self): Nat, {"mu": Rplusbig, "alpha": Rplusbig, "psi": Unit}, ) + self.check_selfconsistency_discrete_logcdf( + ZeroInflatedNegativeBinomial, + Nat, + {"mu": Rplusbig, "alpha": Rplusbig, "psi": Unit}, + ) # Too lazy to propagate decimal parameter through the whole chain of deps @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") def test_zeroinflatedbinomial(self): - self.checkd(ZeroInflatedBinomial, Nat, {"n": NatSmall, "p": Unit, "psi": Unit}) + self.checkd( + ZeroInflatedBinomial, + Nat, + {"n": NatSmall, "p": Unit, "psi": Unit}, + ) + self.check_selfconsistency_discrete_logcdf( + ZeroInflatedBinomial, + Nat, + {"n": NatSmall, "p": Unit, "psi": Unit}, + ) @pytest.mark.parametrize("n", [1, 2, 3]) def test_mvnormal(self, n): From 0583d8eb21e38abc92c545f78b6f558750d15efa Mon Sep 17 00:00:00 2001 From: Ricardo Date: Mon, 28 Dec 2020 12:27:02 +0100 Subject: [PATCH 2/9] Add release note --- RELEASE-NOTES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 2cbff44fc3..f755ec145f 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -19,6 +19,7 @@ It also brings some dreadfully awaited fixes, so be sure to go through the chang - Option to set `check_bounds=False` when instantiating `pymc3.Model()`. This turns off bounds checks that ensure that input parameters of distributions are valid. For correctly specified models, this is unneccessary as all parameters get automatically transformed so that all values are valid. Turning this off should lead to faster sampling (see [#4377](https://github.com/pymc-devs/pymc3/pull/4377)). - `OrderedProbit` distribution added (see [#4232](https://github.com/pymc-devs/pymc3/pull/4232)). - `plot_posterior_predictive_glm` now works with `arviz.InferenceData` as well (see [#4234](https://github.com/pymc-devs/pymc3/pull/4234)) +- Add `logcdf` method to all univariate discrete distributions (see [#4387](https://github.com/pymc-devs/pymc3/pull/4387)). ### Maintenance - Fixed bug whereby partial traces returns after keyboard interrupt during parallel sampling had fewer draws than would've been available [#4318](https://github.com/pymc-devs/pymc3/pull/4318) From a09d93bd9d20c422ccb342a267a201edd6cf2c28 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Mon, 28 Dec 2020 13:30:50 +0100 Subject: [PATCH 3/9] Small reformatting --- pymc3/distributions/discrete.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 786a55b27d..77e10a1a8c 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -332,7 +332,9 @@ def logcdf(self, value): tt.lt(value, 0), -np.inf, tt.switch( - tt.lt(value, n), logsumexp(self.logp(tt.arange(0, value + 1)), keepdims=False), 0 + tt.lt(value, n), + logsumexp(self.logp(tt.arange(0, value + 1)), keepdims=False), + 0, ), ) From 831afbaf8ebf1356cd19bea1b811efc925f3936b Mon Sep 17 00:00:00 2001 From: Ricardo Date: Mon, 28 Dec 2020 14:12:42 +0100 Subject: [PATCH 4/9] Add more comprehensive bound check for impossible parameters --- pymc3/distributions/discrete.py | 83 +++++++++++++++++---------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 77e10a1a8c..5e09e8fd3a 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -168,15 +168,16 @@ def logcdf(self, value): p = self.p value = tt.floor(value) - return tt.switch( - tt.lt(value, n), - bound( + return bound( + tt.switch( + tt.lt(value, n), tt.log(incomplete_beta(n - value, value + 1, 1 - p)), - n > 0, - 0 <= p, - p <= 1, + 0, ), - 0, + 0 <= value, + 0 < n, + 0 <= p, + p <= 1, ) @@ -326,16 +327,19 @@ def logcdf(self, value): TensorVariable """ # TODO: fix for multiple values + alpha = self.alpha + beta = self.beta n = self.n - return tt.switch( - tt.lt(value, 0), - -np.inf, + return bound( tt.switch( tt.lt(value, n), logsumexp(self.logp(tt.arange(0, value + 1)), keepdims=False), 0, ), + 0 <= value, + 0 < alpha, + 0 < beta, ) @@ -452,14 +456,15 @@ def logcdf(self, value): """ p = self.p - return tt.switch( - tt.lt(value, 0), - -np.inf, + return bound( tt.switch( tt.lt(value, 1), - tt.log(1 - p), + tt.log1p(-p), 0, ), + 0 <= value, + 0 <= p, + p <= 1, ) def _distr_parameters_for_repr(self): @@ -589,15 +594,12 @@ def logcdf(self, value): q = self.q beta = self.beta - return tt.switch( - tt.lt(value, 0), - -np.inf, - bound( - tt.log(1 - tt.power(q, tt.power(value + 1, beta))), - 0 < q, - q < 1, - 0 < beta, - ), + return bound( + tt.log(1 - tt.power(q, tt.power(value + 1, beta))), + 0 <= value, + 0 < q, + q < 1, + 0 < beta, ) @@ -707,8 +709,8 @@ def logcdf(self, value): return bound( tt.log(tt.gammaincc(value + 1, mu)), - value >= 0, - mu >= 0, + 0 <= value, + 0 <= mu, ) @@ -888,7 +890,8 @@ def logcdf(self, value): return bound( tt.log(incomplete_beta(alpha, tt.floor(value) + 1, p)), - alpha > 0, + 0 <= value, + 0 < alpha, 0 <= p, p <= 1, ) @@ -993,7 +996,7 @@ def logcdf(self, value): return bound( log1mexp(-tt.log1p(-p) * value), - value >= 0, + 0 <= value, 0 <= p, p <= 1, ) @@ -1126,25 +1129,23 @@ def logcdf(self, value): TensorVariable """ # TODO: fix for multiple values + # TODO: Use lower upper in locgdf for smarter logsumexp? N = self.N n = self.n k = self.k - return tt.switch( - tt.lt(value, 0), - -np.inf, + return bound( tt.switch( tt.lt(value, n), logsumexp(self.logp(tt.arange(0, value + 1)), keepdims=False), - bound( - 0, - N > 0, - k >= 0, - n >= 0, - k <= N, - n <= N, - ), + 0, ), + 0 <= value, + 0 < N, + 0 <= k, + 0 <= n, + k <= N, + n <= N, ) @@ -1253,7 +1254,11 @@ def logcdf(self, value): upper = self.upper lower = self.lower - return tt.log(tt.minimum(tt.floor(value), upper) - lower + 1) - tt.log(upper - lower + 1) + return tt.switch( + tt.lt(upper, lower), + -np.inf, + tt.log(tt.minimum(tt.floor(value), upper) - lower + 1) - tt.log(upper - lower + 1), + ) class Categorical(Discrete): From 450a6f78d4a2b62d5eddef66aa54ebc00bc0aa26 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Mon, 28 Dec 2020 16:23:34 +0100 Subject: [PATCH 5/9] Fix bounds for logcdf of DiscreteUniform --- pymc3/distributions/discrete.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 5e09e8fd3a..c5809464ae 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -1254,10 +1254,10 @@ def logcdf(self, value): upper = self.upper lower = self.lower - return tt.switch( - tt.lt(upper, lower), - -np.inf, + return bound( tt.log(tt.minimum(tt.floor(value), upper) - lower + 1) - tt.log(upper - lower + 1), + lower <= value, + lower <= upper, ) From 2c3c2a0ce3a8d35355f4513b4e1846ab4e997c84 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Tue, 29 Dec 2020 13:22:13 +0100 Subject: [PATCH 6/9] Add safe values/ parameters for `BetaBinomial`, `Poisson`, and `HyperGeometric` to avoid errors when evaluating negative logcdfs. --- pymc3/distributions/discrete.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index c5809464ae..705e98e99b 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -330,11 +330,12 @@ def logcdf(self, value): alpha = self.alpha beta = self.beta n = self.n + safe_lower = tt.switch(tt.lt(value, 0), value, 0) return bound( tt.switch( tt.lt(value, n), - logsumexp(self.logp(tt.arange(0, value + 1)), keepdims=False), + logsumexp(self.logp(tt.arange(safe_lower, value + 1)), keepdims=False), 0, ), 0 <= value, @@ -706,9 +707,12 @@ def logcdf(self, value): """ mu = self.mu value = tt.floor(value) + # To avoid issue with #4340 + safe_mu = tt.switch(tt.lt(mu, 0), 0, mu) + safe_value = tt.switch(tt.lt(value, 0), 0, value) return bound( - tt.log(tt.gammaincc(value + 1, mu)), + tt.log(tt.gammaincc(safe_value + 1, safe_mu)), 0 <= value, 0 <= mu, ) @@ -1133,11 +1137,12 @@ def logcdf(self, value): N = self.N n = self.n k = self.k + safe_lower = tt.switch(tt.lt(value, 0), value, 0) return bound( tt.switch( tt.lt(value, n), - logsumexp(self.logp(tt.arange(0, value + 1)), keepdims=False), + logsumexp(self.logp(tt.arange(safe_lower, value + 1)), keepdims=False), 0, ), 0 <= value, From 670a65292bc6bad424b836ec9b13d2c2218810df Mon Sep 17 00:00:00 2001 From: Ricardo Date: Tue, 29 Dec 2020 14:08:47 +0100 Subject: [PATCH 7/9] Change `check_selfconsistency_discrete_logcdf` to work with negative domains, such as the DiscreteUniform. --- pymc3/tests/test_distributions.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 1005b63c80..9a21e00c15 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -588,7 +588,7 @@ def check_selfconsistency_discrete_logcdf( for pt in product(domains, n_samples=n_samples): params = dict(pt) value = params.pop("value") - values = np.arange(0, value + 1) + values = np.arange(domain.lower, value + 1) dist = distribution.dist(**params) assert_almost_equal( dist.logcdf(value).tag.test_value, @@ -673,8 +673,7 @@ def test_discrete_unif(self): self.check_selfconsistency_discrete_logcdf( DiscreteUniform, Rdunif, - # Using lower = Bool, as this unittest assumes the distribution domain starts at zero. - {"lower": Bool, "upper": Rplusdunif}, + {"lower": -Rplusdunif, "upper": Rplusdunif}, ) def test_flat(self): From bcf87a2656c54eb95b454e5f0773f941007f38c7 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Thu, 31 Dec 2020 11:29:45 +0100 Subject: [PATCH 8/9] Raise TypeError in logcdf methods that only accept scalar values. Fix docstring formatting. --- pymc3/distributions/discrete.py | 67 ++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 6 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 705e98e99b..b665a86597 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -163,7 +163,14 @@ def logcdf(self, value): ------- TensorVariable """ - # TODO: work with multiple values (see #4342) + # incomplete_beta function can only handle scalar values (see #4342) + if np.ndim(value): + raise TypeError( + "Binomial.logcdf expects a scalar value but received a {}-dimensional object.".format( + np.ndim(value) + ) + ) + n = self.n p = self.p value = tt.floor(value) @@ -326,7 +333,14 @@ def logcdf(self, value): ------- TensorVariable """ - # TODO: fix for multiple values + # logcdf can only handle scalar values at the moment + if np.ndim(value): + raise TypeError( + "BetaBinomial.logcdf expects a scalar value but received a {}-dimensional object.".format( + np.ndim(value) + ) + ) + alpha = self.alpha beta = self.beta n = self.n @@ -446,11 +460,13 @@ def logcdf(self, value): """ Compute the log of the cumulative distribution function for Bernoulli distribution at the specified value. + Parameters ---------- value: numeric Value(s) for which log CDF is calculated. If the log CDF for multiple values are desired the values must be provided in a numpy array or theano tensor. + Returns ------- TensorVariable @@ -583,11 +599,13 @@ def logcdf(self, value): """ Compute the log of the cumulative distribution function for Discrete Weibull distribution at the specified value. + Parameters ---------- value: numeric Value(s) for which log CDF is calculated. If the log CDF for multiple values are desired the values must be provided in a numpy array or theano tensor. + Returns ------- TensorVariable @@ -696,11 +714,13 @@ def logcdf(self, value): """ Compute the log of the cumulative distribution function for Poisson distribution at the specified value. + Parameters ---------- value: numeric Value(s) for which log CDF is calculated. If the log CDF for multiple values are desired the values must be provided in a numpy array or theano tensor. + Returns ------- TensorVariable @@ -887,7 +907,14 @@ def logcdf(self, value): ------- TensorVariable """ - # TODO: work with multiple values (see #4342) + # incomplete_beta function can only handle scalar values (see #4342) + if np.ndim(value): + raise TypeError( + "NegativeBinomial.logcdf expects a scalar value but received a {}-dimensional object.".format( + np.ndim(value) + ) + ) + # TODO: avoid `p` recomputation if distribution was defined in terms of `p` alpha = self.alpha p = alpha / (self.mu + alpha) @@ -987,11 +1014,13 @@ def logcdf(self, value): """ Compute the log of the cumulative distribution function for Geometric distribution at the specified value. + Parameters ---------- value: numeric Value(s) for which log CDF is calculated. If the log CDF for multiple values are desired the values must be provided in a numpy array or theano tensor. + Returns ------- TensorVariable @@ -1124,15 +1153,24 @@ def logcdf(self, value): """ Compute the log of the cumulative distribution function for HyperGeometric distribution at the specified value. + Parameters ---------- value: numeric Value for which log CDF is calculated. + Returns ------- TensorVariable """ - # TODO: fix for multiple values + # logcdf can only handle scalar values at the moment + if np.ndim(value): + raise TypeError( + "BetaBinomial.logcdf expects a scalar value but received a {}-dimensional object.".format( + np.ndim(value) + ) + ) + # TODO: Use lower upper in locgdf for smarter logsumexp? N = self.N n = self.n @@ -1247,11 +1285,13 @@ def logcdf(self, value): """ Compute the log of the cumulative distribution function for Discrete uniform distribution at the specified value. + Parameters ---------- value: numeric Value(s) for which log CDF is calculated. If the log CDF for multiple values are desired the values must be provided in a numpy array or theano tensor. + Returns ------- TensorVariable @@ -1558,11 +1598,13 @@ def logcdf(self, value): """ Compute the log of the cumulative distribution function for ZeroInflatedPoisson distribution at the specified value. + Parameters ---------- value: numeric Value(s) for which log CDF is calculated. If the log CDF for multiple values are desired the values must be provided in a numpy array or theano tensor. + Returns ------- TensorVariable @@ -1698,7 +1740,14 @@ def logcdf(self, value): ------- TensorVariable """ - # TODO: work with multiple values (see #4342) + # logcdf can only handle scalar values due to limitation in Binomial.logcdf + if np.ndim(value): + raise TypeError( + "ZeroInflatedBinomial.logcdf expects a scalar value but received a {}-dimensional object.".format( + np.ndim(value) + ) + ) + psi = self.psi return bound( @@ -1861,7 +1910,13 @@ def logcdf(self, value): ------- TensorVariable """ - # TODO: work with multiple values (see #4342) + # logcdf can only handle scalar values due to limitation in NegativeBinomial.logcdf + if np.ndim(value): + raise TypeError( + "ZeroInflatedNegativeBinomial.logcdf expects a scalar value but received a {}-dimensional object.".format( + np.ndim(value) + ) + ) psi = self.psi return bound( From dc4ce4a131e2bbf55ed53805b6b9c40d52dd34ca Mon Sep 17 00:00:00 2001 From: Ricardo Date: Thu, 31 Dec 2020 12:01:29 +0100 Subject: [PATCH 9/9] Small change to logcdf methods of `DiscreteWeibull` and `ZeroInflated` making use of `tt.log1p` and `logaddexp`. More informative comment on workaround for `Poisson.logcdf`. --- pymc3/distributions/discrete.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index b665a86597..d1dc858f9c 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -614,7 +614,7 @@ def logcdf(self, value): beta = self.beta return bound( - tt.log(1 - tt.power(q, tt.power(value + 1, beta))), + tt.log1p(-tt.power(q, tt.power(value + 1, beta))), 0 <= value, 0 < q, q < 1, @@ -727,7 +727,7 @@ def logcdf(self, value): """ mu = self.mu value = tt.floor(value) - # To avoid issue with #4340 + # To avoid gammaincc C-assertion when given invalid values (#4340) safe_mu = tt.switch(tt.lt(mu, 0), 0, mu) safe_value = tt.switch(tt.lt(value, 0), 0, value) @@ -1612,7 +1612,7 @@ def logcdf(self, value): psi = self.psi return bound( - tt.log(1 - psi + psi * tt.exp(self.pois.logcdf(value))), + logaddexp(tt.log1p(-psi), tt.log(psi) + self.pois.logcdf(value)), 0 <= value, 0 <= psi, psi <= 1, @@ -1751,7 +1751,7 @@ def logcdf(self, value): psi = self.psi return bound( - tt.log(1 - psi + psi * tt.exp(self.bin.logcdf(value))), + logaddexp(tt.log1p(-psi), tt.log(psi) + self.bin.logcdf(value)), 0 <= value, 0 <= psi, psi <= 1, @@ -1920,7 +1920,7 @@ def logcdf(self, value): psi = self.psi return bound( - tt.log(1 - psi + psi * tt.exp(self.nb.logcdf(value))), + logaddexp(tt.log1p(-psi), tt.log(psi) + self.nb.logcdf(value)), 0 <= value, 0 <= psi, psi <= 1,