Generalized Poisson distribution #4775

Closed · wants to merge 4 commits
1 change: 1 addition & 0 deletions docs/source/api/distributions/discrete.rst
@@ -11,6 +11,7 @@ Discrete
   Bernoulli
   DiscreteWeibull
   Poisson
   GeneralizedPoisson
   NegativeBinomial
   Constant
   ZeroInflatedPoisson

2 changes: 2 additions & 0 deletions pymc/distributions/__init__.py
@@ -64,6 +64,7 @@
    Constant,
    DiscreteUniform,
    DiscreteWeibull,
    GeneralizedPoisson,
    Geometric,
    HyperGeometric,
    NegativeBinomial,
@@ -139,6 +140,7 @@
    "BetaBinomial",
    "Bernoulli",
    "Poisson",
    "GeneralizedPoisson",
    "NegativeBinomial",
    "Constant",
    "ZeroInflatedPoisson",

155 changes: 154 additions & 1 deletion pymc/distributions/discrete.py
@@ -46,7 +46,7 @@
from pymc.distributions.logprob import logp
from pymc.distributions.mixture import Mixture
from pymc.distributions.shape_utils import rv_size_is_none
-from pymc.math import sigmoid
+from pymc.math import log1mexp_numpy, sigmoid
from pymc.vartypes import continuous_types

__all__ = [
@@ -55,6 +55,7 @@
    "Bernoulli",
    "DiscreteWeibull",
    "Poisson",
    "GeneralizedPoisson",
    "NegativeBinomial",
    "Constant",
    "ZeroInflatedPoisson",
@@ -664,6 +665,158 @@ def logcdf(value, mu):
        return check_parameters(res, 0 <= mu, msg="mu >= 0")


class GeneralizedPoissonRV(RandomVariable):
    name = "generalized_poisson"
    ndim_supp = 0
    ndims_params = [0, 0]
    dtype = "int64"
    _print_name = ("GeneralizedPoisson", "\\operatorname{GeneralizedPoisson}")

    @classmethod
    def rng_fn(cls, rng, theta, lam, size):
        theta = np.asarray(theta)
        lam = np.asarray(lam)

        if size is not None:
            dist_size = size
        else:
            dist_size = np.broadcast_shapes(theta.shape, lam.shape)

        # Inversion algorithm described by Famoye (1997), computed on the log scale for
        # numerical stability.
        # TODO: The authors recommend mixing this method with the "branching" algorithm,
        # but that one is only valid for positive lambdas. They also mention the normal
        # approximation for lambda < 0 and mu >= 10, or 0 < lambda < 0.2 and mu >= 30.
        # This could be done by splitting the values depending on lambda and then
        # recombining them.

        log_u = np.log(rng.uniform(size=dist_size))

        pos_lam = lam > 0
        with np.errstate(divide="ignore", invalid="ignore"):
            mixed_log_lam = np.where(pos_lam, np.log(lam), np.log(-lam))
            theta_m_lam = theta - lam

        log_s = -theta
        log_p = log_s.copy()
        x_ = 0
        x = np.zeros(dist_size)
        below_cutpoint = log_s < log_u
        with np.errstate(divide="ignore", invalid="ignore"):
            while np.any(below_cutpoint):
                x_ += 1
                x[below_cutpoint] += 1
                log_c = np.log(theta_m_lam + lam * x_)
                # Compute log(1 + lam / c)
                log1p_lam_m_C = np.where(
                    pos_lam,
                    np.log1p(np.exp(mixed_log_lam - log_c)),
                    log1mexp_numpy(mixed_log_lam - log_c, negative_input=True),
                )
                log_p = log_c + log1p_lam_m_C * (x_ - 1) + log_p - np.log(x_) - lam
                log_s = np.logaddexp(log_s, log_p)
                below_cutpoint = log_s < log_u
        return x


generalized_poisson = GeneralizedPoissonRV()
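
For intuition, a scalar-case sketch of the same Famoye (1997) inversion recursion, written on the probability scale for readability (the vectorized `rng_fn` above works on the log scale to avoid underflow for large `theta`). The helper name `gpois_draw` is hypothetical, not part of this diff:

```python
import numpy as np


def gpois_draw(theta, lam, rng):
    # Inversion sampling: walk up the support, accumulating pmf terms
    # until the running cdf passes a uniform draw.
    # P(0) = exp(-theta); subsequent terms follow the recurrence
    # P(x) = c * (1 + lam / c) ** (x - 1) * exp(-lam) * P(x - 1) / x,
    # where c = theta - lam + lam * x.
    u = rng.uniform()
    p = s = np.exp(-theta)  # pmf and cdf at x = 0
    x = 0
    while s < u:
        x += 1
        c = theta - lam + lam * x
        p = c * (1.0 + lam / c) ** (x - 1) * np.exp(-lam) * p / x
        s += p
    return x


# e.g. np.mean([gpois_draw(5.0, 0.2, np.random.default_rng(i)) for i in range(1000)])
# is approximately 5.0 / (1 - 0.2) = 6.25
```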


class GeneralizedPoisson(Discrete):
    R"""
    Generalized Poisson log-likelihood.

    Used to model count data that can be either overdispersed or underdispersed.
    It offers greater flexibility than the standard Poisson, which assumes
    equidispersion (mean equal to variance).

    The pmf of this distribution is

    .. math:: f(x \mid \mu, \lambda) =
        \frac{\mu (\mu + \lambda x)^{x-1} e^{-\mu - \lambda x}}{x!}

    ========  ======================================
    Support   :math:`x \in \mathbb{N}_0`
    Mean      :math:`\frac{\mu}{1 - \lambda}`
    Variance  :math:`\frac{\mu}{(1 - \lambda)^3}`
    ========  ======================================

    Parameters
    ----------
    mu : tensor_like of float
        Mean parameter (mu > 0).
    lam : tensor_like of float
        Dispersion parameter (max(-1, -mu/4) <= lam <= 1).

    Notes
    -----
    When lam = 0, the Generalized Poisson reduces to the standard Poisson with the same mu.
    When lam < 0, the mean is greater than the variance (underdispersion).
    When lam > 0, the mean is less than the variance (overdispersion).

    References
    ----------
    The PMF is taken from [1]_ and the random generator function is adapted from [2]_.

    .. [1] Consul, P. C., and Felix Famoye. "Generalized Poisson regression model."
       Communications in Statistics - Theory and Methods 21.1 (1992): 89-109.

    .. [2] Famoye, Felix. "Generalized Poisson random variate generation." American
       Journal of Mathematical and Management Sciences 17.3-4 (1997): 219-237.
    """

    rv_op = generalized_poisson

    @classmethod
    def dist(cls, mu, lam, **kwargs):
        mu = at.as_tensor_variable(floatX(mu))
        lam = at.as_tensor_variable(floatX(lam))
        return super().dist([mu, lam], **kwargs)

    def moment(rv, size, mu, lam):
        mean = at.floor(mu / (1 - lam))
        if not rv_size_is_none(size):
            mean = at.full(size, mean)
        return mean

    def logp(value, mu, lam):
        r"""
        Calculate the log-probability of the Generalized Poisson distribution at a specified value.

        Parameters
        ----------
        value : numeric
            Value(s) for which the log-probability is calculated. If the log-probabilities
            of multiple values are desired, they must be provided in a numpy array or
            Aesara tensor.

        Returns
        -------
        TensorVariable
        """
        mu_lam_value = mu + lam * value
        logprob = at.log(mu) + logpow(mu_lam_value, value - 1) - mu_lam_value - factln(value)

        # The probability is 0 when value > m, where m is the largest positive integer
        # for which mu + m * lam > 0 (when lam < 0).
        logprob = at.switch(
            at.or_(
                mu_lam_value < 0,
                value < 0,
            ),
            -np.inf,
            logprob,
        )

        return check_parameters(
            logprob,
            0 < mu,
            at.abs(lam) <= 1,
            (-mu / 4) <= lam,
            msg="0 < mu, max(-1, -mu/4) <= lam <= 1",
        )
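
A minimal usage sketch, assuming this branch is installed. It checks the moments from the table in the docstring (mean `mu / (1 - lam)`, variance `mu / (1 - lam) ** 3`); with `lam < 0` the draws are underdispersed and the support is truncated at the largest integer m with `mu + m * lam > 0`:

```python
import pymc as pm

with pm.Model():
    # lam < 0: variance 10 / 1.3**3 ~ 4.6 is smaller than mean 10 / 1.3 ~ 7.7
    x = pm.GeneralizedPoisson("x", mu=10.0, lam=-0.3)
    idata = pm.sample_prior_predictive(samples=5_000)

draws = idata.prior["x"].values
print(draws.mean(), draws.var())  # ~7.7, ~4.6
```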


class NegativeBinomial(Discrete):
R"""
Negative binomial log-likelihood.

48 changes: 48 additions & 0 deletions pymc/tests/test_distributions.py
@@ -81,6 +81,7 @@ def polyagamma_cdf(*args, **kwargs):
    Exponential,
    Flat,
    Gamma,
    GeneralizedPoisson,
    Geometric,
    Gumbel,
    HalfCauchy,
@@ -1742,6 +1743,53 @@ def test_poisson(self):
            {"mu": Rplus},
        )

    def test_generalized_poisson(self):
        # We only check this distribution for lam = 0, where it is equivalent to Poisson.
        self.check_logp(
            GeneralizedPoisson,
            Nat,
            {"mu": Rplus, "lam": Domain([0], edges=(None, None))},
            lambda value, mu, lam: sp.poisson.logpmf(value, mu),
            skip_paramdomain_outside_edge_test=True,
        )

        value = at.scalar("value")
        mu = at.scalar("mu")
        lam = at.scalar("lam")
        logp = pm.logp(GeneralizedPoisson.dist(mu, lam), value)
        logp_fn = aesara.function([value, mu, lam], logp)

        # Check out-of-bounds values
        assert logp_fn(-1, mu=5, lam=0) == -np.inf
        assert logp_fn(9, mu=5, lam=-1) == -np.inf

        # Check mu/lam restrictions
        with pytest.raises(ParameterValueError):
            logp_fn(1, mu=1, lam=2)

        with pytest.raises(ParameterValueError):
            logp_fn(1, mu=0, lam=0)

        with pytest.raises(ParameterValueError):
            logp_fn(1, mu=1, lam=-1)

    def test_generalized_poisson_lam_expected_moments(self):
        # TODO: This is a costly test; we should find a cheaper way to test the logp.
        mu = 30
        lam = np.array([-0.9, -0.7, -0.2, 0, 0.2, 0.7, 0.9])
        with Model():
            # We create separate dists because the Metropolis sampler cannot find
            # a single step size that accommodates all lambda values.
            dist = [pm.GeneralizedPoisson(f"x_{l}", mu=mu, lam=l) for l in lam]
            pm.Deterministic("x", at.stack(dist))
            trace = pm.sample(return_inferencedata=False, chains=1, draws=10_000)

        expected_mean = mu / (1 - lam)
        np.testing.assert_allclose(trace["x"].mean(0), expected_mean, rtol=1e-1)

        expected_std = np.sqrt(mu / (1 - lam) ** 3)
        np.testing.assert_allclose(trace["x"].std(0), expected_std, rtol=1e-1)

    def test_constantdist(self):
        self.check_logp(Constant, I, {"c": I}, lambda value, c: np.log(c == value))
        self.check_logcdf(Constant, I, {"c": I}, lambda value, c: np.log(value >= c))

14 changes: 14 additions & 0 deletions pymc/tests/test_distributions_moments.py
@@ -28,6 +28,7 @@
    Exponential,
    Flat,
    Gamma,
    GeneralizedPoisson,
    Geometric,
    Gumbel,
    HalfCauchy,
@@ -592,6 +593,19 @@ def test_poisson_moment(mu, size, expected):
    assert_moment_is_expected(model, expected)


@pytest.mark.parametrize(
    "mu, lam, size, expected",
    [
        (50, [-0.6, 0, 0.6], None, np.floor(50 / (1 - np.array([-0.6, 0, 0.6])))),
        ([5, 50], -0.1, (4, 2), np.full((4, 2), np.floor(np.array([5, 50]) / 1.1))),
    ],
)
def test_generalized_poisson_moment(mu, lam, size, expected):
    with Model() as model:
        GeneralizedPoisson("x", mu=mu, lam=lam, size=size)
    assert_moment_is_expected(model, expected)


@pytest.mark.parametrize(
    "n, p, size, expected",
    [

49 changes: 39 additions & 10 deletions pymc/tests/test_distributions_random.py
@@ -42,7 +42,7 @@ def random_polyagamma(*args, **kwargs):

import pymc as pm

-from pymc.aesaraf import change_rv_size, floatX, intX
+from pymc.aesaraf import change_rv_size, compile_pymc, floatX, intX
from pymc.distributions.continuous import get_tau_sigma, interpolated
from pymc.distributions.discrete import _OrderedLogistic, _OrderedProbit
from pymc.distributions.dist_math import clipped_beta_rvs
@@ -84,7 +84,7 @@ def pymc_random(

    model, param_vars = build_model(dist, valuedomain, paramdomains, extra_args)
    model_dist = change_rv_size_fn(model.named_vars["value"], size, expand=True)
-    pymc_rand = aesara.function([], model_dist)
+    pymc_rand = compile_pymc([], model_dist)

    domains = paramdomains.copy()
    for pt in product(domains, n_samples=100):
@@ -123,7 +123,7 @@ def pymc_random_discrete(

    model, param_vars = build_model(dist, valuedomain, paramdomains)
    model_dist = change_rv_size(model.named_vars["value"], size, expand=True)
-    pymc_rand = aesara.function([], model_dist)
+    pymc_rand = compile_pymc([], model_dist)

    domains = paramdomains.copy()
    for pt in product(domains, n_samples=100):
@@ -144,15 +144,14 @@ def pymc_random_discrete(
        e = intX(ref_rand(size=size, **pt))
        o = np.atleast_1d(o).flatten()
        e = np.atleast_1d(e).flatten()
-        observed = dict(zip(*np.unique(o, return_counts=True)))
-        expected = dict(zip(*np.unique(e, return_counts=True)))
-        for e in expected.keys():
-            expected[e] = (observed.get(e, 0), expected[e])
-        k = np.array([v for v in expected.values()])
-        if np.all(k[:, 0] == k[:, 1]):
+        bins = min(20, max(len(set(e)), len(set(o))))
+        range = (min(min(e), min(o)), max(max(e), max(o)))
+        observed, _ = np.histogram(o, bins=bins, range=range)
+        expected, _ = np.histogram(e, bins=bins, range=range)
+        if np.all(observed == expected):
            p = 1.0
        else:
-            _, p = st.chisquare(k[:, 0], k[:, 1])
+            _, p = st.chisquare(observed + 1, expected + 1)
        f -= 1
        assert p > alpha, str(pt)
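
For reference, a standalone sketch of the histogram-based goodness-of-fit check this hunk switches to (`value_range` is renamed from the diff's `range` to avoid shadowing the builtin; the Poisson data is illustrative only):

```python
import numpy as np
import scipy.stats as st

rng = np.random.default_rng(123)
o = rng.poisson(5.0, size=1000)  # draws from the sampler under test
e = rng.poisson(5.0, size=1000)  # draws from the reference sampler

bins = min(20, max(len(set(e)), len(set(o))))
value_range = (min(o.min(), e.min()), max(o.max(), e.max()))
observed, _ = np.histogram(o, bins=bins, range=value_range)
expected, _ = np.histogram(e, bins=bins, range=value_range)

# Both histograms count the same 1000 draws, so adding 1 to every bin keeps
# the totals equal (as scipy requires) while avoiding zero expected counts.
_, p = st.chisquare(observed + 1, expected + 1)
print(p > 0.05)  # expected True: the samples come from the same distribution
```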

@@ -973,6 +972,36 @@ class TestPoisson(BaseTestDistributionRandom):
    checks_to_run = ["check_pymc_params_match_rv_op"]


class TestGeneralizedPoisson(BaseTestDistributionRandom):
    pymc_dist = pm.GeneralizedPoisson
    pymc_dist_params = {"mu": 4.0, "lam": 1.0}
    expected_rv_op_params = {"mu": 4.0, "lam": 1.0}
    checks_to_run = [
        "check_pymc_params_match_rv_op",
        "check_rv_size",
    ]

    def test_random(self):
        pymc_random_discrete(
            dist=self.pymc_dist,
            paramdomains={"mu": Rplus, "lam": Domain([0], edges=(None, None))},
            ref_rand=lambda mu, lam, size: st.poisson.rvs(mu, size=size),
        )

    def test_random_lam_expected_moments(self):
        mu = 50
        lam = np.array([-0.9, -0.7, -0.2, 0, 0.2, 0.7, 0.9])

        dist = self.pymc_dist.dist(mu=mu, lam=lam, size=(10_000, len(lam)))
        draws = dist.eval()

        expected_mean = mu / (1 - lam)
        np.testing.assert_allclose(draws.mean(0), expected_mean, rtol=1e-1)

        expected_std = np.sqrt(mu / (1 - lam) ** 3)
        np.testing.assert_allclose(draws.std(0), expected_std, rtol=1e-1)


class TestMvNormalCov(BaseTestDistributionRandom):
    pymc_dist = pm.MvNormal
    pymc_dist_params = {