-
Notifications
You must be signed in to change notification settings - Fork 430
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add more Kolmogorov–Smirnov test #1504
Changes from 7 commits
d4334d0
2244c8d
d4f9226
b347cb0
78f6d0d
6c932dc
dce3073
48b6156
10c19b0
d22304d
c71b3ff
02d05e9
2bf4dad
0467655
e67f802
a8e3596
88e58bc
dbfda2b
867f58f
b4148cc
102411d
ac7e19a
5b32931
436ab97
771e79b
7ca4e29
b359a58
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,8 +11,7 @@ use core::f64; | |
use num_traits::AsPrimitive; | ||
use rand::SeedableRng; | ||
use rand_distr::{Distribution, Normal}; | ||
use special::Beta; | ||
use special::Primitive; | ||
use special::{Beta, Gamma, Primitive}; | ||
|
||
// [1] Nonparametric Goodness-of-Fit Tests for Discrete Null Distributions | ||
// by Taylor B. Arnold and John W. Emerson | ||
|
@@ -159,6 +158,186 @@ fn normal() { | |
} | ||
} | ||
|
||
#[test] | ||
fn cauchy() { | ||
fn cdf(x: f64, median: f64, scale: f64) -> f64 { | ||
(1.0 / f64::consts::PI) * ((x - median) / scale).atan() + 0.5 | ||
} | ||
|
||
let parameters = [ | ||
(0.0, 1.0), | ||
(0.0, 0.1), | ||
(1.0, 10.0), | ||
(1.0, 100.0), | ||
(-1.0, 0.00001), | ||
(-1.0, 0.0000001), | ||
]; | ||
|
||
for (seed, (median, scale)) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Cauchy::new(median, scale).unwrap(); | ||
test_continuous(seed as u64, dist, |x| cdf(x, median, scale)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn uniform() { | ||
fn cdf(x: f64, a: f64, b: f64) -> f64 { | ||
if x < a { | ||
0.0 | ||
} else if x < b { | ||
(x - a) / (b - a) | ||
} else { | ||
1.0 | ||
} | ||
} | ||
|
||
let parameters = [(0.0, 1.0), (-1.0, 1.0), (0.0, 100.0), (-100.0, 100.0)]; | ||
|
||
for (seed, (a, b)) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Uniform::new(a, b).unwrap(); | ||
test_continuous(seed as u64, dist, |x| cdf(x, a, b)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn log_normal() { | ||
fn cdf(x: f64, mean: f64, std_dev: f64) -> f64 { | ||
if x <= 0.0 { | ||
0.0 | ||
} else if x.is_infinite() { | ||
1.0 | ||
} else { | ||
0.5 * ((mean - x.ln()) / (std_dev * f64::consts::SQRT_2)).erfc() | ||
} | ||
} | ||
|
||
let parameters = [ | ||
(0.0, 1.0), | ||
(0.0, 0.1), | ||
(0.5, 0.7), | ||
(1.0, 10.0), | ||
(1.0, 100.0), | ||
]; | ||
|
||
println!("{}", cdf(5.0, 2.0, 1.5)); | ||
|
||
for (seed, (mean, std_dev)) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::LogNormal::new(mean, std_dev).unwrap(); | ||
test_continuous(seed as u64, dist, |x| cdf(x, mean, std_dev)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn exp() { | ||
fn cdf(x: f64, lambda: f64) -> f64 { | ||
1.0 - (-lambda * x).exp() | ||
} | ||
|
||
let parameters = [0.5, 1.0, 7.5, 32.0, 100.0]; | ||
|
||
for (seed, lambda) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Exp::new(lambda).unwrap(); | ||
test_continuous(seed as u64, dist, |x| cdf(x, lambda)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn weibull() { | ||
fn cdf(x: f64, lambda: f64, k: f64) -> f64 { | ||
if x < 0.0 { | ||
return 0.0; | ||
} | ||
|
||
1.0 - (-(x / lambda).powf(k)).exp() | ||
} | ||
|
||
let parameters = [ | ||
(0.5, 1.0), | ||
(1.0, 1.0), | ||
(10.0, 0.1), | ||
(0.1, 10.0), | ||
(15.0, 20.0), | ||
// (1000.0, 0.001), // Fail case | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This case does not really make sense because of the small shape parameter `k`. |
||
]; | ||
|
||
for (seed, (lambda, k)) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Weibull::new(lambda, k).unwrap(); | ||
test_continuous(seed as u64, dist, |x| cdf(x, lambda, k)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn gamma() { | ||
fn cdf(x: f64, shape: f64, scale: f64) -> f64 { | ||
if x < 0.0 { | ||
return 0.0; | ||
} | ||
|
||
(x / scale).inc_gamma(shape) | ||
} | ||
|
||
let parameters = [ | ||
(0.5, 2.0), | ||
(1.0, 1.0), | ||
(10.0, 0.1), | ||
(100.0, 0.0001), | ||
(0.9999, 2.0), | ||
]; | ||
|
||
for (seed, (shape, scale)) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Gamma::new(shape, scale).unwrap(); | ||
test_continuous(seed as u64, dist, |x| cdf(x, shape, scale)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn chi_squared() { | ||
fn cdf(x: f64, k: f64) -> f64 { | ||
if x < 0.0 { | ||
return 0.0; | ||
} | ||
|
||
(x / 2.0).inc_gamma(k / 2.0) | ||
} | ||
|
||
let parameters = [ | ||
// 0.01, // Fail case | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This case should also be fine to drop (for now). The distribution has very high kurtosis, so there will be a significant amount of mass in the tails. |
||
0.1, 1.0, 2.0, 10.0, 100.0, 1000.0, | ||
]; | ||
|
||
for (seed, k) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::ChiSquared::new(k).unwrap(); | ||
test_continuous(seed as u64, dist, |x| cdf(x, k)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn beta() { | ||
fn cdf(x: f64, alpha: f64, beta: f64) -> f64 { | ||
if x < 0.0 { | ||
return 0.0; | ||
} | ||
if x > 1.0 { | ||
return 1.0; | ||
} | ||
let ln_beta_ab = alpha.ln_beta(beta); | ||
x.inc_beta(alpha, beta, ln_beta_ab) | ||
} | ||
|
||
let parameters = [ | ||
(0.5, 0.5), | ||
(2.0, 3.5), | ||
(10.0, 1.0), | ||
(100.0, 50.0), | ||
// (10.0, 0.1), // Fail case | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is also a very extreme distribution, concentrated around `x = 1`. |
||
]; | ||
|
||
for (seed, (alpha, beta)) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Beta::new(alpha, beta).unwrap(); | ||
test_continuous(seed as u64, dist, |x| cdf(x, alpha, beta)); | ||
} | ||
} | ||
|
||
fn binomial_cdf(k: i64, p: f64, n: u64) -> f64 { | ||
if k < 0 { | ||
return 0.0; | ||
|
@@ -195,3 +374,81 @@ fn binomial() { | |
}); | ||
} | ||
} | ||
|
||
#[test] | ||
fn geometric() { | ||
fn cdf(k: i64, p: f64) -> f64 { | ||
if k < 0 { | ||
0.0 | ||
} else { | ||
1.0 - (1.0 - p).powi(1 + k as i32) | ||
} | ||
} | ||
|
||
let parameters = [0.3, 0.5, 0.7, 0.0000001, 0.9999]; | ||
|
||
for (seed, p) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Geometric::new(p).unwrap(); | ||
test_discrete(seed as u64, dist, |k| cdf(k, p)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn hypergeometric() { | ||
fn cdf(x: i64, n: u64, k: u64, n_: u64) -> f64 { | ||
let min = if n_ + k > n { n_ + k - n } else { 0 }; | ||
let max = k.min(n_); | ||
if x < min as i64 { | ||
return 0.0; | ||
} else if x >= max as i64 { | ||
return 1.0; | ||
} | ||
|
||
(min..x as u64 + 1).fold(0.0, |acc, k_| { | ||
acc + (ln_binomial(k, k_) + ln_binomial(n - k, n_ - k_) - ln_binomial(n, n_)).exp() | ||
}) | ||
} | ||
|
||
let parameters = [ | ||
(15, 13, 10), | ||
(25, 15, 5), | ||
(60, 10, 7), | ||
(70, 20, 50), | ||
(100, 50, 10), | ||
// (100, 50, 49), // Fail case | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was a bug in the hypergeometric sampling; after that fix is merged, this test should be included. |
||
]; | ||
|
||
for (seed, (n, k, n_)) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Hypergeometric::new(n, k, n_).unwrap(); | ||
test_discrete(seed as u64, dist, |x| cdf(x, n, k, n_)); | ||
} | ||
} | ||
|
||
#[test] | ||
fn poisson() { | ||
fn cdf(k: i64, lambda: f64) -> f64 { | ||
if k < 0 || lambda <= 0.0 { | ||
return 0.0; | ||
} | ||
|
||
1.0 - lambda.inc_gamma(k as f64 + 1.0) | ||
} | ||
|
||
let parameters = [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you make this a test over We have to think about how to test over different floating point, but I would do everything in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't use However, if I move the KS test to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can change the definition of |
||
0.1_f32, 1.0, 7.5, | ||
// 1.844E+19, // fail case | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This case fails because your cdf does not work for values bigger than roughly `u64::MAX` (1.844E+19 ≈ 2^64). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It might not be related to the implementation of the cdf itself. I will also check other implementations of the CDF to see if there is a possibility of underflow. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In the latest commit, I checked the cdf against large lambda values:

    println!("{}", cdf(1e9 as i64, 1e9));          // 0.5000077942086617
    println!("{}", cdf(2e9 as i64, 1e9));          // 1
    println!("{}", cdf((1e9 - 1e4) as i64, 1e9));  // 0.3759229845139799

For reference, here is the same calculation in R:

    > ppois(1e9, 1e9)
    [1] 0.5000084
    > ppois(2e9, 1e9)
    [1] 1
    > ppois(1e9-1e4, 1e9)
    [1] 0.3759226
||
]; | ||
|
||
for (seed, lambda) in parameters.into_iter().enumerate() { | ||
let dist = rand_distr::Poisson::new(lambda).unwrap(); | ||
test_discrete(seed as u64, dist, |k| cdf(k, lambda as f64)); | ||
} | ||
} | ||
|
||
fn ln_factorial(n: u64) -> f64 { | ||
(n as f64 + 1.0).lgamma().0 | ||
} | ||
|
||
fn ln_binomial(n: u64, k: u64) -> f64 { | ||
ln_factorial(n) - ln_factorial(k) - ln_factorial(n - k) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is still a debug print. By the way, there is a `dbg!` macro for this.