diff --git a/NEWS.md b/NEWS.md index 628f82e..91d2db4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,7 +9,7 @@ BUG FIX - mgedist() may suffer a numerical issue for Anderson-Darling GoF metrics. All GoF metrics now take care of numerical issue, such as log(0) or 1/0, and are properly scaled by the sample sized to avoid large sample size issues. Thanks for Ethan Chapman for reporting the bug. - the default starting value for the gamma distribution was wrongly computed for the rate parameter. Thanks for Wendy Martin for reporting the bug. -- mledist() may suffer a scaling issue and the objective function is properly scaled by the sample sized to avoid large sample size issues. +- mledist(), mmedist(), qmedist() may suffer a scaling issue and the objective function is properly scaled by the sample sized to avoid large sample size issues. # fitdistrplus 1.2-1 diff --git a/R/mmedist.R b/R/mmedist.R index 164a838..d16b9f7 100644 --- a/R/mmedist.R +++ b/R/mmedist.R @@ -386,7 +386,7 @@ mmedist <- function (data, distr, order, memp, start=NULL, fix.arg=NULL, (momemp - momtheo)^2 } fnobj <- function(par, fix.arg, obs, mdistnam, memp, weights) - sum( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp)) ) + mean( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp)) ) }else { DIFF2 <- function(par, fix.arg, order, obs, mdistnam, memp, weights) @@ -396,7 +396,7 @@ mmedist <- function (data, distr, order, memp, start=NULL, fix.arg=NULL, (momemp - momtheo)^2 } fnobj <- function(par, fix.arg, obs, mdistnam, memp, weights) - sum( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp, weights)) ) + mean( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp, weights)) ) } cens <- FALSE diff --git a/R/qmedist.R b/R/qmedist.R index f3822eb..45837db 100644 --- a/R/qmedist.R +++ b/R/qmedist.R @@ -133,7 +133,7 @@ qmedist <- function (data, distr, probs, start=NULL, fix.arg=NULL, } fnobj <- function(par, fix.arg, obs, qdistnam, qtype) - sum( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) ) + mean( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) ) }else if (!cens && !is.null(weights)) { @@ -144,7 +144,7 @@ qmedist <- function (data, distr, probs, start=NULL, fix.arg=NULL, (qemp - qtheo)^2 } fnobj <- function(par, fix.arg, obs, qdistnam, qtype) - sum( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) ) + mean( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) ) } # Function to calculate the loglikelihood to return diff --git a/tests/t-mledist-nocens.R b/tests/t-mledist-nocens.R index bd42bd0..651f614 100644 --- a/tests/t-mledist-nocens.R +++ b/tests/t-mledist-nocens.R @@ -114,7 +114,7 @@ x <- c(rnorm(1000, 5), rnorm(1000, 10)) #MLE fit fit1 <- mledist(x, "norm2", start=list(w=1/3, m1=4, s1=2, m2=8, s2=2), lower=c(0, 0, 0, 0, 0)) - +fit1 #fitted coef around 0.5003298 4.9842719 0.9909527 10.0296973 1.0024444 , fitted loglik -4185.114 @@ -307,8 +307,6 @@ x <- c(rep(0, n), rpois(n, 10), rpois(n, 50)) mledist(x, "pois", optim.method="Nelder-Mead", control=list(maxit=10)) - - # (15) basic fit of a normal distribution with new fix.arg/start.arg # diff --git a/tests/t-mmedist.R b/tests/t-mmedist.R index aa0b6f9..c87b906 100644 --- a/tests/t-mmedist.R +++ b/tests/t-mmedist.R @@ -8,6 +8,8 @@ set.seed(1234) x1 <- rnorm(n=nsample) mmedist(x1,"norm") +#fitted coef is -0.3831574 0.9446870, fitted loglik is -13.62037 + try(mmedist(x1,"norm", fix.arg=list(mean=0))) # (2) fit a discrete distribution (Poisson) @@ -17,6 +19,8 @@ set.seed(1234) x2 <- rpois(n=nsample, lambda = 2) mmedist(x2,"pois") +#fitted coef is 1.7, fitted loglik is -15.31626 + # (3) fit a finite-support distribution (beta) # @@ -24,7 +28,7 @@ set.seed(1234) x3 <- rbeta(n=nsample, shape1=5, shape2=10) mmedist(x3,"beta") - +#fitted coef is 3.956081 9.512364, fitted loglik is 6.939847 # (4) fit a Pareto distribution # @@ -42,6 +46,8 @@ if(any(installed.packages()[, "Package"] == "actuar")) #fit mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf) + #fitted coef is 127.4319 64.2936, fitted loglik is -3.689438 + mmedist(x4, "pareto", order=1, memp=memp, start=list(shape=10), fix.arg=list(scale=1.5), lower=2, upper=Inf) mmedist(x4, "pareto", order=1, memp=memp, start=function(x) list(shape=10), fix.arg=list(scale=1.5), @@ -78,6 +84,7 @@ s2 <- log(1+var(x3)/mean(x3)^2*(n-1)/n) mu <- log(mean(x3)) - s2/2 cbind(c(mu, s2), f2$estimate) +#fitted coef is -0.1123774 1.1839216, fitted loglik is -12.99279 c(truestim=exp(mu+s2/2), jensen=as.numeric(exp(f1$estimate["meanlog"]+f1$estimate["sdlog"]^2/2)), @@ -124,6 +131,8 @@ fitdistrplus:::wtdvar(x1, w) mmedist(exp(x1), "lnorm", weights=w)$estimate +#fitted coef is -0.4199772 0.6276551, fitted loglik is -26.82328 + #test non integer weights try(mmedist(x1, "norm", weights=rep(1/3, length(x1)))) try(mmedist(1:10, "pois", weights=c(rep(1, 9), 1.001), start=list(lambda=mean(x)))) @@ -132,7 +141,7 @@ try(mmedist(1:10, "pois", weights=c(rep(1, 9), 1.0000001), start=list(lambda=mea # (8) fit of a neg binom distribution with weighted moment matching estimation # - +set.seed(1234) x4 <- rnbinom(nsample, 5, 1/2) table(x4) w <- rep(1, length(x4)) @@ -141,6 +150,8 @@ w[x4 > 5] <- 2 mmedist(x4, "nbinom", weights=w)$estimate mmedist(x4, "nbinom", weights=NULL)$estimate +#fitted coef is 12.980769 4.615385, fitted loglik is -29.67728 + # (9) relevant example for zero modified geometric distribution # @@ -207,6 +218,9 @@ w[x5 > 5] <- 2 mmedist(x5, "zmgeom", order=1:2, memp=memp1, start=list(p1=mean(x5 == 0), p2=1/mean(x5[x5 > 0])), lower=0.01, upper=0.99)$estimate + +#fitted coef is 0.4528775 0.1402878 , fitted loglik is -26.80729 + mmedist(x5, "zmgeom", order=1:2, memp=memp2, start=list(p1=mean(x5 == 0), p2=1/mean(x5[x5 > 0])), weights=w)$estimate @@ -221,6 +235,7 @@ mmedist(x5, "zmgeom", order=1:2, memp=memp2, start=list(p1=mean(x5 == 0), p2=1/m if(any(installed.packages()[, "Package"] == "actuar")) { require("actuar") + set.seed(1234) #simulate a sample x4 <- rpareto(nsample, 6, 2) @@ -231,4 +246,25 @@ if(any(installed.packages()[, "Package"] == "actuar")) #fit mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf, optim.method = "L-BFGS-B") #L-BFGS-B via optim mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf, optim.method = "Nelder") #Nelder Mead via constrOptim + + #fitted coef is 4.294391 1.597783, fitted loglik is -2.173544 } + + +# (11) large sample size issue + +if(FALSE) +{ + set.seed(123) + obs <- rlnorm(1e7, 3, 2) + for(i in 2:7) + cat(i, try(mmedist(obs[1:10^i], "lnorm", control=list(trace=0, REPORT=1))$estimate, silent=TRUE), "\n") + + # 2 3.795472 1.370517 + # 3 3.569512 1.67947 + # 4 3.241516 1.881192 + # 5 3.179215 1.902745 + # 6 2.993209 2.004211 + # 7 2.996089 2.002622 +} + diff --git a/tests/t-msedist.R b/tests/t-msedist.R index 0020db1..52d74d7 100644 --- a/tests/t-msedist.R +++ b/tests/t-msedist.R @@ -35,11 +35,15 @@ legend("bottomright", lty=1:2, col=c("green", "blue"), leg=c("theoretical value" msedist(x1, "exp", control=list(trace=0, REPORT=1)) +#fitted coef is 1.336924, fitted loglik is -5.610213 + mse_exp <- fitdist(x1, "exp", method="mse") plot(mse_exp) summary(mse_exp) gofstat(mse_exp) +#fitted coef is -0.1486435 0.9892013, fitted loglik is -13.92195 + mse_exp_boot <- bootdist(mse_exp, niter = nboot) plot(mse_exp_boot) abline(v=1, col="green") @@ -88,6 +92,8 @@ optim(c(2,2), Dn) msedist(x1, "lnorm", control=list(trace=0, REPORT=1)) +#fitted coef is -0.5939281 0.6723368, fitted loglik is -2.380166 + mse_lnorm <- fitdist(x1, "lnorm", method="mse") mle_lnorm <- fitdist(x1, "lnorm", method="mle") plot(mse_lnorm) @@ -113,6 +119,7 @@ optim(c(1,1,10), Dn) msedist(x1, "burr", start=list(shape1=1, shape2=1, rate=10), control=list(trace=0, REPORT=1)) +#fitted coef is 0.609447 3.343173 3.811287, fitted loglik is -3.023078 mse_burr <- fitdist(x1, "burr", method="mse", start=list(shape1=1, shape2=1, rate=10)) mle_burr <- fitdist(x1, "burr", method="mle", start=list(shape1=1, shape2=1, rate=10)) diff --git a/tests/t-qmedist.R b/tests/t-qmedist.R index c2e0f80..3fabbfe 100644 --- a/tests/t-qmedist.R +++ b/tests/t-qmedist.R @@ -8,6 +8,7 @@ set.seed(1234) x1 <- rnorm(n=nsample) qmedist(x1, "norm", probs=c(1/3, 2/3)) +#fitted coef is -0.1486435 0.9892013, fitted loglik is -13.92195 # (2) defining your own distribution functions, here for the Gumbel # distribution for other distributions, see the CRAN task view dedicated @@ -17,6 +18,8 @@ dgumbel <- function(x, a, b) 1/b*exp((a-x)/b)*exp(-exp((a-x)/b)) qgumbel <- function(p, a, b) a - b*log(-log(p)) qmedist(x1, "gumbel", probs=c(1/3, 2/3), start=list(a=10,b=5)) +#fitted coef is -0.4935571 0.8557281, fitted loglik is -16.79828 + # (3) fit a discrete distribution (Poisson) # @@ -24,6 +27,8 @@ set.seed(1234) x2 <- rpois(n=nsample,lambda = 2) qmedist(x2, "pois", probs=1/2) +#fitted coef is 1.7, fitted loglik is -15.31626 + # (4) fit a finite-support distribution (beta) # @@ -31,6 +36,7 @@ set.seed(1234) x3 <- rbeta(n=nsample, shape1=5, shape2=10) qmedist(x3, "beta", probs=c(1/3, 2/3)) +#fitted coef is 4.010999 8.397853, fitted loglik is 6.642124 # (5) fit frequency distributions on USArrests dataset. # @@ -39,6 +45,8 @@ x4 <- USArrests$Assault qmedist(x4, "pois", probs=1/2) qmedist(x4, "nbinom", probs=c(1/3, 2/3)) +#fitted coef is 2.518966 182.313344, fitted loglik is 292.5969 + # (6) normal mixture # @@ -65,7 +73,8 @@ fit2 <- qmedist(x, "norm2", probs=c(1/6, 1/4, 1/3, 1/2, 2/3), start=list(poid=1/3, m1=4, s1=2, m2=8, s2=2), lower=c(0, 0, 0, 0, 0), upper=c(1/2, Inf, Inf, Inf, Inf)) - +fit2 +#fitted coef is 0.3433528 4.2872449 0.3891135 9.2598612 1.7948554, fitted loglik is -38.14106 # (7) test error messages # @@ -106,14 +115,11 @@ x <- rexp(nsample) f3 <- qmedist(x, "exp", probs=1/2) w4 <- c(rep(1, nsample/2), round(sqrt(1:(nsample/2)))) f4 <- qmedist(x, "exp", weights=w4, probs=1/2) -f3$estimate -f4$estimate +c(f3$estimate, f4$estimate) -f3$loglik -f4$loglik +c(f3$loglik, f4$loglik) -median(x) -median(tail(x, 50)) +#fitted coef is 0.4816191, fitted loglik is -16.95355 #try non integer weights @@ -128,3 +134,22 @@ qmedist(x, "norm", probs=1:2/3, control=list(maxit=2), start=list(mean=1e5, sd=1 x <- rnorm(nsample) qmedist(x, "norm", probs=1:2/3, optim.method="L-BFGS-B", lower=c(-Inf, 0)) #via optim qmedist(x, "norm", probs=1:2/3, optim.method="Nelder", lower=c(-Inf, 0)) #via constrOptim + + +# (11) large sample size issue + +if(FALSE) +{ + set.seed(123) + obs <- rlnorm(1e7, 3, 2) + for(i in 2:7) + cat(i, try(qmedist(obs[1:10^i], "lnorm", probs=1:2/3, control=list(trace=0, REPORT=1))$estimate, silent=TRUE), "\n") + + # 2 3.109257 1.767013 + # 3 3.022615 1.857885 + # 4 2.999376 1.978701 + # 5 3.003344 2.00941 + # 6 2.99881 2.001733 + # 7 2.999859 2.000227 +} +