Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Poor pooled estimates in simple case #15

Open
jeremyrcoyle opened this issue Dec 13, 2017 · 2 comments
Open

Poor pooled estimates in simple case #15

jeremyrcoyle opened this issue Dec 13, 2017 · 2 comments
Labels

Comments

@jeremyrcoyle
Copy link

The following code attempts to fit a marginal density using both pooled and unpooled condensier estimates by way of sl3. The true density is standard normal. The unpooled estimates (red) approximate the true density (blue), but the pooled estimates (green) do not.

options(sl3.verbose = FALSE)
library("condensier")
library("sl3")
library("simcausal")
library("ggplot2")
# BUG FIX: melt() is not provided by base R or any of the packages loaded
# above, so the script fails from a clean session. Load reshape2, which
# supplies melt.data.frame(), explicitly.
library("reshape2")


# Simulate n = 10000 draws of B ~ N(0, 1) together with a constant
# "covariate" I = 1, so the conditional density f(B | I) equals the
# marginal density of B.
D <- DAG.empty()
D <- D + node("I", distr = "rbern", prob = 1) +
     node("B", distr = "rnorm", mean = 0, sd = 1)

D <- set.DAG(D, n.test = 10)
datO <- sim(D, n = 10000, rndseed = 12345)

# ================================================================================
# Fit the same conditional-density learner twice: once per-bin (unpooled)
# and once pooling all bins into a single regression.
task <- sl3_Task$new(datO, covariates = c("I"), outcome = "B")
lrn_unpooled <- Lrnr_condensier$new(nbins = 25, bin_method = "equal.len", pool = FALSE,
                            bin_estimator = Lrnr_glm_fast$new(family = binomial()))
lrn_unpooled_fit <- lrn_unpooled$train(task)

lrn_pooled <- Lrnr_condensier$new(nbins = 25, bin_method = "equal.len", pool = TRUE,
                                    bin_estimator = Lrnr_glm_fast$new(family = binomial()))
lrn_pooled_fit <- lrn_pooled$train(task)


# ================================================================================


# Evaluate both fits on the training data and compare against the true
# standard-normal density dnorm(B).
datO$pred_fB_unpooled <- lrn_unpooled_fit$predict(task)
datO$pred_fB_pooled <- lrn_pooled_fit$predict(task)
datO$fB <- dnorm(datO$B)
long <- melt(datO, id = c("B"), measure = c("pred_fB_unpooled", "pred_fB_pooled", "fB"))
ggplot(long, aes(x = B, y = value, color = variable)) + geom_point() + theme_bw()

image

@wilsoncai1992
Copy link

I have also encountered a case where pool = TRUE tends to lead to weird results, holding all other conditions the same.

In my case, densities will not integrate to 1 when I fit a univariate density.

# Bimodal data: W1 ~ Bernoulli(0.5) shifts the mean of sA to +2 or -2,
# giving a 50/50 mixture of N(2, 1) and N(-2, 1).
library("simcausal")
D <- DAG.empty()
D <- D +
  node("W1", distr = "rbern", prob = 0.5) +
  node("sA.mu", distr = "rconst", const = 4*W1 - 2) +
  node("sA", distr = "rnorm", mean = sA.mu, sd = 1)
D <- set.DAG(D, n.test = 10)
datO <- sim(D, n = 1e4, rndseed = 12345)


library("condensier")
library("sl3")
library("origami")
# Constant covariate, so fitting f(x | dummy) recovers the marginal density.
dat_dummy <- data.frame(dummy = 1, x = datO$sA)
#
# TEST 1: good fit
# pool = FALSE
# glm
# ---------------------------------------------------------------------------------------
# Density-estimation task with 3-fold cross-validation.
task <- sl3_Task$new(dat_dummy, covariates = c("dummy"), outcome = "x",
                     folds = origami::make_folds(n = length(dat_dummy$x), V = 3))

# Candidate learners differ only in the number of bins.
candidates <- lapply(c(20, 100, 200), function(nb) {
  Lrnr_condensier$new(nbins = nb, bin_method = "equal.mass", pool = FALSE,
                      bin_estimator = speedglmR6$new())
})
learner_stack <- do.call(Stack$new, candidates)

# Cross-validate the stack, then refit the lowest-risk candidate on all data.
cv_fit <- Lrnr_cv$new(learner_stack)$train(task)
risks <- cv_fit$cv_risk(loss_squared_error)
best_fit <- candidates[[which.min(risks)]]$train(task)

p_hat <- empiricalDensity$new(best_fit$predict()[[1]], dat_dummy$x)

# True density: equal mixture of N(2, 1) and N(-2, 1).
foo2 <- function(x) {(.5*dnorm(x, mean = 2) + .5*dnorm(x, mean = -2))}
p_hat$display(foo2)

#
# TEST 2: bad fit
# pool = TRUE
# xgboost
# the pdf is not normalized
# ---------------------------------------------------------------------------------------
# Same 3-fold CV task as before.
task <- sl3_Task$new(dat_dummy, covariates = c("dummy"), outcome = "x",
                     folds = origami::make_folds(n = length(dat_dummy$x), V = 3))

# Candidate learners: pooled bins, xgboost bin estimator, varying bin counts.
candidates <- lapply(c(20, 100, 200), function(nb) {
  Lrnr_condensier$new(nbins = nb, bin_method = "equal.mass", pool = TRUE,
                      bin_estimator = Lrnr_xgboost$new(nrounds = 50, objective = "reg:logistic"))
})
learner_stack <- do.call(Stack$new, candidates)

# Cross-validate the stack, then refit the lowest-risk candidate on all data.
cv_fit <- Lrnr_cv$new(learner_stack)$train(task)
risks <- cv_fit$cv_risk(loss_squared_error)
best_fit <- candidates[[which.min(risks)]]$train(task)

p_hat <- empiricalDensity$new(best_fit$predict()[[1]], dat_dummy$x)

# True density: equal mixture of N(2, 1) and N(-2, 1).
foo2 <- function(x) {(.5*dnorm(x, mean = 2) + .5*dnorm(x, mean = -2))}
p_hat$display(foo2)

# p_hat$normalize()
# p_hat$display(foo2)

#
# TEST 3: good fit
# pool = FALSE
# xgboost
# ---------------------------------------------------------------------------------------
# Same 3-fold CV task as before.
task <- sl3_Task$new(dat_dummy, covariates = c("dummy"), outcome = "x",
                     folds = origami::make_folds(n = length(dat_dummy$x), V = 3))

# Candidate learners: unpooled bins, xgboost bin estimator, varying bin counts.
candidates <- lapply(c(20, 100, 200), function(nb) {
  Lrnr_condensier$new(nbins = nb, bin_method = "equal.mass", pool = FALSE,
                      bin_estimator = Lrnr_xgboost$new(nrounds = 50, objective = "reg:logistic"))
})
learner_stack <- do.call(Stack$new, candidates)

# Cross-validate the stack, then refit the lowest-risk candidate on all data.
cv_fit <- Lrnr_cv$new(learner_stack)$train(task)
risks <- cv_fit$cv_risk(loss_squared_error)
best_fit <- candidates[[which.min(risks)]]$train(task)

p_hat <- empiricalDensity$new(best_fit$predict()[[1]], dat_dummy$x)

# True density: equal mixture of N(2, 1) and N(-2, 1).
foo2 <- function(x) {(.5*dnorm(x, mean = 2) + .5*dnorm(x, mean = -2))}
p_hat$display(foo2)


#
# TEST 4: bad fit
# pool = TRUE
# glm
# due to glm not be able to smooth
# ---------------------------------------------------------------------------------------
# Same 3-fold CV task as before.
task <- sl3_Task$new(dat_dummy, covariates = c("dummy"), outcome = "x",
                     folds = origami::make_folds(n = length(dat_dummy$x), V = 3))

# Candidate learners: pooled bins, speedglm bin estimator, varying bin counts.
candidates <- lapply(c(20, 100, 200), function(nb) {
  Lrnr_condensier$new(nbins = nb, bin_method = "equal.mass", pool = TRUE,
                      bin_estimator = speedglmR6$new())
})
learner_stack <- do.call(Stack$new, candidates)

# Cross-validate the stack, then refit the lowest-risk candidate on all data.
cv_fit <- Lrnr_cv$new(learner_stack)$train(task)
risks <- cv_fit$cv_risk(loss_squared_error)
best_fit <- candidates[[which.min(risks)]]$train(task)

p_hat <- empiricalDensity$new(best_fit$predict()[[1]], dat_dummy$x)


# True density: equal mixture of N(2, 1) and N(-2, 1).
foo2 <- function(x) {(.5*dnorm(x, mean = 2) + .5*dnorm(x, mean = -2))}
p_hat$display(foo2)

@wilsoncai1992
Copy link

TEST1
test1

TEST2
test2

TEST3
test3

TEST4

test4

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

3 participants