-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Poor pooled estimates in simple case #15
Labels
Comments
I also encountered a case where densities do not integrate to 1 when I fit a univariate density. # bimodal data
# Simulate bimodal data with simcausal:
#   W1 ~ Bernoulli(0.5); sA | W1 ~ Normal(4*W1 - 2, 1),
# so marginally sA is an equal mixture of N(-2, 1) and N(2, 1).
library("simcausal")
D <- DAG.empty()
D <-
  D + node("W1", distr = "rbern", prob = 0.5) +
  node("sA.mu", distr = "rconst", const = 4*W1 - 2) +
  node("sA", distr = "rnorm", mean = sA.mu, sd = 1)
D <- set.DAG(D, n.test = 10)
# Fixed seed so all four tests below see the same draw of 10,000 observations.
datO <- sim(D, n = 1e4, rndseed = 12345)
library("condensier")
library("sl3")
# Quoted for consistency with the other library() calls above.
library("origami")
# Constant "dummy" covariate: the conditional density of x given dummy is the
# marginal density of sA, which is what the tests try to recover.
dat_dummy <- data.frame(dummy = 1, x = datO$sA)
#
# ---------------------------------------------------------------------------------------
# TEST 1: good fit
# pool = FALSE, glm (speedglm) bin estimator
# ---------------------------------------------------------------------------------------
# One sl3 task over the dummy-covariate data, with 3-fold CV.
task <- sl3_Task$new(
  dat_dummy,
  covariates = c("dummy"),
  outcome = "x",
  folds = origami::make_folds(n = length(dat_dummy$x), V = 3)
)
# Candidate learners differ only in the number of equal-mass bins.
lrn_list <- lapply(c(20, 100, 200), function(k) {
  Lrnr_condensier$new(
    nbins = k, bin_method = "equal.mass", pool = FALSE,
    bin_estimator = speedglmR6$new()
  )
})
# Cross-validate the stacked candidates and score them with squared-error loss.
cv_fit <- Lrnr_cv$new(do.call(Stack$new, lrn_list))$train(task)
risks <- cv_fit$cv_risk(loss_squared_error)
# Refit the CV-selected candidate on the full data.
best_fit <- lrn_list[[which.min(risks)]]$train(task)
p_hat <- empiricalDensity$new(best_fit$predict()[[1]], dat_dummy$x)
# True marginal density: equal mixture of N(2, 1) and N(-2, 1).
foo2 <- function(x) 0.5 * dnorm(x, mean = 2) + 0.5 * dnorm(x, mean = -2)
p_hat$display(foo2)
#
# ---------------------------------------------------------------------------------------
# TEST 2: bad fit
# pool = TRUE, xgboost bin estimator
# The fitted pdf is not normalized.
# ---------------------------------------------------------------------------------------
task <- sl3_Task$new(
  dat_dummy,
  covariates = c("dummy"),
  outcome = "x",
  folds = origami::make_folds(n = length(dat_dummy$x), V = 3)
)
# Same xgboost bin estimator for each candidate; only nbins varies.
make_learner <- function(k) {
  Lrnr_condensier$new(
    nbins = k, bin_method = "equal.mass", pool = TRUE,
    bin_estimator = Lrnr_xgboost$new(nrounds = 50, objective = "reg:logistic")
  )
}
lrn_list <- lapply(c(20, 100, 200), make_learner)
# Cross-validate the stack, then refit the lowest-risk candidate on all data.
cv_fit <- Lrnr_cv$new(do.call(Stack$new, lrn_list))$train(task)
risks <- cv_fit$cv_risk(loss_squared_error)
best_fit <- lrn_list[[which.min(risks)]]$train(task)
p_hat <- empiricalDensity$new(best_fit$predict()[[1]], dat_dummy$x)
# True marginal density: equal mixture of N(2, 1) and N(-2, 1).
foo2 <- function(x) 0.5 * dnorm(x, mean = 2) + 0.5 * dnorm(x, mean = -2)
p_hat$display(foo2)
# p_hat$normalize()
# p_hat$display(foo2)
#
# ---------------------------------------------------------------------------------------
# TEST 3: good fit
# pool = FALSE, xgboost bin estimator
# ---------------------------------------------------------------------------------------
task <- sl3_Task$new(
  dat_dummy,
  covariates = c("dummy"),
  outcome = "x",
  folds = origami::make_folds(n = length(dat_dummy$x), V = 3)
)
# Build the three candidates (20 / 100 / 200 equal-mass bins), unpooled.
candidates <- lapply(c(20, 100, 200), function(nb) {
  Lrnr_condensier$new(
    nbins = nb, bin_method = "equal.mass", pool = FALSE,
    bin_estimator = Lrnr_xgboost$new(nrounds = 50, objective = "reg:logistic")
  )
})
lrn_list <- candidates
# CV-score the stack under squared-error loss, then refit the winner.
cv_stack <- Lrnr_cv$new(do.call(Stack$new, lrn_list))
cv_fit <- cv_stack$train(task)
risks <- cv_fit$cv_risk(loss_squared_error)
best_fit <- lrn_list[[which.min(risks)]]$train(task)
p_hat <- empiricalDensity$new(best_fit$predict()[[1]], dat_dummy$x)
# True marginal density: equal mixture of N(2, 1) and N(-2, 1).
foo2 <- function(x) 0.5 * dnorm(x, mean = 2) + 0.5 * dnorm(x, mean = -2)
p_hat$display(foo2)
#
# ---------------------------------------------------------------------------------------
# TEST 4: bad fit
# pool = TRUE, glm (speedglm) bin estimator
# Bad because glm cannot smooth across the pooled bins.
# ---------------------------------------------------------------------------------------
task <- sl3_Task$new(
  dat_dummy,
  covariates = c("dummy"),
  outcome = "x",
  folds = origami::make_folds(n = length(dat_dummy$x), V = 3)
)
# Pooled speedglm candidates over an increasing number of equal-mass bins.
lrn_list <- lapply(c(20, 100, 200), function(k) {
  Lrnr_condensier$new(
    nbins = k, bin_method = "equal.mass", pool = TRUE,
    bin_estimator = speedglmR6$new()
  )
})
# Cross-validate the stacked learners; pick and refit the lowest-risk one.
cv_fit <- Lrnr_cv$new(do.call(Stack$new, lrn_list))$train(task)
risks <- cv_fit$cv_risk(loss_squared_error)
best_fit <- lrn_list[[which.min(risks)]]$train(task)
p_hat <- empiricalDensity$new(best_fit$predict()[[1]], dat_dummy$x)
# True marginal density: equal mixture of N(2, 1) and N(-2, 1).
foo2 <- function(x) 0.5 * dnorm(x, mean = 2) + 0.5 * dnorm(x, mean = -2)
p_hat$display(foo2)
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The following code attempts to fit a marginal density using both pooled and unpooled
condensier
estimates by way of sl3
. The true density is a bimodal normal mixture. The unpooled estimates (red) approximate the true density (blue), but the pooled estimates (green) do not. The text was updated successfully, but these errors were encountered: