Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[R-package] factor out {ggplot2} #3224

Merged
merged 6 commits into from
Jul 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/test_r_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then
exit -1
fi

ALLOWED_CHECK_NOTES=3
ALLOWED_CHECK_NOTES=2
NUM_CHECK_NOTES=$(
cat ${LOG_FILE_NAME} \
| grep -e '^Status: .* NOTE.*' \
Expand Down
2 changes: 1 addition & 1 deletion .ci/test_r_package_windows.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ if ($env:COMPILER -ne "MSVC") {
$note_str = Get-Content -Path "${LOG_FILE_NAME}" | Select-String -Pattern '.*Status.* NOTE' | Out-String ; Check-Output $?
$relevant_line = $note_str -match '(\d+) NOTE'
$NUM_CHECK_NOTES = $matches[1]
$ALLOWED_CHECK_NOTES = 3
$ALLOWED_CHECK_NOTES = 2
if ([int]$NUM_CHECK_NOTES -gt $ALLOWED_CHECK_NOTES) {
Write-Output "Found ${NUM_CHECK_NOTES} NOTEs from R CMD check. Only ${ALLOWED_CHECK_NOTES} are allowed"
Check-Output $False
Expand Down
1 change: 0 additions & 1 deletion R-package/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ BugReports: https://github.com/Microsoft/LightGBM/issues
NeedsCompilation: yes
Biarch: false
Suggests:
ggplot2 (>= 1.0.1),
processx,
testthat
Depends:
Expand Down
140 changes: 82 additions & 58 deletions R-package/demo/leaf_stability.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,89 @@
# Obviously, we are in a controlled environment, without issues (real rules).
# Do not do this in a real scenario.

# First, we load our libraries
library(lightgbm)
library(ggplot2)

# Second, we load our data
# define helper functions for creating plots

# output of `RColorBrewer::brewer.pal(10, "RdYlGn")`, hardcooded here to avoid a dependency
jameslamb marked this conversation as resolved.
Show resolved Hide resolved
.diverging_palette <- c(
"#A50026", "#D73027", "#F46D43", "#FDAE61", "#FEE08B"
, "#D9EF8B", "#A6D96A", "#66BD63", "#1A9850", "#006837"
)

.prediction_depth_plot <- function(df) {
plot(
x = df$X
, y = df$Y
, type = "p"
, main = "Prediction Depth"
, xlab = "Leaf Bin"
, ylab = "Prediction Probability"
, pch = 19L
, col = .diverging_palette[df$binned + 1L]
)
legend(
"topright"
, title = "bin"
, legend = sort(unique(df$binned))
, pch = 19L
, col = .diverging_palette[sort(unique(df$binned + 1L))]
, cex = 0.7
)
}

.prediction_depth_spread_plot <- function(df) {
plot(
x = df$binned
, xlim = c(0L, 9L)
, y = df$Z
, type = "p"
, main = "Prediction Depth Spread"
, xlab = "Leaf Bin"
, ylab = "Logloss"
, pch = 19L
, col = .diverging_palette[df$binned + 1L]
)
legend(
"topright"
, title = "bin"
, legend = sort(unique(df$binned))
, pch = 19L
, col = .diverging_palette[sort(unique(df$binned + 1L))]
, cex = 0.7
)
}

.depth_density_plot <- function(df) {
plot(
x = density(df$Y)
, xlim = c(min(df$Y), max(df$Y))
, type = "p"
, main = "Depth Density"
, xlab = "Prediction Probability"
, ylab = "Bin Density"
, pch = 19L
, col = .diverging_palette[df$binned + 1L]
)
legend(
"topright"
, title = "bin"
, legend = sort(unique(df$binned))
, pch = 19L
, col = .diverging_palette[sort(unique(df$binned + 1L))]
, cex = 0.7
)
}

# load some data
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
data(agaricus.test, package = "lightgbm")
test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)

# Third, we setup parameters and we train a model
# setup parameters and we train a model
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(
Expand Down Expand Up @@ -59,33 +129,16 @@ new_data$binned <- .bincode(
, include.lowest = TRUE
)
new_data$binned[is.na(new_data$binned)] <- 0L
new_data$binned <- as.factor(new_data$binned)

# We can check the binned content
table(new_data$binned)

# We can plot the binned content
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
# On the third plot, it is smooth!
ggplot(
data = new_data
, mapping = aes(x = X, y = Y, color = binned)
) + geom_point() +
theme_bw() +
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(
data = new_data
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
) + geom_boxplot() +
theme_bw() +
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(
data = new_data
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")

.prediction_depth_plot(df = new_data)
.prediction_depth_spread_plot(df = new_data)
.depth_density_plot(df = new_data)

# Now, let's show with other parameters
model2 <- lgb.train(
Expand Down Expand Up @@ -126,7 +179,6 @@ new_data2$binned <- .bincode(
, include.lowest = TRUE
)
new_data2$binned[is.na(new_data2$binned)] <- 0L
new_data2$binned <- as.factor(new_data2$binned)

# We can check the binned content
table(new_data2$binned)
Expand All @@ -136,25 +188,9 @@ table(new_data2$binned)
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are
# real thus it is not an issue
# However, if the rules were not true, the loss would explode.
ggplot(
data = new_data2
, mapping = aes(x = X, y = Y, color = binned)
) + geom_point() +
theme_bw() +
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(
data = new_data2
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
) + geom_boxplot() +
theme_bw() +
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(
data = new_data2
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")

.prediction_depth_plot(df = new_data2)
.prediction_depth_spread_plot(df = new_data2)
.depth_density_plot(df = new_data2)

# Now, try with very severe overfitting
model3 <- lgb.train(
Expand Down Expand Up @@ -195,7 +231,6 @@ new_data3$binned <- .bincode(
, include.lowest = TRUE
)
new_data3$binned[is.na(new_data3$binned)] <- 0L
new_data3$binned <- as.factor(new_data3$binned)

# We can check the binned content
table(new_data3$binned)
Expand All @@ -204,18 +239,7 @@ table(new_data3$binned)
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules
# are real thus it is not an issue.
# However, if the rules were not true, the loss would explode. See the sudden spikes?
ggplot(
data = new_data3
, mapping = aes(x = Y, y = ..count.., fill = binned)
) +
geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
.depth_density_plot(df = new_data3)

# Compare with our second model, the difference is severe. This is smooth.
ggplot(
data = new_data2
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
.depth_density_plot(df = new_data2)