Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python-package][R-package] allow using feature names when retrieving number of bins #5116

Merged
merged 10 commits into from
May 17, 2022
14 changes: 14 additions & 0 deletions R-package/R/lgb.Dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,13 @@ Dataset <- R6::R6Class(
self$set_colnames(colnames = private$colnames)
}

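# Ensure that private$colnames matches the feature names on the C++ side. This call is necessary
# in cases like constructing from a file or from a matrix with no column names, where the names
# held on the C++ side (e.g. the defaults "Column_0", "Column_1", ...) were never set from R.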
private$colnames <- .Call(
LGBM_DatasetGetFeatureNames_R
, private$handle
)

# Load init score if requested
if (!is.null(private$predictor) && is.null(private$used_indices)) {

Expand Down Expand Up @@ -381,6 +388,13 @@ Dataset <- R6::R6Class(
if (lgb.is.null.handle(x = private$handle)) {
stop("Cannot get number of bins in feature before constructing Dataset.")
}
if (is.character(feature)) {
feature_name <- feature
feature <- which(private$colnames == feature_name)
if (length(feature) == 0L) {
stop(sprintf("feature '%s' not found", feature_name))
}
}
num_bin <- integer(1L)
.Call(
LGBM_DatasetGetFeatureNumBin_R
Expand Down
37 changes: 32 additions & 5 deletions R-package/tests/testthat/test_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -533,20 +533,47 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
, three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0)
, two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_)
, all_zero = rep(0.0, 100L)
, categorical = sample.int(2L, 100L, replace = TRUE)
)
n_features <- ncol(raw_df)
raw_mat <- data.matrix(raw_df)
min_data_in_bin <- 2L
ds <- lgb.Dataset(raw_mat, params = list(min_data_in_bin = min_data_in_bin))
ds <- lgb.Dataset(
raw_mat
, params = list(min_data_in_bin = min_data_in_bin)
, categorical_feature = n_features
)
ds$construct()
expected_num_bins <- c(
100L %/% min_data_in_bin + 1L # extra bin for zero
, 3L # 0, 1, 2
, 3L # 0, 1, 2
, 4L # 0, 1, 2 + NA
, 0L # unused
, 3L # 1, 2 + NA
)
actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin)
actual_num_bins <- sapply(1L:n_features, ds$get_feature_num_bin)
expect_identical(actual_num_bins, expected_num_bins)
# test using defined feature names
bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin)
expect_identical(unname(bins_by_name), expected_num_bins)
# test using default feature names
no_names_mat <- raw_mat
colnames(no_names_mat) <- NULL
ds_no_names <- lgb.Dataset(
no_names_mat
, params = list(min_data_in_bin = min_data_in_bin)
, categorical_feature = n_features
)
ds_no_names$construct()
default_names <- lapply(
X = seq(1L, ncol(raw_mat))
, FUN = function(i) {
sprintf("Column_%d", i - 1L)
}
)
bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin)
expect_identical(bins_by_default_name, expected_num_bins)
})

test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
Expand All @@ -555,9 +582,9 @@ test_that("lgb.Dataset can be constructed with categorical features and without
ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct()
sparse_mat <- as(raw_mat, "dgCMatrix")
ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct()
# check that the column names are NULL
expect_null(ds$.__enclos_env__$private$colnames)
expect_null(ds2$.__enclos_env__$private$colnames)
# check that the column names are the default ones
expect_equal(ds$.__enclos_env__$private$colnames, "Column_0")
expect_equal(ds2$.__enclos_env__$private$colnames, "Column_0")
# check for error when index is greater than the number of columns
expect_error({
lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
Expand Down
9 changes: 6 additions & 3 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1817,6 +1817,7 @@ def construct(self):
feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params)
if self.free_raw_data:
self.data = None
self.feature_name = self.get_feature_name()
return self

def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None):
Expand Down Expand Up @@ -2382,20 +2383,22 @@ def num_feature(self):
else:
raise LightGBMError("Cannot get num_feature before construct dataset")

def feature_num_bin(self, feature: int) -> int:
def feature_num_bin(self, feature: Union[int, str]) -> int:
"""Get the number of bins for a feature.

Parameters
----------
feature : int
Index of the feature.
feature : int or str
Index or name of the feature.

Returns
-------
number_of_bins : int
The number of constructed bins for the feature in the Dataset.
"""
if self.handle is not None:
if isinstance(feature, str):
feature = self.feature_name.index(feature)
jmoralez marked this conversation as resolved.
Show resolved Hide resolved
ret = ctypes.c_int(0)
_safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle,
ctypes.c_int(feature),
Expand Down
18 changes: 17 additions & 1 deletion tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,17 +663,33 @@ def test_feature_num_bin(min_data_in_bin):
np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
np.random.choice([0, 1], 100),
]).T
ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct()
n_continuous = X.shape[1] - 1
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
ds_kwargs = dict(
params={'min_data_in_bin': min_data_in_bin},
categorical_feature=[n_continuous], # last feature
)
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
expected_num_bins = [
100 // min_data_in_bin + 1, # extra bin for zero
3, # 0, 1, 2
3, # 0, 1, 2
4, # 0, 1, 2 + nan
0, # unused
3, # 0, 1 + nan
]
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
assert actual_num_bins == expected_num_bins
# test using defined feature names
bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
assert bins_by_name == expected_num_bins
# test using default feature names
ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
default_names = [f'Column_{i}' for i in range(X.shape[1])]
bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
assert bins_by_default_name == expected_num_bins
# check for feature indices outside of range
num_features = X.shape[1]
with pytest.raises(
Expand Down