From 22b49d9e001cd738f507c7c01c8c212585fb32af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Thu, 31 Mar 2022 20:19:32 -0600 Subject: [PATCH 1/8] allow using feature names when retrieving number of bins --- R-package/R/lgb.Dataset.R | 6 ++++++ R-package/tests/testthat/test_dataset.R | 2 ++ python-package/lightgbm/basic.py | 8 +++++--- tests/python_package_test/test_basic.py | 6 +++++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 191ca80db379..0fa86d285050 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -381,6 +381,12 @@ Dataset <- R6::R6Class( if (lgb.is.null.handle(x = private$handle)) { stop("Cannot get number of bins in feature before constructing Dataset.") } + if (is.character(feature)) { + feature <- which(colnames(self) == feature) + if (length(feature) == 0L) { + stop("feature not found") + } + } num_bin <- integer(1L) .Call( LGBM_DatasetGetFeatureNumBin_R diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 7e3778f44505..534c4e9330bc 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -547,4 +547,6 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { ) actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin) expect_identical(actual_num_bins, expected_num_bins) + bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin) + expect_identical(bins_by_name, expected_num_bins) }) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index aae331028a03..b4bcc7b20967 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2386,13 +2386,13 @@ def num_feature(self): else: raise LightGBMError("Cannot get num_feature before construct dataset") - def feature_num_bin(self, feature: int) -> int: + def feature_num_bin(self, feature: Union[int, str]) -> int: """Get the number of bins for a feature. Parameters ---------- - feature : int - Index of the feature. + feature : int or str + Index or name of the feature. Returns ------- @@ -2400,6 +2400,8 @@ def feature_num_bin(self, feature: int) -> int: The number of constructed bins for the feature in the Dataset. """ if self.handle is not None: + if isinstance(feature, str): + feature = self.feature_name.index(feature) ret = ctypes.c_int(0) _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle, ctypes.c_int(feature), diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 09946c93178b..ea947c08f738 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -634,7 +634,9 @@ def test_feature_num_bin(min_data_in_bin): np.array([1, 2] * 49 + 2 * [np.nan]), np.zeros(100), ]).T - ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct() + feature_name = [f'x{i}' for i in range(X.shape[1])] + ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}, feature_name=feature_name) + ds.construct() expected_num_bins = [ 100 // min_data_in_bin + 1, # extra bin for zero 3, # 0, 1, 2 @@ -644,6 +646,8 @@ def test_feature_num_bin(min_data_in_bin): ] actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])] assert actual_num_bins == expected_num_bins + bins_by_name = [ds.feature_num_bin(name) for name in feature_name] + assert bins_by_name == expected_num_bins def test_feature_num_bin_with_max_bin_by_feature(): From b52c2b8e25c5c89b547e1cf1067d4727025ea097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Thu, 31 Mar 2022 20:50:46 -0600 Subject: [PATCH 2/8] unname vector --- R-package/tests/testthat/test_dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 534c4e9330bc..f1e20238da58 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -548,5 +548,5 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin) expect_identical(actual_num_bins, expected_num_bins) bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin) - expect_identical(bins_by_name, expected_num_bins) + expect_identical(unname(bins_by_name), expected_num_bins) }) From 87b57ca6558bd42b1aea8b437fae18dad15f1411 Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Tue, 5 Apr 2022 16:25:17 -0500 Subject: [PATCH 3/8] use default feature names when not defined --- R-package/R/lgb.Dataset.R | 4 ++++ R-package/tests/testthat/test_dataset.R | 9 +++++++++ python-package/lightgbm/basic.py | 1 + tests/python_package_test/test_basic.py | 7 +++++++ 4 files changed, 21 insertions(+) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 0fa86d285050..e80b05cca5bc 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -288,6 +288,10 @@ Dataset <- R6::R6Class( self$set_colnames(colnames = private$colnames) } + # If the data didn't have feature names we take the ones defined at cpp side + # otherwise we just overwrite them + self$get_colnames() + # Load init score if requested if (!is.null(private$predictor) && is.null(private$used_indices)) { diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index f1e20238da58..5eb5e602bb6d 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -547,6 +547,15 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { ) actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin) expect_identical(actual_num_bins, expected_num_bins) + # test using defined feature names bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin) expect_identical(unname(bins_by_name), expected_num_bins) + # test using default feature names + no_names_mat <- raw_mat + colnames(no_names_mat) <- NULL + ds_no_names <- lgb.Dataset(no_names_mat, params = list(min_data_in_bin = min_data_in_bin)) + ds_no_names$construct() + default_names <- lapply(seq(1L, ncol(raw_mat)), function(i) sprintf("Column_%d", i - 1)) + bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin) + expect_identical(bins_by_default_name, expected_num_bins) }) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b4bcc7b20967..0b7893d89035 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1821,6 +1821,7 @@ def construct(self): feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params) if self.free_raw_data: self.data = None + self.feature_name = self.get_feature_name() return self def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None): diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index ea947c08f738..4fa2c1b17701 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -646,8 +646,15 @@ def test_feature_num_bin(min_data_in_bin): ] actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])] assert actual_num_bins == expected_num_bins + # test using defined feature names bins_by_name = [ds.feature_num_bin(name) for name in feature_name] assert bins_by_name == expected_num_bins + # test using default feature names + ds_no_names = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}) + ds_no_names.construct() + default_names = [f'Column_{i}' for i in range(X.shape[1])] + bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names] + assert bins_by_default_name == expected_num_bins def test_feature_num_bin_with_max_bin_by_feature(): From fa00975b44df273ec7d3e340faa388a51b91c9bd Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Tue, 5 Apr 2022 16:37:31 -0500 Subject: [PATCH 4/8] lint --- R-package/tests/testthat/test_dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 5eb5e602bb6d..a7d203072865 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -555,7 +555,7 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { colnames(no_names_mat) <- NULL ds_no_names <- lgb.Dataset(no_names_mat, params = list(min_data_in_bin = min_data_in_bin)) ds_no_names$construct() - default_names <- lapply(seq(1L, ncol(raw_mat)), function(i) sprintf("Column_%d", i - 1)) + default_names <- lapply(seq(1L, ncol(raw_mat)), function(i) sprintf("Column_%d", i - 1L)) bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin) expect_identical(bins_by_default_name, expected_num_bins) }) From fe6c9b4b6cf0f60af366dc402acaad9c182ceb31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 25 Apr 2022 22:09:57 -0500 Subject: [PATCH 5/8] apply suggestions --- R-package/R/lgb.Dataset.R | 5 +++-- R-package/tests/testthat/test_dataset.R | 7 ++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index e80b05cca5bc..dd905cefd100 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -386,9 +386,10 @@ Dataset <- R6::R6Class( stop("Cannot get number of bins in feature before constructing Dataset.") } if (is.character(feature)) { - feature <- which(colnames(self) == feature) + feature_name <- feature + feature <- which(private$colnames == feature_name) if (length(feature) == 0L) { - stop("feature not found") + stop(sprintf("feature '%s' not found", feature_name)) } } num_bin <- integer(1L) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index a7d203072865..9769be6747ae 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -555,7 +555,12 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { colnames(no_names_mat) <- NULL ds_no_names <- lgb.Dataset(no_names_mat, params = list(min_data_in_bin = min_data_in_bin)) ds_no_names$construct() - default_names <- lapply(seq(1L, ncol(raw_mat)), function(i) sprintf("Column_%d", i - 1L)) + default_names <- lapply( + X = seq(1L, ncol(raw_mat)), + , FUN = function(i) { + sprintf("Column_%d", i - 1L) + } + ) bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin) expect_identical(bins_by_default_name, expected_num_bins) }) From bc5931f7f62765da7e92c5bcfcc19708ce07b9bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 26 Apr 2022 11:09:53 -0500 Subject: [PATCH 6/8] remove extra comma --- R-package/tests/testthat/test_dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 9769be6747ae..310b8abb967c 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -556,7 +556,7 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { ds_no_names <- lgb.Dataset(no_names_mat, params = list(min_data_in_bin = min_data_in_bin)) ds_no_names$construct() default_names <- lapply( - X = seq(1L, ncol(raw_mat)), + X = seq(1L, ncol(raw_mat)) , FUN = function(i) { sprintf("Column_%d", i - 1L) } From 912a6498389a9088c03f4ea2f1d21b28f36b3da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Fri, 13 May 2022 16:32:31 -0500 Subject: [PATCH 7/8] add test with categorical feature --- R-package/tests/testthat/test_dataset.R | 23 +++++++++++++++++------ tests/python_package_test/test_basic.py | 15 ++++++++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 8f145961ed14..7e91bb621afa 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -533,10 +533,16 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { , three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0) , two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_) , all_zero = rep(0.0, 100L) + , categorical = sample.int(2L, 100L, replace = TRUE) ) + n_features <- ncol(raw_df) raw_mat <- data.matrix(raw_df) min_data_in_bin <- 2L - ds <- lgb.Dataset(raw_mat, params = list(min_data_in_bin = min_data_in_bin)) + ds <- lgb.Dataset( + raw_mat + , params = list(min_data_in_bin = min_data_in_bin) + , categorical_feature = n_features + ) ds$construct() expected_num_bins <- c( 100L %/% min_data_in_bin + 1L # extra bin for zero @@ -544,8 +550,9 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { , 3L # 0, 1, 2 , 4L # 0, 1, 2 + NA , 0L # unused + , 3L # 1, 2 + NA ) - actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin) + actual_num_bins <- sapply(1L:n_features, ds$get_feature_num_bin) expect_identical(actual_num_bins, expected_num_bins) # test using defined feature names bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin) @@ -553,7 +560,11 @@ test_that("lgb.Dataset$get_feature_num_bin() works", { # test using default feature names no_names_mat <- raw_mat colnames(no_names_mat) <- NULL - ds_no_names <- lgb.Dataset(no_names_mat, params = list(min_data_in_bin = min_data_in_bin)) + ds_no_names <- lgb.Dataset( + no_names_mat + , params = list(min_data_in_bin = min_data_in_bin) + , categorical_feature = n_features + ) ds_no_names$construct() default_names <- lapply( X = seq(1L, ncol(raw_mat)) @@ -571,9 +582,9 @@ test_that("lgb.Dataset can be constructed with categorical features and without ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct() sparse_mat <- as(raw_mat, "dgCMatrix") ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct() - # check that the column names are NULL - expect_null(ds$.__enclos_env__$private$colnames) - expect_null(ds2$.__enclos_env__$private$colnames) + # check that the column names are the default ones + expect_equal(ds$.__enclos_env__$private$colnames, "Column_0") + expect_equal(ds2$.__enclos_env__$private$colnames, "Column_0") # check for error when index is greater than the number of columns expect_error({ lgb.Dataset(raw_mat, categorical_feature = 2L)$construct() diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 21e55d73e1a5..378bbbfe6078 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -663,16 +663,22 @@ def test_feature_num_bin(min_data_in_bin): np.array([0, 1, 2] * 33 + [0]), np.array([1, 2] * 49 + 2 * [np.nan]), np.zeros(100), + np.random.choice([0, 1], 100), ]).T - feature_name = [f'x{i}' for i in range(X.shape[1])] - ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}, feature_name=feature_name) - ds.construct() + n_continuous = X.shape[1] - 1 + feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1'] + ds_kwargs = dict( + params={'min_data_in_bin': min_data_in_bin}, + categorical_feature=[n_continuous], # last feature + ) + ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct() expected_num_bins = [ 100 // min_data_in_bin + 1, # extra bin for zero 3, # 0, 1, 2 3, # 0, 1, 2 4, # 0, 1, 2 + nan 0, # unused + 3, # 0, 1 + nan ] actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])] assert actual_num_bins == expected_num_bins @@ -680,8 +686,7 @@ def test_feature_num_bin(min_data_in_bin): bins_by_name = [ds.feature_num_bin(name) for name in feature_name] assert bins_by_name == expected_num_bins # test using default feature names - ds_no_names = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}) - ds_no_names.construct() + ds_no_names = lgb.Dataset(X, **ds_kwargs).construct() default_names = [f'Column_{i}' for i in range(X.shape[1])] bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names] assert bins_by_default_name == expected_num_bins From 302a6a17463d2170ecc3a86e71d4625870a32ac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 16 May 2022 11:39:15 -0500 Subject: [PATCH 8/8] make feature names sync more transparent --- R-package/R/lgb.Dataset.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 5236fd5a3257..a9cc3c870fc3 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -289,9 +289,12 @@ Dataset <- R6::R6Class( self$set_colnames(colnames = private$colnames) } - # If the data didn't have feature names we take the ones defined at cpp side - # otherwise we just overwrite them - self$get_colnames() + # Ensure that private$colnames matches the feature names on the C++ side. This line is necessary + # in cases like constructing from a file or from a matrix with no column names. + private$colnames <- .Call( + LGBM_DatasetGetFeatureNames_R + , private$handle + ) # Load init score if requested if (!is.null(private$predictor) && is.null(private$used_indices)) {