microsoft · jameslamb · May 17, 2022 · Apr 1, 2022 · Apr 1, 2022 · Apr 5, 2022
@@ -288,6 +288,10 @@ Dataset <- R6::R6Class(
         self$set_colnames(colnames = private$colnames)
       }
 
+      # If the data didn't have feature names, take the ones defined on the C++ side;
+      # otherwise, the existing names are simply overwritten.
+      self$get_colnames()
 return(list(NULL, x$get_colnames())) 
 return(list(NULL, x$get_colnames())) 
+
       # Load init score if requested
       if (!is.null(private$predictor) && is.null(private$used_indices)) {
 
@@ -381,6 +385,12 @@ Dataset <- R6::R6Class(
       if (lgb.is.null.handle(x = private$handle)) {
         stop("Cannot get number of bins in feature before constructing Dataset.")
       }
+      if (is.character(feature)) {
+        feature <- which(colnames(self) == feature)
+        if (length(feature) == 0L) {
+          stop("feature not found")
+        }
+      }
       num_bin <- integer(1L)
       .Call(
         LGBM_DatasetGetFeatureNumBin_R

@@ -547,4 +547,15 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
   )
   actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin)
   expect_identical(actual_num_bins, expected_num_bins)
+  # test using defined feature names
+  bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin)
+  expect_identical(unname(bins_by_name), expected_num_bins)
+  # test using default feature names
+  no_names_mat <- raw_mat
+  colnames(no_names_mat) <- NULL
+  ds_no_names <- lgb.Dataset(no_names_mat, params = list(min_data_in_bin = min_data_in_bin))
+  ds_no_names$construct()
+  default_names <- lapply(seq(1L, ncol(raw_mat)), function(i) sprintf("Column_%d", i - 1L))
+  bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin)
+  expect_identical(bins_by_default_name, expected_num_bins)
 })
@@ -1821,6 +1821,7 @@ def construct(self):
                                 feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params)
             if self.free_raw_data:
                 self.data = None
+            self.feature_name = self.get_feature_name()
         return self
 
     def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None):
@@ -2386,20 +2387,22 @@ def num_feature(self):
         else:
             raise LightGBMError("Cannot get num_feature before construct dataset")
 
-    def feature_num_bin(self, feature: int) -> int:
+    def feature_num_bin(self, feature: Union[int, str]) -> int:
         """Get the number of bins for a feature.
 
         Parameters
         ----------
-        feature : int
-            Index of the feature.
+        feature : int or str
+            Index or name of the feature.
 
         Returns
         -------
         number_of_bins : int
             The number of constructed bins for the feature in the Dataset.
         """
         if self.handle is not None:
+            if isinstance(feature, str):
+                feature = self.feature_name.index(feature)
             ret = ctypes.c_int(0)
             _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle,
                                                          ctypes.c_int(feature),

@@ -634,7 +634,9 @@ def test_feature_num_bin(min_data_in_bin):
         np.array([1, 2] * 49 + 2 * [np.nan]),
         np.zeros(100),
     ]).T
-    ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct()
+    feature_name = [f'x{i}' for i in range(X.shape[1])]
+    ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}, feature_name=feature_name)
+    ds.construct()
     expected_num_bins = [
         100 // min_data_in_bin + 1,  # extra bin for zero
         3,  # 0, 1, 2
@@ -644,6 +646,15 @@ def test_feature_num_bin(min_data_in_bin):
     ]
     actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
     assert actual_num_bins == expected_num_bins
+    # test using defined feature names
+    bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
+    assert bins_by_name == expected_num_bins
+    # test using default feature names
+    ds_no_names = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin})
+    ds_no_names.construct()
+    default_names = [f'Column_{i}' for i in range(X.shape[1])]
+    bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
+    assert bins_by_default_name == expected_num_bins
 
 
 def test_feature_num_bin_with_max_bin_by_feature():