From 90f68289d2c057f5ba07e13c2e2ceb3275ba9b1d Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Thu, 24 Feb 2022 00:06:14 +0300 Subject: [PATCH 1/2] fixes for supporting 2d numpy arrays for predictions, grads and hess in multiclass custom objective --- python-package/lightgbm/basic.py | 31 ++++++++++-------------- python-package/lightgbm/engine.py | 37 +++++++++++++---------------- python-package/lightgbm/plotting.py | 4 ++-- python-package/lightgbm/sklearn.py | 26 +++++++------------- 4 files changed, 39 insertions(+), 59 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index f1d20abd055e..ffbe160b6b15 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2751,7 +2751,7 @@ def trees_to_dataframe(self): - ``missing_direction`` : str, split direction that missing values should go to. ``None`` for leaf nodes. - ``missing_type`` : str, describes what types of values are treated as missing. - ``value`` : float64, predicted value for this leaf node, multiplied by the learning rate. - - ``weight`` : float64 or int64, sum of hessian (second-order derivative of objective), summed over observations that fall in this node. + - ``weight`` : float64 or int64, sum of Hessian (second-order derivative of objective), summed over observations that fall in this node. - ``count`` : int64, number of records in the training data that fall into this node. Returns @@ -2960,7 +2960,7 @@ def update(self, train_set=None, fobj=None): The value of the second order derivative (Hessian) of the loss with respect to the elements of preds for each sample point. - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], and grad and hess should be returned in the same format. Returns @@ -2999,9 +2999,6 @@ def update(self, train_set=None, fobj=None): if not self.__set_objective_to_none: self.reset_parameter({"objective": "none"}).__set_objective_to_none = True grad, hess = fobj(self.__inner_predict(0), self.train_set) - if self.num_model_per_iteration() > 1: - grad = grad.ravel(order='F') - hess = hess.ravel(order='F') return self.__boost(grad, hess) def __boost(self, grad, hess): @@ -3011,7 +3008,7 @@ def __boost(self, grad, hess): Score is returned before any transformation, e.g. it is raw margin instead of probability of positive class for binary task. - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, + For multi-class task, score are numpy 2-D array of shape = [n_samples, n_classes], and grad and hess should be returned in the same format. Parameters @@ -3028,6 +3025,9 @@ def __boost(self, grad, hess): is_finished : bool Whether the boost was successfully finished. 
""" + if self.__num_class > 1: + grad = grad.ravel(order='F') + hess = hess.ravel(order='F') grad = list_to_1d_numpy(grad, name='gradient') hess = list_to_1d_numpy(hess, name='hessian') assert grad.flags.c_contiguous @@ -3035,12 +3035,11 @@ def __boost(self, grad, hess): if len(grad) != len(hess): raise ValueError(f"Lengths of gradient ({len(grad)}) and Hessian ({len(hess)}) don't match") num_train_data = self.train_set.num_data() - num_models = self.__num_class - if len(grad) != num_train_data * num_models: + if len(grad) != num_train_data * self.__num_class: raise ValueError( f"Lengths of gradient ({len(grad)}) and Hessian ({len(hess)}) " f"don't match training data length ({num_train_data}) * " - f"number of models per one iteration ({num_models})" + f"number of models per one iteration ({self.__num_class})" ) is_finished = ctypes.c_int(0) _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom( @@ -3148,8 +3147,9 @@ def eval(self, data, name, feval=None): Should accept two parameters: preds, eval_data, and return (eval_name, eval_result, is_higher_better) or list of such tuples. - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. If ``fobj`` is specified, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. eval_data : Dataset @@ -3161,9 +3161,6 @@ def eval(self, data, name, feval=None): is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, - and grad and hess should be returned in the same format. - Returns ------- result : list @@ -3198,6 +3195,7 @@ def eval_train(self, feval=None): preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. If ``fobj`` is specified, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. eval_data : Dataset @@ -3209,9 +3207,6 @@ def eval_train(self, feval=None): is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, - and grad and hess should be returned in the same format. - Returns ------- result : list @@ -3231,6 +3226,7 @@ def eval_valid(self, feval=None): preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. If ``fobj`` is specified, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. eval_data : Dataset @@ -3242,9 +3238,6 @@ def eval_valid(self, feval=None): is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, - and grad and hess should be returned in the same format. - Returns ------- result : list diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 2b8a630b1915..9b4ff70c217c 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -9,13 +9,12 @@ import numpy as np from . 
import callback -from .basic import (Booster, Dataset, LightGBMError, _ArrayLike, _choose_param_value, _ConfigAliases, _InnerPredictor, - _log_warning) +from .basic import Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, _log_warning from .compat import SKLEARN_INSTALLED, _LGBMGroupKFold, _LGBMStratifiedKFold _LGBM_CustomObjectiveFunction = Callable[ [np.ndarray, Dataset], - Tuple[_ArrayLike, _ArrayLike] + Tuple[np.ndarray, np.ndarray] ] _LGBM_CustomMetricFunction = Callable[ [np.ndarray, Dataset], @@ -56,30 +55,30 @@ def train( Should accept two parameters: preds, train_data, and return (grad, hess). - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. Predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task. train_data : Dataset The training dataset. - grad : list, numpy 1-D array or pandas Series + grad : numpy 1-D array or numpy 2-D array (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of preds for each sample point. - hess : list, numpy 1-D array or pandas Series + hess : numpy 1-D array or numpy 2-D array (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of preds for each sample point. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i] - and you should group grad and hess in this way as well. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. feval : callable, list of callable, or None, optional (default=None) Customized evaluation function. Each evaluation function should accept two parameters: preds, eval_data, and return (eval_name, eval_result, is_higher_better) or list of such tuples. - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. If ``fobj`` is specified, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. eval_data : Dataset @@ -91,8 +90,6 @@ def train( is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]. To ignore the default metric corresponding to the used objective, set the ``metric`` parameter to the string ``"None"`` in ``params``. init_model : str, pathlib.Path, Booster or None, optional (default=None) @@ -411,30 +408,30 @@ def cv(params, train_set, num_boost_round=100, Should accept two parameters: preds, train_data, and return (grad, hess). - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. Predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task. train_data : Dataset The training dataset. 
- grad : list, numpy 1-D array or pandas Series + grad : numpy 1-D array or numpy 2-D array (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of preds for each sample point. - hess : list, numpy 1-D array or pandas Series + hess : numpy 1-D array or numpy 2-D array (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of preds for each sample point. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i] - and you should group grad and hess in this way as well. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. feval : callable, list of callable, or None, optional (default=None) Customized evaluation function. Each evaluation function should accept two parameters: preds, eval_data, and return (eval_name, eval_result, is_higher_better) or list of such tuples. - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. If ``fobj`` is specified, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. eval_data : Dataset @@ -446,8 +443,6 @@ def cv(params, train_set, num_boost_round=100, is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]. To ignore the default metric corresponding to the used objective, set ``metrics`` to the string ``"None"``. init_model : str, pathlib.Path, Booster or None, optional (default=None) diff --git a/python-package/lightgbm/plotting.py b/python-package/lightgbm/plotting.py index eb625c8a1193..f7d35045d21f 100644 --- a/python-package/lightgbm/plotting.py +++ b/python-package/lightgbm/plotting.py @@ -556,7 +556,7 @@ def create_tree_digraph( - ``'internal_count'`` : number of records from the training data that fall into this non-leaf node - ``'internal_weight'`` : total weight of all nodes that fall into this non-leaf node - ``'leaf_count'`` : number of records from the training data that fall into this leaf node - - ``'leaf_weight'`` : total weight (sum of hessian) of all observations that fall into this leaf node + - ``'leaf_weight'`` : total weight (sum of Hessian) of all observations that fall into this leaf node - ``'data_percentage'`` : percentage of training data that fall into this node precision : int or None, optional (default=3) Used to restrict the display of floating point values to a certain precision. 
@@ -649,7 +649,7 @@ def plot_tree( - ``'internal_count'`` : number of records from the training data that fall into this non-leaf node - ``'internal_weight'`` : total weight of all nodes that fall into this non-leaf node - ``'leaf_count'`` : number of records from the training data that fall into this leaf node - - ``'leaf_weight'`` : total weight (sum of hessian) of all observations that fall into this leaf node + - ``'leaf_weight'`` : total weight (sum of Hessian) of all observations that fall into this leaf node - ``'data_percentage'`` : percentage of training data that fall into this node precision : int or None, optional (default=3) Used to restrict the display of floating point values to a certain precision. diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 2d401fc526b8..77bf3ffedd1e 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -6,7 +6,7 @@ import numpy as np -from .basic import Booster, Dataset, LightGBMError, _ArrayLike, _choose_param_value, _ConfigAliases, _log_warning +from .basic import Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _log_warning from .callback import record_evaluation from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray, _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase, @@ -19,11 +19,11 @@ _LGBM_ScikitCustomObjectiveFunction = Union[ Callable[ [np.ndarray, np.ndarray], - Tuple[_ArrayLike, _ArrayLike] + Tuple[np.ndarray, np.ndarray] ], Callable[ [np.ndarray, np.ndarray, np.ndarray], - Tuple[_ArrayLike, _ArrayLike] + Tuple[np.ndarray, np.ndarray] ], ] _LGBM_ScikitCustomEvalFunction = Union[ @@ -72,13 +72,13 @@ def __init__(self, func: _LGBM_ScikitCustomObjectiveFunction): grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of y_pred for each sample point. - hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task) + hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of y_pred for each sample point. .. note:: - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, + For multi-class task, y_pred are numpy 2-D array of shape = [n_samples, n_classes], and grad and hess should be returned in the same format. """ self.func = func @@ -95,10 +95,10 @@ def __call__(self, preds, dataset): Returns ------- - grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task) + grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of preds for each sample point. - hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task) + hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of preds for each sample point. 
""" @@ -162,11 +162,6 @@ def __init__(self, func: _LGBM_ScikitCustomEvalFunction): The eval result. is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - - .. note:: - - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, - and grad and hess should be returned in the same format. """ self.func = func @@ -297,9 +292,6 @@ def __call__(self, preds, dataset): The eval result. is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, - and grad and hess should be returned in the same format. """ _lgbmmodel_doc_predict = ( @@ -415,7 +407,7 @@ def __init__( min_split_gain : float, optional (default=0.) Minimum loss reduction required to make a further partition on a leaf node of the tree. min_child_weight : float, optional (default=1e-3) - Minimum sum of instance weight (hessian) needed in a child (leaf). + Minimum sum of instance weight (Hessian) needed in a child (leaf). min_child_samples : int, optional (default=20) Minimum number of data needed in a child (leaf). subsample : float, optional (default=1.) @@ -473,7 +465,7 @@ def __init__( The value of the second order derivative (Hessian) of the loss with respect to the elements of y_pred for each sample point. - For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array, + For multi-class task, y_pred are numpy 2-D array of shape = [n_samples, n_classes], and grad and hess should be returned in the same format. """ if not SKLEARN_INSTALLED: From b8e10387ea9fa9de9e6e6c4b805fbc42c0f0ea27 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Thu, 24 Feb 2022 01:39:00 +0300 Subject: [PATCH 2/2] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Morales --- python-package/lightgbm/sklearn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 77bf3ffedd1e..7ebba0bc962c 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -78,7 +78,7 @@ def __init__(self, func: _LGBM_ScikitCustomObjectiveFunction): .. note:: - For multi-class task, y_pred are numpy 2-D array of shape = [n_samples, n_classes], + For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes], and grad and hess should be returned in the same format. """ self.func = func @@ -465,7 +465,7 @@ def __init__( The value of the second order derivative (Hessian) of the loss with respect to the elements of y_pred for each sample point. - For multi-class task, y_pred are numpy 2-D array of shape = [n_samples, n_classes], + For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes], and grad and hess should be returned in the same format. """ if not SKLEARN_INSTALLED: