From a397d23f360c45e2f132848f6dbc0a0c57012386 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 11 Jun 2021 13:26:31 -0500 Subject: [PATCH 01/26] test_classifier working --- python-package/lightgbm/compat.py | 5 +++ python-package/lightgbm/dask.py | 47 +++++++++++++++++++++++++- tests/python_package_test/test_dask.py | 2 +- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 068a667b2603..931a966c2a06 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -125,6 +125,7 @@ class _LGBMRegressorBase: # type: ignore try: from dask import delayed from dask.array import Array as dask_Array + from dask.bag import from_sequence as dask_bag_from_sequence from dask.dataframe import DataFrame as dask_DataFrame from dask.dataframe import Series as dask_Series from dask.distributed import Client, default_client, wait @@ -146,6 +147,10 @@ class dask_Array: # type: ignore pass + def dask_bag_from_sequence(*args, **kwargs): # type: ignore + """Placeholder for ``dask.bag.from_sequence()``""" + pass + class dask_DataFrame: # type: ignore """Dummy class for dask.dataframe.DataFrame.""" diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 8dd96e980161..f84d4f6407ac 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -9,6 +9,7 @@ import socket from collections import defaultdict from copy import deepcopy +from functools import partial from typing import Any, Callable, Dict, List, Optional, Type, Union from urllib.parse import urlparse @@ -16,7 +17,7 @@ import scipy.sparse as ss from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call -from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, +from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_bag_from_sequence, dask_Array, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict @@ -59,6 +60,12 @@ def _find_random_open_port() -> int: return port +def _is_dask_sparse_array(arr: dask_Array) -> bool: + return isinstance( + arr._meta, + (ss.csc.csc_matrix, ss.csr.csr_matrix) + ) + def _concat(seq: List[_DaskPart]) -> _DaskPart: if isinstance(seq[0], np.ndarray): return np.concatenate(seq, axis=0) @@ -561,6 +568,44 @@ def _predict( **kwargs ).values elif isinstance(data, dask_Array): + # for multi-class classification with sparse matrices, pred_contrib predictions + # are returned as a list of sparse matrices (one per class) + if ( + getattr(model, "n_classes_", -1) > 2 + and _is_dask_sparse_array(data) + and pred_contrib + ): + bag = dask_bag_from_sequence(data.partitions) + def _combine_preds(preds_so_far, new_chunk): + #raise RuntimeError(preds_so_far) + for i in range(len(preds_so_far)): + preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]]) + return preds_so_far + + predict_fn = partial( + _predict_part, + model=model, + raw_score=False, + pred_proba=False, + pred_leaf=False, + pred_contrib=True, + ) + + def _predict_part_binop(_ignore, chunk, pred_fn): + return predict_fn(chunk.compute()) + + predict_part_binop = partial( + _predict_part_binop, + pred_fn=predict_fn + ) + + return bag.fold( + binop=predict_part_binop, + combine=_combine_preds, + 
initial=None + ) + + return data.map_blocks( _predict_part, model=model, diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index bee4c0f872bf..10c0bc93ed51 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -199,7 +199,7 @@ def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **k dy = da.from_array(y, chunks=chunk_size) dw = da.from_array(weights, chunk_size) else: - raise ValueError("Unknown output type '{output}'") + raise ValueError(f"Unknown output type '{output}'") return X, y, weights, None, dX, dy, dw, None From b90040fab9fae7ed5a974125dc81fa35095dc41d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 12 Jun 2021 23:29:18 -0500 Subject: [PATCH 02/26] adding tests --- tests/python_package_test/test_dask.py | 45 +++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 10c0bc93ed51..16b2e0f52204 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -198,6 +198,7 @@ def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **k dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csr_matrix) dy = da.from_array(y, chunks=chunk_size) dw = da.from_array(weights, chunk_size) + X = csr_matrix(X) else: raise ValueError(f"Unknown output type '{output}'") @@ -321,8 +322,10 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' -@pytest.mark.parametrize('output', data_output) -@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) +# @pytest.mark.parametrize('output', data_output) +# @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) +@pytest.mark.parametrize('output', ['scipy_csr_matrix']) +@pytest.mark.parametrize('task', ['multiclass-classification']) def test_classifier_pred_contrib(output, task, cluster): with Client(cluster) as client: X, y, w, _, dX, dy, dw, _ = _create_data( @@ -332,7 +335,8 @@ def test_classifier_pred_contrib(output, task, cluster): params = { "n_estimators": 10, - "num_leaves": 10 + "num_leaves": 50, + "deterministic": True } dask_classifier = lgb.DaskLGBMClassifier( @@ -348,6 +352,33 @@ def test_classifier_pred_contrib(output, task, cluster): local_classifier.fit(X, y, sample_weight=w) local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True) + # shape depends on whether it is binary or multiclass classification + num_features = dask_classifier.n_features_ + num_classes = dask_classifier.n_classes_ + if num_classes == 2: + expected_num_cols = num_features + 1 + else: + expected_num_cols = (num_features + 1) * num_classes + + # in the special case of multi-class classification using scipy sparse matrices, + # the output of `.predict(..., pred_contrib=True)` is a list of sparse matrices (one per class) + # + # since that case is so different than all other cases, check the relevant things here + # and then return early + if output == 'scipy_csr_matrix' and task == 'multiclass-classification': + assert isinstance(preds_with_contrib, list) + assert len(preds_with_contrib) == num_classes + assert len(preds_with_contrib) == len(local_preds_with_contrib) + for i in range(num_classes): + assert preds_with_contrib[i].shape[1] == num_classes + assert preds_with_contrib[i].shape == 
local_preds_with_contrib[i].shape + assert len(np.unique(preds_with_contrib[i][:, -1])) + # raw scores will probably be different, but at least check that all predicted classes are the same + pred_classes = np.argmax(np.array(preds_with_contrib[i].todense()), axis=1) + local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) + assert np.all(pred_classes == local_pred_classes) + return + if output == 'scipy_csr_matrix': preds_with_contrib = np.array(preds_with_contrib.todense()) @@ -363,14 +394,6 @@ def test_classifier_pred_contrib(output, task, cluster): assert node_uses_cat_col.sum() > 0 assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' - # shape depends on whether it is binary or multiclass classification - num_features = dask_classifier.n_features_ - num_classes = dask_classifier.n_classes_ - if num_classes == 2: - expected_num_cols = num_features + 1 - else: - expected_num_cols = (num_features + 1) * num_classes - # * shape depends on whether it is binary or multiclass classification # * matrix for binary classification is of the form [feature_contrib, base_value], # for multi-class it's [feat_contrib_class1, base_value_class1, feat_contrib_class2, base_value_class2, etc.] From 2a09151c1aa9c41f941670488032a34e6e1a541c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 13 Jun 2021 00:09:24 -0500 Subject: [PATCH 03/26] docs --- python-package/lightgbm/dask.py | 57 ++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index f84d4f6407ac..2fb8c466c0bf 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -60,12 +60,6 @@ def _find_random_open_port() -> int: return port -def _is_dask_sparse_array(arr: dask_Array) -> bool: - return isinstance( - arr._meta, - (ss.csc.csc_matrix, ss.csr.csr_matrix) - ) - def _concat(seq: List[_DaskPart]) -> _DaskPart: if isinstance(seq[0], np.ndarray): return np.concatenate(seq, axis=0) @@ -515,6 +509,40 @@ def _predict_part( return result +def _combine_preds( + preds_so_far: List[ss.spmatrix], + new_chunk: List[ss.spmatrix] +) -> List[ss.spmatrix]: + """Add a new set of pred_contib predictions to an existing list + + For multi-class classification models, calling ``.predict(X, pred_contrib=True)`` + returns a list of sparse matrices (one per class) when ``X`` is a sparse matrix. + + This function is used to combine the results of multiple such predict calls + executed in parallel on different partitions of data in a Dask Array. + + Parameters + ---------- + preds_so_far : list of sparse matrices + Current list of predictions. + new_chunk : list of sparse matrices + Predictions from an additional chunk of the data, to be appended to ``preds_so_far``. + """ + for i in range(len(preds_so_far)): + preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]]) + return preds_so_far + +def _predict_part_binop(_ignore: None, chunk: dask_Array, predict_function: Callable) -> List[ss.spmatrix]: + """Get predictions for one Dask Array + + This function exists for use in ``dask.bag.Bag.fold()``. + The ``_ignore`` argument is used to handle that fact that ``dask.bag.Bag.fold()`` + starts the folding process from an initial value and expects to be able to pass that + value as the first positional argument to the ``"binop"`` function provided. 
+ """ + return predict_function(chunk.compute()) + + def _predict( model: LGBMModel, data: _DaskMatrixLike, @@ -572,31 +600,24 @@ def _predict( # are returned as a list of sparse matrices (one per class) if ( getattr(model, "n_classes_", -1) > 2 - and _is_dask_sparse_array(data) + and isinstance(data._meta, ss.spmatrix) and pred_contrib ): bag = dask_bag_from_sequence(data.partitions) - def _combine_preds(preds_so_far, new_chunk): - #raise RuntimeError(preds_so_far) - for i in range(len(preds_so_far)): - preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]]) - return preds_so_far - predict_fn = partial( + predict_function = partial( _predict_part, model=model, raw_score=False, pred_proba=False, pred_leaf=False, pred_contrib=True, + **kwargs ) - def _predict_part_binop(_ignore, chunk, pred_fn): - return predict_fn(chunk.compute()) - predict_part_binop = partial( _predict_part_binop, - pred_fn=predict_fn + predict_function=predict_function ) return bag.fold( @@ -810,7 +831,7 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: description="Return the predicted value for each sample.", X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes], or (if multiclass and ``pred_contrib=True`` and using sparse inputs) a Dask Bag which evaluates to a list of n_classes sparse matrices", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" ) From 743727480c93cffd31de90f436a7d6ef3caaf890 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 13 Jun 2021 00:11:02 -0500 Subject: [PATCH 04/26] tests --- tests/python_package_test/test_dask.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 16b2e0f52204..095fc2def36d 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -322,10 +322,8 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' -# @pytest.mark.parametrize('output', data_output) -# @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) -@pytest.mark.parametrize('output', ['scipy_csr_matrix']) -@pytest.mark.parametrize('task', ['multiclass-classification']) +@pytest.mark.parametrize('output', data_output) +@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) def test_classifier_pred_contrib(output, task, cluster): with Client(cluster) as client: X, y, w, _, dX, dy, dw, _ = _create_data( From 058188bfe892b88dcdfbf48c2fc13f2a812dcafb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Jun 2021 16:50:36 -0500 Subject: [PATCH 05/26] revert unnecessary changes in tests --- python-package/lightgbm/dask.py | 2 +- tests/python_package_test/test_dask.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 2fb8c466c0bf..d13aaea9acb4 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -513,7 +513,7 @@ def _combine_preds( preds_so_far: 
List[ss.spmatrix], new_chunk: List[ss.spmatrix] ) -> List[ss.spmatrix]: - """Add a new set of pred_contib predictions to an existing list + """Add a new set of pred_contrib predictions to an existing list For multi-class classification models, calling ``.predict(X, pred_contrib=True)`` returns a list of sparse matrices (one per class) when ``X`` is a sparse matrix. diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 095fc2def36d..8832d52d436e 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -333,8 +333,7 @@ def test_classifier_pred_contrib(output, task, cluster): params = { "n_estimators": 10, - "num_leaves": 50, - "deterministic": True + "num_leaves": 10 } dask_classifier = lgb.DaskLGBMClassifier( From 8e7df9a759317351f1472caebfa6ee02ab32d6d3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Jun 2021 17:00:01 -0500 Subject: [PATCH 06/26] test output type --- tests/python_package_test/test_dask.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 8832d52d436e..da300ad040a9 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -367,6 +367,7 @@ def test_classifier_pred_contrib(output, task, cluster): assert len(preds_with_contrib) == num_classes assert len(preds_with_contrib) == len(local_preds_with_contrib) for i in range(num_classes): + assert isinstance(preds_with_contrib[i], csr_matrix) assert preds_with_contrib[i].shape[1] == num_classes assert preds_with_contrib[i].shape == local_preds_with_contrib[i].shape assert len(np.unique(preds_with_contrib[i][:, -1])) From 13b9c3bd8422b59d626607f3e2c0d9e15df2f138 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Jun 2021 21:21:43 -0500 Subject: [PATCH 07/26] linting --- python-package/lightgbm/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index d13aaea9acb4..a265002ca376 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -532,6 +532,7 @@ def _combine_preds( preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]]) return preds_so_far + def _predict_part_binop(_ignore: None, chunk: dask_Array, predict_function: Callable) -> List[ss.spmatrix]: """Get predictions for one Dask Array @@ -626,7 +627,6 @@ def _predict( initial=None ) - return data.map_blocks( _predict_part, model=model, From f46da7160d2a3e2db94afa365c171171d1812710 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Jun 2021 21:27:49 -0500 Subject: [PATCH 08/26] linting --- python-package/lightgbm/compat.py | 2 +- python-package/lightgbm/dask.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 931a966c2a06..96ff4d007a84 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -148,7 +148,7 @@ class dask_Array: # type: ignore pass def dask_bag_from_sequence(*args, **kwargs): # type: ignore - """Placeholder for ``dask.bag.from_sequence()``""" + """Dummy function for ``dask.bag.from_sequence()``.""" pass class dask_DataFrame: # type: ignore diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index a265002ca376..f6ebe57144b2 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -513,7 +513,7 @@ def _combine_preds( preds_so_far: 
List[ss.spmatrix], new_chunk: List[ss.spmatrix] ) -> List[ss.spmatrix]: - """Add a new set of pred_contrib predictions to an existing list + """Add a new set of pred_contrib predictions to an existing list. For multi-class classification models, calling ``.predict(X, pred_contrib=True)`` returns a list of sparse matrices (one per class) when ``X`` is a sparse matrix. @@ -534,7 +534,7 @@ def _combine_preds( def _predict_part_binop(_ignore: None, chunk: dask_Array, predict_function: Callable) -> List[ss.spmatrix]: - """Get predictions for one Dask Array + """Get predictions for one Dask Array. This function exists for use in ``dask.bag.Bag.fold()``. The ``_ignore`` argument is used to handle that fact that ``dask.bag.Bag.fold()`` From e6072bf8a726b9b796cad1523c9939d9a5bb267c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Jun 2021 00:21:04 -0500 Subject: [PATCH 09/26] use from_delayed() instead --- python-package/lightgbm/compat.py | 16 ++++- python-package/lightgbm/dask.py | 96 ++++++++++++-------------- tests/python_package_test/test_dask.py | 20 +++--- 3 files changed, 69 insertions(+), 63 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 96ff4d007a84..f299adc1b6d5 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -125,7 +125,9 @@ class _LGBMRegressorBase: # type: ignore try: from dask import delayed from dask.array import Array as dask_Array - from dask.bag import from_sequence as dask_bag_from_sequence + from dask.array import concatenate as dask_array_concatenate + from dask.array import from_delayed as dask_array_from_delayed + from dask.bag import from_delayed as dask_bag_from_delayed from dask.dataframe import DataFrame as dask_DataFrame from dask.dataframe import Series as dask_Series from dask.distributed import Client, default_client, wait @@ -147,8 +149,16 @@ class dask_Array: # type: ignore pass - def dask_bag_from_sequence(*args, **kwargs): # type: ignore - """Dummy function for ``dask.bag.from_sequence()``.""" + def dask_array_concatenate(*args, **kwargs): # type: ignore + """Dummy function for ``dask.array.concatenate()``.""" + pass + + def dask_array_from_delayed(*args, **kwargs): # type: ignore + """Dummy function for ``dask.array.from_delayed()``.""" + pass + + def dask_bag_from_delayed(*args, **kwargs): # type: ignore + """Dummy function for ``dask.bag.from_delayed()``.""" pass class dask_DataFrame: # type: ignore diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index f6ebe57144b2..a2f97213a535 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -17,8 +17,9 @@ import scipy.sparse as ss from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call -from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_bag_from_sequence, - dask_Array, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) +from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_bag_from_delayed, + dask_Array, dask_array_concatenate, dask_array_from_delayed, dask_DataFrame, dask_Series, default_client, + delayed, pd_DataFrame, pd_Series, wait) from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] @@ -509,41 +510,6 @@ def 
_predict_part(
     return result
 
 
-def _combine_preds(
-    preds_so_far: List[ss.spmatrix],
-    new_chunk: List[ss.spmatrix]
-) -> List[ss.spmatrix]:
-    """Add a new set of pred_contrib predictions to an existing list.
-
-    For multi-class classification models, calling ``.predict(X, pred_contrib=True)``
-    returns a list of sparse matrices (one per class) when ``X`` is a sparse matrix.
-
-    This function is used to combine the results of multiple such predict calls
-    executed in parallel on different partitions of data in a Dask Array.
-
-    Parameters
-    ----------
-    preds_so_far : list of sparse matrices
-        Current list of predictions.
-    new_chunk : list of sparse matrices
-        Predictions from an additional chunk of the data, to be appended to ``preds_so_far``.
-    """
-    for i in range(len(preds_so_far)):
-        preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]])
-    return preds_so_far
-
-
-def _predict_part_binop(_ignore: None, chunk: dask_Array, predict_function: Callable) -> List[ss.spmatrix]:
-    """Get predictions for one Dask Array.
-
-    This function exists for use in ``dask.bag.Bag.fold()``.
-    The ``_ignore`` argument is used to handle that fact that ``dask.bag.Bag.fold()``
-    starts the folding process from an initial value and expects to be able to pass that
-    value as the first positional argument to the ``"binop"`` function provided.
-    """
-    return predict_function(chunk.compute())
-
-
 def _predict(
     model: LGBMModel,
     data: _DaskMatrixLike,
@@ -553,7 +519,7 @@ def _predict(
     pred_contrib: bool = False,
     dtype: _PredictionDtype = np.float32,
     **kwargs: Any
-) -> dask_Array:
+) -> Union[dask_Array, List[dask_Array]]:
     """Inner predict routine.
 
     Parameters
     ----------
@@ -581,7 +547,7 @@ def _predict(
         The predicted values.
     X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
         If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-    X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
+    X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list of Dask Arrays if ``data`` is sparse and ``pred_contrib=True``
         If ``pred_contrib=True``, the feature contributions for each sample.
""" if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): @@ -599,12 +565,13 @@ def _predict( elif isinstance(data, dask_Array): # for multi-class classification with sparse matrices, pred_contrib predictions # are returned as a list of sparse matrices (one per class) + num_classes = getattr(model, "n_classes_", -1) + if ( - getattr(model, "n_classes_", -1) > 2 + num_classes > 2 and isinstance(data._meta, ss.spmatrix) and pred_contrib ): - bag = dask_bag_from_sequence(data.partitions) predict_function = partial( _predict_part, @@ -616,16 +583,41 @@ def _predict( **kwargs ) - predict_part_binop = partial( - _predict_part_binop, - predict_function=predict_function - ) + delayed_chunks = data.to_delayed() + bag = dask_bag_from_delayed(delayed_chunks[:, 0]) - return bag.fold( - binop=predict_part_binop, - combine=_combine_preds, - initial=None - ) + @delayed + def _extract(items: List[Any], i: int) -> Any: + return items[i] + + preds = bag.map_partitions(predict_function) + + # pred_contrib output will have one column per feature, + # plus one more for the base value + num_cols = model.n_features_ + 1 + + nrows_per_chunk = data.chunks[0] + out = [list() for _ in range(num_classes)] + + # need to tell Dask the expected type and shape of individual preds + pred_meta_shape = (0, num_cols) + pred_meta = ss.csr_matrix(pred_meta_shape) + + for j, partition in enumerate(preds.to_delayed()): + for i in range(num_classes): + part = dask_array_from_delayed( + value=_extract(partition, i), + shape=(nrows_per_chunk[j], num_cols), + meta=pred_meta + ) + out[i].append(part) + + # At this point, `out` is a list of lists of delayeds (each of which points to a matrix). + # Concatenate them to return a list of Dask Arrays. + for i in range(num_classes): + out[i] = dask_array_concatenate(out[i]).map_blocks(ss.csr_matrix) + + return out return data.map_blocks( _predict_part, @@ -831,9 +823,9 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: description="Return the predicted value for each sample.", X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes], or (if multiclass and ``pred_contrib=True`` and using sparse inputs) a Dask Bag which evaluates to a list of n_classes sparse matrices", + predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multiclass and using sparse inputs) a list of ``n_classes`` Dask Arrays which each have shape [n_samples, n_features + 1]" ) def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index da300ad040a9..55d072052c67 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -13,8 +13,8 @@ import lightgbm as lgb -if not platform.startswith('linux'): - pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) +# if not platform.startswith('linux'): +# pytest.skip('lightgbm.dask is currently supported in Linux 
environments', allow_module_level=True) if not lgb.compat.DASK_INSTALLED: pytest.skip('Dask is not installed', allow_module_level=True) @@ -343,7 +343,7 @@ def test_classifier_pred_contrib(output, task, cluster): **params ) dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) - preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True).compute() + preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True) local_classifier = lgb.LGBMClassifier(**params) local_classifier.fit(X, y, sample_weight=w) @@ -364,19 +364,23 @@ def test_classifier_pred_contrib(output, task, cluster): # and then return early if output == 'scipy_csr_matrix' and task == 'multiclass-classification': assert isinstance(preds_with_contrib, list) + assert all(isinstance(arr, da.Array) for arr in preds_with_contrib) + assert all(isinstance(arr._meta, csr_matrix) for arr in preds_with_contrib) assert len(preds_with_contrib) == num_classes assert len(preds_with_contrib) == len(local_preds_with_contrib) for i in range(num_classes): - assert isinstance(preds_with_contrib[i], csr_matrix) - assert preds_with_contrib[i].shape[1] == num_classes - assert preds_with_contrib[i].shape == local_preds_with_contrib[i].shape - assert len(np.unique(preds_with_contrib[i][:, -1])) + computed_preds = preds_with_contrib[i].compute() + assert isinstance(computed_preds, csr_matrix) + assert computed_preds.shape[1] == num_classes + assert computed_preds.shape == local_preds_with_contrib[i].shape + assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(np.array(preds_with_contrib[i].todense()), axis=1) + pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) assert np.all(pred_classes == local_pred_classes) return + preds_with_contrib = preds_with_contrib.compute() if output == 'scipy_csr_matrix': preds_with_contrib = np.array(preds_with_contrib.todense()) From 8378ceec56431712a608667f3260524781386e91 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Jun 2021 00:26:27 -0500 Subject: [PATCH 10/26] docstring pycodestyle is happy with --- python-package/lightgbm/compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index f299adc1b6d5..51d15a616f44 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -150,15 +150,15 @@ class dask_Array: # type: ignore pass def dask_array_concatenate(*args, **kwargs): # type: ignore - """Dummy function for ``dask.array.concatenate()``.""" + """Mock ``dask.array.concatenate()``.""" pass def dask_array_from_delayed(*args, **kwargs): # type: ignore - """Dummy function for ``dask.array.from_delayed()``.""" + """Mock function for ``dask.array.from_delayed()``.""" pass def dask_bag_from_delayed(*args, **kwargs): # type: ignore - """Dummy function for ``dask.bag.from_delayed()``.""" + """Mock function for ``dask.bag.from_delayed()``.""" pass class dask_DataFrame: # type: ignore From f86220d433541d3b05e725eec36913118e9fdc49 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Jun 2021 00:31:59 -0500 Subject: [PATCH 11/26] isort --- python-package/lightgbm/dask.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index a2f97213a535..cd781822e3ad 100644 
--- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -17,9 +17,9 @@ import scipy.sparse as ss from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call -from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_bag_from_delayed, - dask_Array, dask_array_concatenate, dask_array_from_delayed, dask_DataFrame, dask_Series, default_client, - delayed, pd_DataFrame, pd_Series, wait) +from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, + dask_Array, dask_array_concatenate, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, + dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] From a6557a8e5182cabe4746deeb5f697d351d8fdf58 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Jun 2021 16:48:41 -0500 Subject: [PATCH 12/26] put pytest skips back --- tests/python_package_test/test_dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 55d072052c67..e8f17d1411be 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -13,8 +13,8 @@ import lightgbm as lgb -# if not platform.startswith('linux'): -# pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) +if not platform.startswith('linux'): + pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) if not lgb.compat.DASK_INSTALLED: pytest.skip('Dask is not installed', allow_module_level=True) From 45ebf7da72ce47ff9e3170f1594641946c8f3e68 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 22 Jun 2021 17:11:33 -0500 Subject: [PATCH 13/26] respect sparse return type --- python-package/lightgbm/dask.py | 19 ++++++++++++++++--- tests/python_package_test/test_dask.py | 23 +++++++++++++++++------ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 5e1ab414fd28..88cefb5ddfb5 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -601,8 +601,7 @@ def _extract(items: List[Any], i: int) -> Any: out = [list() for _ in range(num_classes)] # need to tell Dask the expected type and shape of individual preds - pred_meta_shape = (0, num_cols) - pred_meta = ss.csr_matrix(pred_meta_shape) + pred_meta = data._meta for j, partition in enumerate(preds.to_delayed()): for i in range(num_classes): @@ -613,10 +612,24 @@ def _extract(items: List[Any], i: int) -> Any: ) out[i].append(part) + # by default, dask.array.concatenate() concatenates sparse arrays into a COO matrix + # + # the code below is used instead to ensure that the sparse type is preserved during concatentation + if isinstance(pred_meta, ss.csr_matrix): + concat_fn = partial(ss.vstack, format='csr') + elif isinstance(pred_meta, ss.csc_matrix): + concat_fn = partial(ss.vstack, format='csc') + else: + concat_fn = ss.vstack + # At this point, `out` is a list of lists of delayeds (each of which points to a matrix). # Concatenate them to return a list of Dask Arrays. 
for i in range(num_classes): - out[i] = dask_array_concatenate(out[i]).map_blocks(ss.csr_matrix) + out[i] = dask_array_from_delayed( + value=delayed(concat_fn)(out[i]), + shape=(data.shape[0], num_cols), + meta=pred_meta + ) return out diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index e8f17d1411be..7b32d275b7e2 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -28,7 +28,7 @@ from dask.array.utils import assert_eq from dask.distributed import Client, LocalCluster, default_client, wait from pkg_resources import parse_version -from scipy.sparse import csr_matrix +from scipy.sparse import csc_matrix, csr_matrix from scipy.stats import spearmanr from sklearn import __version__ as sk_version from sklearn.datasets import make_blobs, make_regression @@ -199,6 +199,11 @@ def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **k dy = da.from_array(y, chunks=chunk_size) dw = da.from_array(weights, chunk_size) X = csr_matrix(X) + elif output == 'scipy_csc_matrix': + dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csc_matrix) + dy = da.from_array(y, chunks=chunk_size) + dw = da.from_array(weights, chunk_size) + X = csc_matrix(X) else: raise ValueError(f"Unknown output type '{output}'") @@ -322,7 +327,7 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' -@pytest.mark.parametrize('output', data_output) +@pytest.mark.parametrize('output', data_output + ['scipy_csc_matrix']) @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) def test_classifier_pred_contrib(output, task, cluster): with Client(cluster) as client: @@ -362,15 +367,21 @@ def test_classifier_pred_contrib(output, task, cluster): # # since that case is so different than all other cases, check the relevant things here # and then return early - if output == 'scipy_csr_matrix' and task == 'multiclass-classification': + if output.startswith('scipy') and task == 'multiclass-classification': + if output == 'scipy_csr_matrix': + expected_type = csr_matrix + elif output == 'scipy_csc_matrix': + expected_type = csc_matrix + else: + raise ValueError(f"Unrecognized output type: {output}") assert isinstance(preds_with_contrib, list) assert all(isinstance(arr, da.Array) for arr in preds_with_contrib) - assert all(isinstance(arr._meta, csr_matrix) for arr in preds_with_contrib) + assert all(isinstance(arr._meta, expected_type) for arr in preds_with_contrib) assert len(preds_with_contrib) == num_classes assert len(preds_with_contrib) == len(local_preds_with_contrib) for i in range(num_classes): computed_preds = preds_with_contrib[i].compute() - assert isinstance(computed_preds, csr_matrix) + assert isinstance(computed_preds, expected_type) assert computed_preds.shape[1] == num_classes assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 @@ -381,7 +392,7 @@ def test_classifier_pred_contrib(output, task, cluster): return preds_with_contrib = preds_with_contrib.compute() - if output == 'scipy_csr_matrix': + if output.startswith('scipy'): preds_with_contrib = np.array(preds_with_contrib.todense()) # be sure LightGBM actually used at least one categorical column, From 585fef4117fa8c773f6f584f90c831a999f49321 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 22 Jun 2021 17:24:20 -0500 Subject: [PATCH 14/26] fix doc --- 
python-package/lightgbm/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 88cefb5ddfb5..cdab892128a6 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -548,7 +548,7 @@ def _predict( The predicted values. X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes] If ``pred_leaf=True``, the predicted leaf of every tree for each sample. - X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list of Dask Arrays if ``data`` is sparse and ``pred_contrib=True`` + X_SHAP_values : "Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multiclass and using sparse inputs) a list of ``n_classes`` Dask Arrays which each have shape [n_samples, n_features + 1]" If ``pred_contrib=True``, the feature contributions for each sample. """ if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): From c5a0483c29bc7718bc0cefbfdd3337bb118625f0 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 22 Jun 2021 22:29:31 -0500 Subject: [PATCH 15/26] remove unnecessary dask_array_concatenate() --- python-package/lightgbm/compat.py | 5 ----- python-package/lightgbm/dask.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 51d15a616f44..a596c1f03f16 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -125,7 +125,6 @@ class _LGBMRegressorBase: # type: ignore try: from dask import delayed from dask.array import Array as dask_Array - from dask.array import concatenate as dask_array_concatenate from dask.array import from_delayed as dask_array_from_delayed from dask.bag import from_delayed as dask_bag_from_delayed from dask.dataframe import DataFrame as dask_DataFrame @@ -149,10 +148,6 @@ class dask_Array: # type: ignore pass - def dask_array_concatenate(*args, **kwargs): # type: ignore - """Mock ``dask.array.concatenate()``.""" - pass - def dask_array_from_delayed(*args, **kwargs): # type: ignore """Mock function for ``dask.array.from_delayed()``.""" pass diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index cdab892128a6..eef3aaebd5c2 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -18,8 +18,8 @@ from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, - dask_Array, dask_array_concatenate, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, - dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) + dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series, + default_client, delayed, pd_DataFrame, pd_Series, wait) from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] From 336195045c66f0a22954af25180ccef41eaae5b6 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 3 Jul 2021 23:06:53 -0500 Subject: [PATCH 16/26] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/compat.py | 4 ++-- python-package/lightgbm/dask.py | 5 ++--- 
tests/python_package_test/test_dask.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index a596c1f03f16..669969b79825 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -149,11 +149,11 @@ class dask_Array: # type: ignore pass def dask_array_from_delayed(*args, **kwargs): # type: ignore - """Mock function for ``dask.array.from_delayed()``.""" + """Mock function for dask.array.from_delayed().""" pass def dask_bag_from_delayed(*args, **kwargs): # type: ignore - """Mock function for ``dask.bag.from_delayed()``.""" + """Mock function for dask.bag.from_delayed().""" pass class dask_DataFrame: # type: ignore diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 2904bf2851a2..c3da0ea35c8d 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -890,12 +890,12 @@ def _predict( elif isinstance(data, dask_Array): # for multi-class classification with sparse matrices, pred_contrib predictions # are returned as a list of sparse matrices (one per class) - num_classes = getattr(model, "n_classes_", -1) + num_classes = model._n_classes or -1 if ( num_classes > 2 - and isinstance(data._meta, ss.spmatrix) and pred_contrib + and isinstance(data._meta, ss.spmatrix) ): predict_function = partial( @@ -937,7 +937,6 @@ def _extract(items: List[Any], i: int) -> Any: out[i].append(part) # by default, dask.array.concatenate() concatenates sparse arrays into a COO matrix - # # the code below is used instead to ensure that the sparse type is preserved during concatentation if isinstance(pred_meta, ss.csr_matrix): concat_fn = partial(ss.vstack, format='csr') diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index e9f89f1750d0..6b70c81620e3 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -411,7 +411,7 @@ def test_classifier_pred_contrib(output, task, cluster): # raw scores will probably be different, but at least check that all predicted classes are the same pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) - assert np.all(pred_classes == local_pred_classes) + np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = preds_with_contrib.compute() From 77661210d45930b77e0e7d8da46908edd06a5c2e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 11:42:51 -0500 Subject: [PATCH 17/26] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/dask.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index c3da0ea35c8d..3a42d4a4b95d 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -872,7 +872,7 @@ def _predict( The predicted values. X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes] If ``pred_leaf=True``, the predicted leaf of every tree for each sample. 
- X_SHAP_values : "Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multiclass and using sparse inputs) a list of ``n_classes`` Dask Arrays which each have shape [n_samples, n_features + 1]" + X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1] If ``pred_contrib=True``, the feature contributions for each sample. """ if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): @@ -922,7 +922,7 @@ def _extract(items: List[Any], i: int) -> Any: num_cols = model.n_features_ + 1 nrows_per_chunk = data.chunks[0] - out = [list() for _ in range(num_classes)] + out = [[] for _ in range(num_classes)] # need to tell Dask the expected type and shape of individual preds pred_meta = data._meta @@ -1210,7 +1210,7 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: output_name="predicted_result", predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multiclass and using sparse inputs) a list of ``n_classes`` Dask Arrays which each have shape [n_samples, n_features + 1]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]" ) def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: From 810a0b8e11b9ebe2717d9146a29a5be8aa57721c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 11:44:35 -0500 Subject: [PATCH 18/26] update predict_proba() docstring --- python-package/lightgbm/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 1ce5b5c64164..40ede4980dc1 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1228,7 +1228,7 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: output_name="predicted_probability", predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]" ) def to_local(self) -> LGBMClassifier: From f79ab3c51cea475f97b257855aa8d9bf057685a9 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 11:52:46 -0500 Subject: [PATCH 19/26] remove unnecessary np.array() --- tests/python_package_test/test_dask.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index f3a6e27f6afc..21fb445a872c 100644 --- a/tests/python_package_test/test_dask.py +++ 
b/tests/python_package_test/test_dask.py @@ -409,14 +409,14 @@ def test_classifier_pred_contrib(output, task, cluster): assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) - local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) + pred_classes = np.argmax(computed_preds.todense(), axis=1) + local_pred_classes = np.argmax(local_preds_with_contrib[i].todense(), axis=1) np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = preds_with_contrib.compute() if output.startswith('scipy'): - preds_with_contrib = np.array(preds_with_contrib.todense()) + preds_with_contrib = preds_with_contrib.todense() # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature @@ -621,7 +621,7 @@ def test_regressor_pred_contrib(output, cluster): local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True) if output == "scipy_csr_matrix": - preds_with_contrib = np.array(preds_with_contrib.todense()) + preds_with_contrib = preds_with_contrib.todense() # contrib outputs for distributed training are different than from local training, so we can just test # that the output has the right shape and base values are in the right position From 6a7da56df4b770dbe487171cb519e305e214df83 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 11:54:40 -0500 Subject: [PATCH 20/26] Update python-package/lightgbm/dask.py Co-authored-by: Nikita Titov --- python-package/lightgbm/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 40ede4980dc1..83c9280c4ca7 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -902,7 +902,7 @@ def _predict( _predict_part, model=model, raw_score=False, - pred_proba=False, + pred_proba=pred_proba, pred_leaf=False, pred_contrib=True, **kwargs From 95ae45f879942191f94e1fceaab325c6bf737e21 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 15:18:15 -0500 Subject: [PATCH 21/26] fix assertion --- tests/python_package_test/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 21fb445a872c..68117aac5cb4 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -439,7 +439,7 @@ def test_classifier_pred_contrib(output, task, cluster): assert preds_with_contrib.shape == local_preds_with_contrib.shape if num_classes == 2: - assert len(np.unique(preds_with_contrib[:, num_features]) == 1) + assert len(np.unique(preds_with_contrib[:, num_features])) == 1 else: for i in range(num_classes): base_value_col = num_features * (i + 1) + i From a7d6d374ee58e7f7f2f63ba638e0a2f80668d4a7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 17:03:15 -0500 Subject: [PATCH 22/26] fix test use of len() --- tests/python_package_test/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 68117aac5cb4..455cbb8bbb92 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -439,7 +439,7 @@ def 
test_classifier_pred_contrib(output, task, cluster): assert preds_with_contrib.shape == local_preds_with_contrib.shape if num_classes == 2: - assert len(np.unique(preds_with_contrib[:, num_features])) == 1 + assert np.unique(preds_with_contrib[:, num_features]).shape[0] == 1 else: for i in range(num_classes): base_value_col = num_features * (i + 1) + i From 831927bb9fb501a71d5a93558c6a7c2dda20770f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 23:37:46 -0500 Subject: [PATCH 23/26] restore np.array() in tests --- tests/python_package_test/test_dask.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 455cbb8bbb92..28905fa684f8 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -409,14 +409,14 @@ def test_classifier_pred_contrib(output, task, cluster): assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(computed_preds.todense(), axis=1) - local_pred_classes = np.argmax(local_preds_with_contrib[i].todense(), axis=1) + pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) + local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = preds_with_contrib.compute() if output.startswith('scipy'): - preds_with_contrib = preds_with_contrib.todense() + preds_with_contrib = np.array(preds_with_contrib.todense()) # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature @@ -621,7 +621,7 @@ def test_regressor_pred_contrib(output, cluster): local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True) if output == "scipy_csr_matrix": - preds_with_contrib = preds_with_contrib.todense() + preds_with_contrib = np.array(preds_with_contrib.todense()) # contrib outputs for distributed training are different than from local training, so we can just test # that the output has the right shape and base values are in the right position From 61f31dd0d79ac950379fb1b52e45afbbdd43e65e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 5 Jul 2021 00:01:35 -0500 Subject: [PATCH 24/26] use np.asarray() instead --- tests/python_package_test/test_dask.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 28905fa684f8..186882a22117 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -409,14 +409,14 @@ def test_classifier_pred_contrib(output, task, cluster): assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) - local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) + pred_classes = np.argmax(np.asarray(computed_preds.todense()), axis=1) + local_pred_classes = np.argmax(np.asarray(local_preds_with_contrib[i].todense()), axis=1) np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = 
preds_with_contrib.compute() if output.startswith('scipy'): - preds_with_contrib = np.array(preds_with_contrib.todense()) + preds_with_contrib = np.asarray(preds_with_contrib.todense()) # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature @@ -621,7 +621,7 @@ def test_regressor_pred_contrib(output, cluster): local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True) if output == "scipy_csr_matrix": - preds_with_contrib = np.array(preds_with_contrib.todense()) + preds_with_contrib = np.asarray(preds_with_contrib.todense()) # contrib outputs for distributed training are different than from local training, so we can just test # that the output has the right shape and base values are in the right position From 57edeaa8c95648d990b8aa2e20fc43d3055f7e8e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 5 Jul 2021 18:54:48 -0500 Subject: [PATCH 25/26] use toarray() --- tests/python_package_test/test_dask.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 186882a22117..2c0d4089c990 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -409,14 +409,14 @@ def test_classifier_pred_contrib(output, task, cluster): assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(np.asarray(computed_preds.todense()), axis=1) - local_pred_classes = np.argmax(np.asarray(local_preds_with_contrib[i].todense()), axis=1) + pred_classes = np.argmax(computed_preds.toarray(), axis=1) + local_pred_classes = np.argmax(local_preds_with_contrib[i].toarray(), axis=1) np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = preds_with_contrib.compute() if output.startswith('scipy'): - preds_with_contrib = np.asarray(preds_with_contrib.todense()) + preds_with_contrib = preds_with_contrib.toarray() # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature @@ -439,7 +439,7 @@ def test_classifier_pred_contrib(output, task, cluster): assert preds_with_contrib.shape == local_preds_with_contrib.shape if num_classes == 2: - assert np.unique(preds_with_contrib[:, num_features]).shape[0] == 1 + assert len(np.unique(preds_with_contrib[:, num_features])) == 1 else: for i in range(num_classes): base_value_col = num_features * (i + 1) + i @@ -621,7 +621,7 @@ def test_regressor_pred_contrib(output, cluster): local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True) if output == "scipy_csr_matrix": - preds_with_contrib = np.asarray(preds_with_contrib.todense()) + preds_with_contrib = preds_with_contrib.toarray() # contrib outputs for distributed training are different than from local training, so we can just test # that the output has the right shape and base values are in the right position From e4fed6f1300983993babe32111325ca98c448e5d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 6 Jul 2021 17:32:09 -0500 Subject: [PATCH 26/26] remove empty functions in compat --- python-package/lightgbm/compat.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 669969b79825..52726622f076 100644 --- 
a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -134,6 +134,8 @@ class _LGBMRegressorBase: # type: ignore except ImportError: DASK_INSTALLED = False + dask_array_from_delayed = None + dask_bag_from_delayed = None delayed = None default_client = None wait = None @@ -148,14 +150,6 @@ class dask_Array: # type: ignore pass - def dask_array_from_delayed(*args, **kwargs): # type: ignore - """Mock function for dask.array.from_delayed().""" - pass - - def dask_bag_from_delayed(*args, **kwargs): # type: ignore - """Mock function for dask.bag.from_delayed().""" - pass - class dask_DataFrame: # type: ignore """Dummy class for dask.dataframe.DataFrame."""
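
Taken together, this series changes ``DaskLGBMClassifier.predict(..., pred_contrib=True)`` so that, for a multi-class model fed a sparse Dask Array, it returns a list of ``n_classes`` Dask Arrays, each of shape [n_samples, n_features + 1] and backed by sparse chunks, instead of a single concatenated array. Below is a minimal end-to-end sketch of the resulting behavior, adapted from ``test_classifier_pred_contrib`` above; the cluster setup, data shapes, and parameter values are illustrative assumptions rather than values taken from this series.

    import dask.array as da
    from dask.distributed import Client, LocalCluster
    from scipy.sparse import csr_matrix
    from sklearn.datasets import make_blobs

    import lightgbm as lgb

    if __name__ == "__main__":
        cluster = LocalCluster(n_workers=2)  # assumed local setup, for illustration
        client = Client(cluster)

        # three centers -> the multi-class case these patches target
        X, y = make_blobs(n_samples=1_000, centers=3, random_state=42)

        # sparse Dask Array input, built the same way as in _create_data() above
        dX = da.from_array(X, chunks=(500, X.shape[1])).map_blocks(csr_matrix)
        dy = da.from_array(y, chunks=500)

        clf = lgb.DaskLGBMClassifier(n_estimators=10, num_leaves=10)
        clf.fit(dX, dy)

        # for sparse multi-class input, pred_contrib=True now returns a list of
        # n_classes Dask Arrays instead of one Dask Array
        preds = clf.predict(dX, pred_contrib=True)
        assert isinstance(preds, list)
        assert len(preds) == clf.n_classes_

        for class_preds in preds:
            contribs = class_preds.compute()
            # the input's sparse format is preserved in the output
            assert isinstance(contribs, csr_matrix)
            # one column per feature, plus one more for the base value
            assert contribs.shape == (X.shape[0], X.shape[1] + 1)

        client.close()
        cluster.close()

The design evolves across the series: patches 01-08 combined per-partition prediction lists with ``dask.bag.Bag.fold()``, which yielded a Dask Bag evaluating to one list of sparse matrices; patch 09 switched to ``bag.map_partitions()`` plus ``dask.array.from_delayed()`` so each per-class result stays a lazy Dask Array; and patch 13 replaced ``dask.array.concatenate()`` (which coerces sparse chunks to COO) with a format-preserving ``scipy.sparse.vstack()`` so CSR input yields CSR output and CSC yields CSC.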