From a397d23f360c45e2f132848f6dbc0a0c57012386 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 11 Jun 2021 13:26:31 -0500 Subject: [PATCH 01/26] test_classifier working --- python-package/lightgbm/compat.py | 5 +++ python-package/lightgbm/dask.py | 47 +++++++++++++++++++++++++- tests/python_package_test/test_dask.py | 2 +- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 068a667b2603..931a966c2a06 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -125,6 +125,7 @@ class _LGBMRegressorBase: # type: ignore try: from dask import delayed from dask.array import Array as dask_Array + from dask.bag import from_sequence as dask_bag_from_sequence from dask.dataframe import DataFrame as dask_DataFrame from dask.dataframe import Series as dask_Series from dask.distributed import Client, default_client, wait @@ -146,6 +147,10 @@ class dask_Array: # type: ignore pass + def dask_bag_from_sequence(*args, **kwargs): # type: ignore + """Placeholder for ``dask.bag.from_sequence()``""" + pass + class dask_DataFrame: # type: ignore """Dummy class for dask.dataframe.DataFrame.""" diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 8dd96e980161..f84d4f6407ac 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -9,6 +9,7 @@ import socket from collections import defaultdict from copy import deepcopy +from functools import partial from typing import Any, Callable, Dict, List, Optional, Type, Union from urllib.parse import urlparse @@ -16,7 +17,7 @@ import scipy.sparse as ss from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call -from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, +from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_bag_from_sequence, dask_Array, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict @@ -59,6 +60,12 @@ def _find_random_open_port() -> int: return port +def _is_dask_sparse_array(arr: dask_Array) -> bool: + return isinstance( + arr._meta, + (ss.csc.csc_matrix, ss.csr.csr_matrix) + ) + def _concat(seq: List[_DaskPart]) -> _DaskPart: if isinstance(seq[0], np.ndarray): return np.concatenate(seq, axis=0) @@ -561,6 +568,44 @@ def _predict( **kwargs ).values elif isinstance(data, dask_Array): + # for multi-class classification with sparse matrices, pred_contrib predictions + # are returned as a list of sparse matrices (one per class) + if ( + getattr(model, "n_classes_", -1) > 2 + and _is_dask_sparse_array(data) + and pred_contrib + ): + bag = dask_bag_from_sequence(data.partitions) + def _combine_preds(preds_so_far, new_chunk): + #raise RuntimeError(preds_so_far) + for i in range(len(preds_so_far)): + preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]]) + return preds_so_far + + predict_fn = partial( + _predict_part, + model=model, + raw_score=False, + pred_proba=False, + pred_leaf=False, + pred_contrib=True, + ) + + def _predict_part_binop(_ignore, chunk, pred_fn): + return predict_fn(chunk.compute()) + + predict_part_binop = partial( + _predict_part_binop, + pred_fn=predict_fn + ) + + return bag.fold( + binop=predict_part_binop, + combine=_combine_preds, + 
initial=None + ) + + return data.map_blocks( _predict_part, model=model, diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index bee4c0f872bf..10c0bc93ed51 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -199,7 +199,7 @@ def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **k dy = da.from_array(y, chunks=chunk_size) dw = da.from_array(weights, chunk_size) else: - raise ValueError("Unknown output type '{output}'") + raise ValueError(f"Unknown output type '{output}'") return X, y, weights, None, dX, dy, dw, None From b90040fab9fae7ed5a974125dc81fa35095dc41d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 12 Jun 2021 23:29:18 -0500 Subject: [PATCH 02/26] adding tests --- tests/python_package_test/test_dask.py | 45 +++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 10c0bc93ed51..16b2e0f52204 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -198,6 +198,7 @@ def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **k dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csr_matrix) dy = da.from_array(y, chunks=chunk_size) dw = da.from_array(weights, chunk_size) + X = csr_matrix(X) else: raise ValueError(f"Unknown output type '{output}'") @@ -321,8 +322,10 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' -@pytest.mark.parametrize('output', data_output) -@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) +# @pytest.mark.parametrize('output', data_output) +# @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) +@pytest.mark.parametrize('output', ['scipy_csr_matrix']) +@pytest.mark.parametrize('task', ['multiclass-classification']) def test_classifier_pred_contrib(output, task, cluster): with Client(cluster) as client: X, y, w, _, dX, dy, dw, _ = _create_data( @@ -332,7 +335,8 @@ def test_classifier_pred_contrib(output, task, cluster): params = { "n_estimators": 10, - "num_leaves": 10 + "num_leaves": 50, + "deterministic": True } dask_classifier = lgb.DaskLGBMClassifier( @@ -348,6 +352,33 @@ def test_classifier_pred_contrib(output, task, cluster): local_classifier.fit(X, y, sample_weight=w) local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True) + # shape depends on whether it is binary or multiclass classification + num_features = dask_classifier.n_features_ + num_classes = dask_classifier.n_classes_ + if num_classes == 2: + expected_num_cols = num_features + 1 + else: + expected_num_cols = (num_features + 1) * num_classes + + # in the special case of multi-class classification using scipy sparse matrices, + # the output of `.predict(..., pred_contrib=True)` is a list of sparse matrices (one per class) + # + # since that case is so different than all other cases, check the relevant things here + # and then return early + if output == 'scipy_csr_matrix' and task == 'multiclass-classification': + assert isinstance(preds_with_contrib, list) + assert len(preds_with_contrib) == num_classes + assert len(preds_with_contrib) == len(local_preds_with_contrib) + for i in range(num_classes): + assert preds_with_contrib[i].shape[1] == num_classes + assert preds_with_contrib[i].shape == 
local_preds_with_contrib[i].shape + assert len(np.unique(preds_with_contrib[i][:, -1])) + # raw scores will probably be different, but at least check that all predicted classes are the same + pred_classes = np.argmax(np.array(preds_with_contrib[i].todense()), axis=1) + local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) + assert np.all(pred_classes == local_pred_classes) + return + if output == 'scipy_csr_matrix': preds_with_contrib = np.array(preds_with_contrib.todense()) @@ -363,14 +394,6 @@ def test_classifier_pred_contrib(output, task, cluster): assert node_uses_cat_col.sum() > 0 assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' - # shape depends on whether it is binary or multiclass classification - num_features = dask_classifier.n_features_ - num_classes = dask_classifier.n_classes_ - if num_classes == 2: - expected_num_cols = num_features + 1 - else: - expected_num_cols = (num_features + 1) * num_classes - # * shape depends on whether it is binary or multiclass classification # * matrix for binary classification is of the form [feature_contrib, base_value], # for multi-class it's [feat_contrib_class1, base_value_class1, feat_contrib_class2, base_value_class2, etc.] From 2a09151c1aa9c41f941670488032a34e6e1a541c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 13 Jun 2021 00:09:24 -0500 Subject: [PATCH 03/26] docs --- python-package/lightgbm/dask.py | 57 ++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index f84d4f6407ac..2fb8c466c0bf 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -60,12 +60,6 @@ def _find_random_open_port() -> int: return port -def _is_dask_sparse_array(arr: dask_Array) -> bool: - return isinstance( - arr._meta, - (ss.csc.csc_matrix, ss.csr.csr_matrix) - ) - def _concat(seq: List[_DaskPart]) -> _DaskPart: if isinstance(seq[0], np.ndarray): return np.concatenate(seq, axis=0) @@ -515,6 +509,40 @@ def _predict_part( return result +def _combine_preds( + preds_so_far: List[ss.spmatrix], + new_chunk: List[ss.spmatrix] +) -> List[ss.spmatrix]: + """Add a new set of pred_contib predictions to an existing list + + For multi-class classification models, calling ``.predict(X, pred_contrib=True)`` + returns a list of sparse matrices (one per class) when ``X`` is a sparse matrix. + + This function is used to combine the results of multiple such predict calls + executed in parallel on different partitions of data in a Dask Array. + + Parameters + ---------- + preds_so_far : list of sparse matrices + Current list of predictions. + new_chunk : list of sparse matrices + Predictions from an additional chunk of the data, to be appended to ``preds_so_far``. + """ + for i in range(len(preds_so_far)): + preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]]) + return preds_so_far + +def _predict_part_binop(_ignore: None, chunk: dask_Array, predict_function: Callable) -> List[ss.spmatrix]: + """Get predictions for one Dask Array + + This function exists for use in ``dask.bag.Bag.fold()``. + The ``_ignore`` argument is used to handle that fact that ``dask.bag.Bag.fold()`` + starts the folding process from an initial value and expects to be able to pass that + value as the first positional argument to the ``"binop"`` function provided. 
+ """ + return predict_function(chunk.compute()) + + def _predict( model: LGBMModel, data: _DaskMatrixLike, @@ -572,31 +600,24 @@ def _predict( # are returned as a list of sparse matrices (one per class) if ( getattr(model, "n_classes_", -1) > 2 - and _is_dask_sparse_array(data) + and isinstance(data._meta, ss.spmatrix) and pred_contrib ): bag = dask_bag_from_sequence(data.partitions) - def _combine_preds(preds_so_far, new_chunk): - #raise RuntimeError(preds_so_far) - for i in range(len(preds_so_far)): - preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]]) - return preds_so_far - predict_fn = partial( + predict_function = partial( _predict_part, model=model, raw_score=False, pred_proba=False, pred_leaf=False, pred_contrib=True, + **kwargs ) - def _predict_part_binop(_ignore, chunk, pred_fn): - return predict_fn(chunk.compute()) - predict_part_binop = partial( _predict_part_binop, - pred_fn=predict_fn + predict_function=predict_function ) return bag.fold( @@ -810,7 +831,7 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: description="Return the predicted value for each sample.", X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes], or (if multiclass and ``pred_contrib=True`` and using sparse inputs) a Dask Bag which evaluates to a list of n_classes sparse matrices", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" ) From 743727480c93cffd31de90f436a7d6ef3caaf890 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 13 Jun 2021 00:11:02 -0500 Subject: [PATCH 04/26] tests --- tests/python_package_test/test_dask.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 16b2e0f52204..095fc2def36d 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -322,10 +322,8 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' -# @pytest.mark.parametrize('output', data_output) -# @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) -@pytest.mark.parametrize('output', ['scipy_csr_matrix']) -@pytest.mark.parametrize('task', ['multiclass-classification']) +@pytest.mark.parametrize('output', data_output) +@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) def test_classifier_pred_contrib(output, task, cluster): with Client(cluster) as client: X, y, w, _, dX, dy, dw, _ = _create_data( From 058188bfe892b88dcdfbf48c2fc13f2a812dcafb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Jun 2021 16:50:36 -0500 Subject: [PATCH 05/26] revert unnecessary changes in tests --- python-package/lightgbm/dask.py | 2 +- tests/python_package_test/test_dask.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 2fb8c466c0bf..d13aaea9acb4 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -513,7 +513,7 @@ def _combine_preds( preds_so_far: 
List[ss.spmatrix], new_chunk: List[ss.spmatrix] ) -> List[ss.spmatrix]: - """Add a new set of pred_contib predictions to an existing list + """Add a new set of pred_contrib predictions to an existing list For multi-class classification models, calling ``.predict(X, pred_contrib=True)`` returns a list of sparse matrices (one per class) when ``X`` is a sparse matrix. diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 095fc2def36d..8832d52d436e 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -333,8 +333,7 @@ def test_classifier_pred_contrib(output, task, cluster): params = { "n_estimators": 10, - "num_leaves": 50, - "deterministic": True + "num_leaves": 10 } dask_classifier = lgb.DaskLGBMClassifier( From 8e7df9a759317351f1472caebfa6ee02ab32d6d3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Jun 2021 17:00:01 -0500 Subject: [PATCH 06/26] test output type --- tests/python_package_test/test_dask.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 8832d52d436e..da300ad040a9 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -367,6 +367,7 @@ def test_classifier_pred_contrib(output, task, cluster): assert len(preds_with_contrib) == num_classes assert len(preds_with_contrib) == len(local_preds_with_contrib) for i in range(num_classes): + assert isinstance(preds_with_contrib[i], csr_matrix) assert preds_with_contrib[i].shape[1] == num_classes assert preds_with_contrib[i].shape == local_preds_with_contrib[i].shape assert len(np.unique(preds_with_contrib[i][:, -1])) From 13b9c3bd8422b59d626607f3e2c0d9e15df2f138 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Jun 2021 21:21:43 -0500 Subject: [PATCH 07/26] linting --- python-package/lightgbm/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index d13aaea9acb4..a265002ca376 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -532,6 +532,7 @@ def _combine_preds( preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]]) return preds_so_far + def _predict_part_binop(_ignore: None, chunk: dask_Array, predict_function: Callable) -> List[ss.spmatrix]: """Get predictions for one Dask Array @@ -626,7 +627,6 @@ def _predict( initial=None ) - return data.map_blocks( _predict_part, model=model, From f46da7160d2a3e2db94afa365c171171d1812710 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Jun 2021 21:27:49 -0500 Subject: [PATCH 08/26] linting --- python-package/lightgbm/compat.py | 2 +- python-package/lightgbm/dask.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 931a966c2a06..96ff4d007a84 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -148,7 +148,7 @@ class dask_Array: # type: ignore pass def dask_bag_from_sequence(*args, **kwargs): # type: ignore - """Placeholder for ``dask.bag.from_sequence()``""" + """Dummy function for ``dask.bag.from_sequence()``.""" pass class dask_DataFrame: # type: ignore diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index a265002ca376..f6ebe57144b2 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -513,7 +513,7 @@ def _combine_preds( preds_so_far: 
List[ss.spmatrix], new_chunk: List[ss.spmatrix] ) -> List[ss.spmatrix]: - """Add a new set of pred_contrib predictions to an existing list + """Add a new set of pred_contrib predictions to an existing list. For multi-class classification models, calling ``.predict(X, pred_contrib=True)`` returns a list of sparse matrices (one per class) when ``X`` is a sparse matrix. @@ -534,7 +534,7 @@ def _combine_preds( def _predict_part_binop(_ignore: None, chunk: dask_Array, predict_function: Callable) -> List[ss.spmatrix]: - """Get predictions for one Dask Array + """Get predictions for one Dask Array. This function exists for use in ``dask.bag.Bag.fold()``. The ``_ignore`` argument is used to handle that fact that ``dask.bag.Bag.fold()`` From e6072bf8a726b9b796cad1523c9939d9a5bb267c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Jun 2021 00:21:04 -0500 Subject: [PATCH 09/26] use from_delayed() instead --- python-package/lightgbm/compat.py | 16 ++++- python-package/lightgbm/dask.py | 96 ++++++++++++-------------- tests/python_package_test/test_dask.py | 20 +++--- 3 files changed, 69 insertions(+), 63 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 96ff4d007a84..f299adc1b6d5 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -125,7 +125,9 @@ class _LGBMRegressorBase: # type: ignore try: from dask import delayed from dask.array import Array as dask_Array - from dask.bag import from_sequence as dask_bag_from_sequence + from dask.array import concatenate as dask_array_concatenate + from dask.array import from_delayed as dask_array_from_delayed + from dask.bag import from_delayed as dask_bag_from_delayed from dask.dataframe import DataFrame as dask_DataFrame from dask.dataframe import Series as dask_Series from dask.distributed import Client, default_client, wait @@ -147,8 +149,16 @@ class dask_Array: # type: ignore pass - def dask_bag_from_sequence(*args, **kwargs): # type: ignore - """Dummy function for ``dask.bag.from_sequence()``.""" + def dask_array_concatenate(*args, **kwargs): # type: ignore + """Dummy function for ``dask.array.concatenate()``.""" + pass + + def dask_array_from_delayed(*args, **kwargs): # type: ignore + """Dummy function for ``dask.array.from_delayed()``.""" + pass + + def dask_bag_from_delayed(*args, **kwargs): # type: ignore + """Dummy function for ``dask.bag.from_delayed()``.""" pass class dask_DataFrame: # type: ignore diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index f6ebe57144b2..a2f97213a535 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -17,8 +17,9 @@ import scipy.sparse as ss from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call -from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_bag_from_sequence, - dask_Array, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) +from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_bag_from_delayed, + dask_Array, dask_array_concatenate, dask_array_from_delayed, dask_DataFrame, dask_Series, default_client, + delayed, pd_DataFrame, pd_Series, wait) from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] @@ -509,41 +510,6 @@ def 
_predict_part(
     return result
 
 
-def _combine_preds(
-    preds_so_far: List[ss.spmatrix],
-    new_chunk: List[ss.spmatrix]
-) -> List[ss.spmatrix]:
-    """Add a new set of pred_contrib predictions to an existing list.
-
-    For multi-class classification models, calling ``.predict(X, pred_contrib=True)``
-    returns a list of sparse matrices (one per class) when ``X`` is a sparse matrix.
-
-    This function is used to combine the results of multiple such predict calls
-    executed in parallel on different partitions of data in a Dask Array.
-
-    Parameters
-    ----------
-    preds_so_far : list of sparse matrices
-        Current list of predictions.
-    new_chunk : list of sparse matrices
-        Predictions from an additional chunk of the data, to be appended to ``preds_so_far``.
-    """
-    for i in range(len(preds_so_far)):
-        preds_so_far[i] = _concat([preds_so_far[i], new_chunk[i]])
-    return preds_so_far
-
-
-def _predict_part_binop(_ignore: None, chunk: dask_Array, predict_function: Callable) -> List[ss.spmatrix]:
-    """Get predictions for one Dask Array.
-
-    This function exists for use in ``dask.bag.Bag.fold()``.
-    The ``_ignore`` argument is used to handle that fact that ``dask.bag.Bag.fold()``
-    starts the folding process from an initial value and expects to be able to pass that
-    value as the first positional argument to the ``"binop"`` function provided.
-    """
-    return predict_function(chunk.compute())
-
-
 def _predict(
     model: LGBMModel,
     data: _DaskMatrixLike,
@@ -553,7 +519,7 @@ def _predict(
     pred_contrib: bool = False,
     dtype: _PredictionDtype = np.float32,
     **kwargs: Any
-) -> dask_Array:
+) -> Union[dask_Array, List[dask_Array]]:
     """Inner predict routine.
 
     Parameters
     ----------
@@ -581,7 +547,7 @@ def _predict(
         The predicted values.
     X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
         If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-    X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
+    X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list of Dask Arrays if ``data`` is sparse and ``pred_contrib=True``
         If ``pred_contrib=True``, the feature contributions for each sample.
""" if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): @@ -599,12 +565,13 @@ def _predict( elif isinstance(data, dask_Array): # for multi-class classification with sparse matrices, pred_contrib predictions # are returned as a list of sparse matrices (one per class) + num_classes = getattr(model, "n_classes_", -1) + if ( - getattr(model, "n_classes_", -1) > 2 + num_classes > 2 and isinstance(data._meta, ss.spmatrix) and pred_contrib ): - bag = dask_bag_from_sequence(data.partitions) predict_function = partial( _predict_part, @@ -616,16 +583,41 @@ def _predict( **kwargs ) - predict_part_binop = partial( - _predict_part_binop, - predict_function=predict_function - ) + delayed_chunks = data.to_delayed() + bag = dask_bag_from_delayed(delayed_chunks[:, 0]) - return bag.fold( - binop=predict_part_binop, - combine=_combine_preds, - initial=None - ) + @delayed + def _extract(items: List[Any], i: int) -> Any: + return items[i] + + preds = bag.map_partitions(predict_function) + + # pred_contrib output will have one column per feature, + # plus one more for the base value + num_cols = model.n_features_ + 1 + + nrows_per_chunk = data.chunks[0] + out = [list() for _ in range(num_classes)] + + # need to tell Dask the expected type and shape of individual preds + pred_meta_shape = (0, num_cols) + pred_meta = ss.csr_matrix(pred_meta_shape) + + for j, partition in enumerate(preds.to_delayed()): + for i in range(num_classes): + part = dask_array_from_delayed( + value=_extract(partition, i), + shape=(nrows_per_chunk[j], num_cols), + meta=pred_meta + ) + out[i].append(part) + + # At this point, `out` is a list of lists of delayeds (each of which points to a matrix). + # Concatenate them to return a list of Dask Arrays. + for i in range(num_classes): + out[i] = dask_array_concatenate(out[i]).map_blocks(ss.csr_matrix) + + return out return data.map_blocks( _predict_part, @@ -831,9 +823,9 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: description="Return the predicted value for each sample.", X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes], or (if multiclass and ``pred_contrib=True`` and using sparse inputs) a Dask Bag which evaluates to a list of n_classes sparse matrices", + predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multiclass and using sparse inputs) a list of ``n_classes`` Dask Arrays which each have shape [n_samples, n_features + 1]" ) def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index da300ad040a9..55d072052c67 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -13,8 +13,8 @@ import lightgbm as lgb -if not platform.startswith('linux'): - pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) +# if not platform.startswith('linux'): +# pytest.skip('lightgbm.dask is currently supported in Linux 
environments', allow_module_level=True) if not lgb.compat.DASK_INSTALLED: pytest.skip('Dask is not installed', allow_module_level=True) @@ -343,7 +343,7 @@ def test_classifier_pred_contrib(output, task, cluster): **params ) dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) - preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True).compute() + preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True) local_classifier = lgb.LGBMClassifier(**params) local_classifier.fit(X, y, sample_weight=w) @@ -364,19 +364,23 @@ def test_classifier_pred_contrib(output, task, cluster): # and then return early if output == 'scipy_csr_matrix' and task == 'multiclass-classification': assert isinstance(preds_with_contrib, list) + assert all(isinstance(arr, da.Array) for arr in preds_with_contrib) + assert all(isinstance(arr._meta, csr_matrix) for arr in preds_with_contrib) assert len(preds_with_contrib) == num_classes assert len(preds_with_contrib) == len(local_preds_with_contrib) for i in range(num_classes): - assert isinstance(preds_with_contrib[i], csr_matrix) - assert preds_with_contrib[i].shape[1] == num_classes - assert preds_with_contrib[i].shape == local_preds_with_contrib[i].shape - assert len(np.unique(preds_with_contrib[i][:, -1])) + computed_preds = preds_with_contrib[i].compute() + assert isinstance(computed_preds, csr_matrix) + assert computed_preds.shape[1] == num_classes + assert computed_preds.shape == local_preds_with_contrib[i].shape + assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(np.array(preds_with_contrib[i].todense()), axis=1) + pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) assert np.all(pred_classes == local_pred_classes) return + preds_with_contrib = preds_with_contrib.compute() if output == 'scipy_csr_matrix': preds_with_contrib = np.array(preds_with_contrib.todense()) From 8378ceec56431712a608667f3260524781386e91 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Jun 2021 00:26:27 -0500 Subject: [PATCH 10/26] docstring pycodestyle is happy with --- python-package/lightgbm/compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index f299adc1b6d5..51d15a616f44 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -150,15 +150,15 @@ class dask_Array: # type: ignore pass def dask_array_concatenate(*args, **kwargs): # type: ignore - """Dummy function for ``dask.array.concatenate()``.""" + """Mock ``dask.array.concatenate()``.""" pass def dask_array_from_delayed(*args, **kwargs): # type: ignore - """Dummy function for ``dask.array.from_delayed()``.""" + """Mock function for ``dask.array.from_delayed()``.""" pass def dask_bag_from_delayed(*args, **kwargs): # type: ignore - """Dummy function for ``dask.bag.from_delayed()``.""" + """Mock function for ``dask.bag.from_delayed()``.""" pass class dask_DataFrame: # type: ignore From f86220d433541d3b05e725eec36913118e9fdc49 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Jun 2021 00:31:59 -0500 Subject: [PATCH 11/26] isort --- python-package/lightgbm/dask.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index a2f97213a535..cd781822e3ad 100644 
--- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -17,9 +17,9 @@ import scipy.sparse as ss from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call -from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_bag_from_delayed, - dask_Array, dask_array_concatenate, dask_array_from_delayed, dask_DataFrame, dask_Series, default_client, - delayed, pd_DataFrame, pd_Series, wait) +from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, + dask_Array, dask_array_concatenate, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, + dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] From a6557a8e5182cabe4746deeb5f697d351d8fdf58 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Jun 2021 16:48:41 -0500 Subject: [PATCH 12/26] put pytest skips back --- tests/python_package_test/test_dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 55d072052c67..e8f17d1411be 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -13,8 +13,8 @@ import lightgbm as lgb -# if not platform.startswith('linux'): -# pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) +if not platform.startswith('linux'): + pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) if not lgb.compat.DASK_INSTALLED: pytest.skip('Dask is not installed', allow_module_level=True) From 45ebf7da72ce47ff9e3170f1594641946c8f3e68 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 22 Jun 2021 17:11:33 -0500 Subject: [PATCH 13/26] respect sparse return type --- python-package/lightgbm/dask.py | 19 ++++++++++++++++--- tests/python_package_test/test_dask.py | 23 +++++++++++++++++------ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 5e1ab414fd28..88cefb5ddfb5 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -601,8 +601,7 @@ def _extract(items: List[Any], i: int) -> Any: out = [list() for _ in range(num_classes)] # need to tell Dask the expected type and shape of individual preds - pred_meta_shape = (0, num_cols) - pred_meta = ss.csr_matrix(pred_meta_shape) + pred_meta = data._meta for j, partition in enumerate(preds.to_delayed()): for i in range(num_classes): @@ -613,10 +612,24 @@ def _extract(items: List[Any], i: int) -> Any: ) out[i].append(part) + # by default, dask.array.concatenate() concatenates sparse arrays into a COO matrix + # + # the code below is used instead to ensure that the sparse type is preserved during concatentation + if isinstance(pred_meta, ss.csr_matrix): + concat_fn = partial(ss.vstack, format='csr') + elif isinstance(pred_meta, ss.csc_matrix): + concat_fn = partial(ss.vstack, format='csc') + else: + concat_fn = ss.vstack + # At this point, `out` is a list of lists of delayeds (each of which points to a matrix). # Concatenate them to return a list of Dask Arrays. 
for i in range(num_classes): - out[i] = dask_array_concatenate(out[i]).map_blocks(ss.csr_matrix) + out[i] = dask_array_from_delayed( + value=delayed(concat_fn)(out[i]), + shape=(data.shape[0], num_cols), + meta=pred_meta + ) return out diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index e8f17d1411be..7b32d275b7e2 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -28,7 +28,7 @@ from dask.array.utils import assert_eq from dask.distributed import Client, LocalCluster, default_client, wait from pkg_resources import parse_version -from scipy.sparse import csr_matrix +from scipy.sparse import csc_matrix, csr_matrix from scipy.stats import spearmanr from sklearn import __version__ as sk_version from sklearn.datasets import make_blobs, make_regression @@ -199,6 +199,11 @@ def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **k dy = da.from_array(y, chunks=chunk_size) dw = da.from_array(weights, chunk_size) X = csr_matrix(X) + elif output == 'scipy_csc_matrix': + dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csc_matrix) + dy = da.from_array(y, chunks=chunk_size) + dw = da.from_array(weights, chunk_size) + X = csc_matrix(X) else: raise ValueError(f"Unknown output type '{output}'") @@ -322,7 +327,7 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' -@pytest.mark.parametrize('output', data_output) +@pytest.mark.parametrize('output', data_output + ['scipy_csc_matrix']) @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) def test_classifier_pred_contrib(output, task, cluster): with Client(cluster) as client: @@ -362,15 +367,21 @@ def test_classifier_pred_contrib(output, task, cluster): # # since that case is so different than all other cases, check the relevant things here # and then return early - if output == 'scipy_csr_matrix' and task == 'multiclass-classification': + if output.startswith('scipy') and task == 'multiclass-classification': + if output == 'scipy_csr_matrix': + expected_type = csr_matrix + elif output == 'scipy_csc_matrix': + expected_type = csc_matrix + else: + raise ValueError(f"Unrecognized output type: {output}") assert isinstance(preds_with_contrib, list) assert all(isinstance(arr, da.Array) for arr in preds_with_contrib) - assert all(isinstance(arr._meta, csr_matrix) for arr in preds_with_contrib) + assert all(isinstance(arr._meta, expected_type) for arr in preds_with_contrib) assert len(preds_with_contrib) == num_classes assert len(preds_with_contrib) == len(local_preds_with_contrib) for i in range(num_classes): computed_preds = preds_with_contrib[i].compute() - assert isinstance(computed_preds, csr_matrix) + assert isinstance(computed_preds, expected_type) assert computed_preds.shape[1] == num_classes assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 @@ -381,7 +392,7 @@ def test_classifier_pred_contrib(output, task, cluster): return preds_with_contrib = preds_with_contrib.compute() - if output == 'scipy_csr_matrix': + if output.startswith('scipy'): preds_with_contrib = np.array(preds_with_contrib.todense()) # be sure LightGBM actually used at least one categorical column, From 585fef4117fa8c773f6f584f90c831a999f49321 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 22 Jun 2021 17:24:20 -0500 Subject: [PATCH 14/26] fix doc --- 
python-package/lightgbm/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 88cefb5ddfb5..cdab892128a6 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -548,7 +548,7 @@ def _predict( The predicted values. X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes] If ``pred_leaf=True``, the predicted leaf of every tree for each sample. - X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list of Dask Arrays if ``data`` is sparse and ``pred_contrib=True`` + X_SHAP_values : "Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multiclass and using sparse inputs) a list of ``n_classes`` Dask Arrays which each have shape [n_samples, n_features + 1]" If ``pred_contrib=True``, the feature contributions for each sample. """ if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): From c5a0483c29bc7718bc0cefbfdd3337bb118625f0 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 22 Jun 2021 22:29:31 -0500 Subject: [PATCH 15/26] remove unnecessary dask_array_concatenate() --- python-package/lightgbm/compat.py | 5 ----- python-package/lightgbm/dask.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 51d15a616f44..a596c1f03f16 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -125,7 +125,6 @@ class _LGBMRegressorBase: # type: ignore try: from dask import delayed from dask.array import Array as dask_Array - from dask.array import concatenate as dask_array_concatenate from dask.array import from_delayed as dask_array_from_delayed from dask.bag import from_delayed as dask_bag_from_delayed from dask.dataframe import DataFrame as dask_DataFrame @@ -149,10 +148,6 @@ class dask_Array: # type: ignore pass - def dask_array_concatenate(*args, **kwargs): # type: ignore - """Mock ``dask.array.concatenate()``.""" - pass - def dask_array_from_delayed(*args, **kwargs): # type: ignore """Mock function for ``dask.array.from_delayed()``.""" pass diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index cdab892128a6..eef3aaebd5c2 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -18,8 +18,8 @@ from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, - dask_Array, dask_array_concatenate, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, - dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) + dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series, + default_client, delayed, pd_DataFrame, pd_Series, wait) from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] From 336195045c66f0a22954af25180ccef41eaae5b6 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 3 Jul 2021 23:06:53 -0500 Subject: [PATCH 16/26] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/compat.py | 4 ++-- python-package/lightgbm/dask.py | 5 ++--- 
tests/python_package_test/test_dask.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index a596c1f03f16..669969b79825 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -149,11 +149,11 @@ class dask_Array: # type: ignore pass def dask_array_from_delayed(*args, **kwargs): # type: ignore - """Mock function for ``dask.array.from_delayed()``.""" + """Mock function for dask.array.from_delayed().""" pass def dask_bag_from_delayed(*args, **kwargs): # type: ignore - """Mock function for ``dask.bag.from_delayed()``.""" + """Mock function for dask.bag.from_delayed().""" pass class dask_DataFrame: # type: ignore diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 2904bf2851a2..c3da0ea35c8d 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -890,12 +890,12 @@ def _predict( elif isinstance(data, dask_Array): # for multi-class classification with sparse matrices, pred_contrib predictions # are returned as a list of sparse matrices (one per class) - num_classes = getattr(model, "n_classes_", -1) + num_classes = model._n_classes or -1 if ( num_classes > 2 - and isinstance(data._meta, ss.spmatrix) and pred_contrib + and isinstance(data._meta, ss.spmatrix) ): predict_function = partial( @@ -937,7 +937,6 @@ def _extract(items: List[Any], i: int) -> Any: out[i].append(part) # by default, dask.array.concatenate() concatenates sparse arrays into a COO matrix - # # the code below is used instead to ensure that the sparse type is preserved during concatentation if isinstance(pred_meta, ss.csr_matrix): concat_fn = partial(ss.vstack, format='csr') diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index e9f89f1750d0..6b70c81620e3 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -411,7 +411,7 @@ def test_classifier_pred_contrib(output, task, cluster): # raw scores will probably be different, but at least check that all predicted classes are the same pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) - assert np.all(pred_classes == local_pred_classes) + np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = preds_with_contrib.compute() From 77661210d45930b77e0e7d8da46908edd06a5c2e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 11:42:51 -0500 Subject: [PATCH 17/26] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/dask.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index c3da0ea35c8d..3a42d4a4b95d 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -872,7 +872,7 @@ def _predict( The predicted values. X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes] If ``pred_leaf=True``, the predicted leaf of every tree for each sample. 
- X_SHAP_values : "Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multiclass and using sparse inputs) a list of ``n_classes`` Dask Arrays which each have shape [n_samples, n_features + 1]" + X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1] If ``pred_contrib=True``, the feature contributions for each sample. """ if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): @@ -922,7 +922,7 @@ def _extract(items: List[Any], i: int) -> Any: num_cols = model.n_features_ + 1 nrows_per_chunk = data.chunks[0] - out = [list() for _ in range(num_classes)] + out = [[] for _ in range(num_classes)] # need to tell Dask the expected type and shape of individual preds pred_meta = data._meta @@ -1210,7 +1210,7 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: output_name="predicted_result", predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multiclass and using sparse inputs) a list of ``n_classes`` Dask Arrays which each have shape [n_samples, n_features + 1]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]" ) def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: From 810a0b8e11b9ebe2717d9146a29a5be8aa57721c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 11:44:35 -0500 Subject: [PATCH 18/26] update predict_proba() docstring --- python-package/lightgbm/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 1ce5b5c64164..40ede4980dc1 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1228,7 +1228,7 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: output_name="predicted_probability", predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]" ) def to_local(self) -> LGBMClassifier: From f79ab3c51cea475f97b257855aa8d9bf057685a9 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 11:52:46 -0500 Subject: [PATCH 19/26] remove unnecessary np.array() --- tests/python_package_test/test_dask.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index f3a6e27f6afc..21fb445a872c 100644 --- a/tests/python_package_test/test_dask.py +++ 
b/tests/python_package_test/test_dask.py @@ -409,14 +409,14 @@ def test_classifier_pred_contrib(output, task, cluster): assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) - local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) + pred_classes = np.argmax(computed_preds.todense(), axis=1) + local_pred_classes = np.argmax(local_preds_with_contrib[i].todense(), axis=1) np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = preds_with_contrib.compute() if output.startswith('scipy'): - preds_with_contrib = np.array(preds_with_contrib.todense()) + preds_with_contrib = preds_with_contrib.todense() # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature @@ -621,7 +621,7 @@ def test_regressor_pred_contrib(output, cluster): local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True) if output == "scipy_csr_matrix": - preds_with_contrib = np.array(preds_with_contrib.todense()) + preds_with_contrib = preds_with_contrib.todense() # contrib outputs for distributed training are different than from local training, so we can just test # that the output has the right shape and base values are in the right position From 6a7da56df4b770dbe487171cb519e305e214df83 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 11:54:40 -0500 Subject: [PATCH 20/26] Update python-package/lightgbm/dask.py Co-authored-by: Nikita Titov --- python-package/lightgbm/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 40ede4980dc1..83c9280c4ca7 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -902,7 +902,7 @@ def _predict( _predict_part, model=model, raw_score=False, - pred_proba=False, + pred_proba=pred_proba, pred_leaf=False, pred_contrib=True, **kwargs From 95ae45f879942191f94e1fceaab325c6bf737e21 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 15:18:15 -0500 Subject: [PATCH 21/26] fix assertion --- tests/python_package_test/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 21fb445a872c..68117aac5cb4 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -439,7 +439,7 @@ def test_classifier_pred_contrib(output, task, cluster): assert preds_with_contrib.shape == local_preds_with_contrib.shape if num_classes == 2: - assert len(np.unique(preds_with_contrib[:, num_features]) == 1) + assert len(np.unique(preds_with_contrib[:, num_features])) == 1 else: for i in range(num_classes): base_value_col = num_features * (i + 1) + i From a7d6d374ee58e7f7f2f63ba638e0a2f80668d4a7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 17:03:15 -0500 Subject: [PATCH 22/26] fix test use of len() --- tests/python_package_test/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 68117aac5cb4..455cbb8bbb92 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -439,7 +439,7 @@ def 
test_classifier_pred_contrib(output, task, cluster): assert preds_with_contrib.shape == local_preds_with_contrib.shape if num_classes == 2: - assert len(np.unique(preds_with_contrib[:, num_features])) == 1 + assert np.unique(preds_with_contrib[:, num_features]).shape[0] == 1 else: for i in range(num_classes): base_value_col = num_features * (i + 1) + i From 831927bb9fb501a71d5a93558c6a7c2dda20770f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 4 Jul 2021 23:37:46 -0500 Subject: [PATCH 23/26] restore np.array() in tests --- tests/python_package_test/test_dask.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 455cbb8bbb92..28905fa684f8 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -409,14 +409,14 @@ def test_classifier_pred_contrib(output, task, cluster): assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(computed_preds.todense(), axis=1) - local_pred_classes = np.argmax(local_preds_with_contrib[i].todense(), axis=1) + pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) + local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = preds_with_contrib.compute() if output.startswith('scipy'): - preds_with_contrib = preds_with_contrib.todense() + preds_with_contrib = np.array(preds_with_contrib.todense()) # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature @@ -621,7 +621,7 @@ def test_regressor_pred_contrib(output, cluster): local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True) if output == "scipy_csr_matrix": - preds_with_contrib = preds_with_contrib.todense() + preds_with_contrib = np.array(preds_with_contrib.todense()) # contrib outputs for distributed training are different than from local training, so we can just test # that the output has the right shape and base values are in the right position From 61f31dd0d79ac950379fb1b52e45afbbdd43e65e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 5 Jul 2021 00:01:35 -0500 Subject: [PATCH 24/26] use np.asarray() instead --- tests/python_package_test/test_dask.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 28905fa684f8..186882a22117 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -409,14 +409,14 @@ def test_classifier_pred_contrib(output, task, cluster): assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(np.array(computed_preds.todense()), axis=1) - local_pred_classes = np.argmax(np.array(local_preds_with_contrib[i].todense()), axis=1) + pred_classes = np.argmax(np.asarray(computed_preds.todense()), axis=1) + local_pred_classes = np.argmax(np.asarray(local_preds_with_contrib[i].todense()), axis=1) np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = 
preds_with_contrib.compute() if output.startswith('scipy'): - preds_with_contrib = np.array(preds_with_contrib.todense()) + preds_with_contrib = np.asarray(preds_with_contrib.todense()) # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature @@ -621,7 +621,7 @@ def test_regressor_pred_contrib(output, cluster): local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True) if output == "scipy_csr_matrix": - preds_with_contrib = np.array(preds_with_contrib.todense()) + preds_with_contrib = np.asarray(preds_with_contrib.todense()) # contrib outputs for distributed training are different than from local training, so we can just test # that the output has the right shape and base values are in the right position From 57edeaa8c95648d990b8aa2e20fc43d3055f7e8e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 5 Jul 2021 18:54:48 -0500 Subject: [PATCH 25/26] use toarray() --- tests/python_package_test/test_dask.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 186882a22117..2c0d4089c990 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -409,14 +409,14 @@ def test_classifier_pred_contrib(output, task, cluster): assert computed_preds.shape == local_preds_with_contrib[i].shape assert len(np.unique(computed_preds[:, -1])) == 1 # raw scores will probably be different, but at least check that all predicted classes are the same - pred_classes = np.argmax(np.asarray(computed_preds.todense()), axis=1) - local_pred_classes = np.argmax(np.asarray(local_preds_with_contrib[i].todense()), axis=1) + pred_classes = np.argmax(computed_preds.toarray(), axis=1) + local_pred_classes = np.argmax(local_preds_with_contrib[i].toarray(), axis=1) np.testing.assert_array_equal(pred_classes, local_pred_classes) return preds_with_contrib = preds_with_contrib.compute() if output.startswith('scipy'): - preds_with_contrib = np.asarray(preds_with_contrib.todense()) + preds_with_contrib = preds_with_contrib.toarray() # be sure LightGBM actually used at least one categorical column, # and that it was correctly treated as a categorical feature @@ -439,7 +439,7 @@ def test_classifier_pred_contrib(output, task, cluster): assert preds_with_contrib.shape == local_preds_with_contrib.shape if num_classes == 2: - assert np.unique(preds_with_contrib[:, num_features]).shape[0] == 1 + assert len(np.unique(preds_with_contrib[:, num_features])) == 1 else: for i in range(num_classes): base_value_col = num_features * (i + 1) + i @@ -621,7 +621,7 @@ def test_regressor_pred_contrib(output, cluster): local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True) if output == "scipy_csr_matrix": - preds_with_contrib = np.asarray(preds_with_contrib.todense()) + preds_with_contrib = preds_with_contrib.toarray() # contrib outputs for distributed training are different than from local training, so we can just test # that the output has the right shape and base values are in the right position From e4fed6f1300983993babe32111325ca98c448e5d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 6 Jul 2021 17:32:09 -0500 Subject: [PATCH 26/26] remove empty functions in compat --- python-package/lightgbm/compat.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 669969b79825..52726622f076 100644 --- 
a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -134,6 +134,8 @@ class _LGBMRegressorBase: # type: ignore except ImportError: DASK_INSTALLED = False + dask_array_from_delayed = None + dask_bag_from_delayed = None delayed = None default_client = None wait = None @@ -148,14 +150,6 @@ class dask_Array: # type: ignore pass - def dask_array_from_delayed(*args, **kwargs): # type: ignore - """Mock function for dask.array.from_delayed().""" - pass - - def dask_bag_from_delayed(*args, **kwargs): # type: ignore - """Mock function for dask.bag.from_delayed().""" - pass - class dask_DataFrame: # type: ignore """Dummy class for dask.dataframe.DataFrame."""
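
Taken together, this series changes ``DaskLGBMClassifier.predict(..., pred_contrib=True)`` so that, for a multi-class model fed a sparse Dask Array, it returns a list of ``n_classes`` Dask Arrays, each of shape [n_samples, n_features + 1] and backed by sparse chunks, instead of a single concatenated array. Below is a minimal end-to-end sketch of the resulting behavior, adapted from ``test_classifier_pred_contrib`` above; the cluster setup, data shapes, and parameter values are illustrative assumptions rather than values taken from this series.

    import dask.array as da
    from dask.distributed import Client, LocalCluster
    from scipy.sparse import csr_matrix
    from sklearn.datasets import make_blobs

    import lightgbm as lgb

    if __name__ == "__main__":
        cluster = LocalCluster(n_workers=2)  # assumed local setup, for illustration
        client = Client(cluster)

        # three centers -> the multi-class case these patches target
        X, y = make_blobs(n_samples=1_000, centers=3, random_state=42)

        # sparse Dask Array input, built the same way as in _create_data() above
        dX = da.from_array(X, chunks=(500, X.shape[1])).map_blocks(csr_matrix)
        dy = da.from_array(y, chunks=500)

        clf = lgb.DaskLGBMClassifier(n_estimators=10, num_leaves=10)
        clf.fit(dX, dy)

        # for sparse multi-class input, pred_contrib=True now returns a list of
        # n_classes Dask Arrays instead of one Dask Array
        preds = clf.predict(dX, pred_contrib=True)
        assert isinstance(preds, list)
        assert len(preds) == clf.n_classes_

        for class_preds in preds:
            contribs = class_preds.compute()
            # the input's sparse format is preserved in the output
            assert isinstance(contribs, csr_matrix)
            # one column per feature, plus one more for the base value
            assert contribs.shape == (X.shape[0], X.shape[1] + 1)

        client.close()
        cluster.close()

The design evolves across the series: patches 01-08 combined per-partition prediction lists with ``dask.bag.Bag.fold()``, which yielded a Dask Bag evaluating to one list of sparse matrices; patch 09 switched to ``bag.map_partitions()`` plus ``dask.array.from_delayed()`` so each per-class result stays a lazy Dask Array; and patch 13 replaced ``dask.array.concatenate()`` (which coerces sparse chunks to COO) with a format-preserving ``scipy.sparse.vstack()`` so CSR input yields CSR output and CSC yields CSC.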