dask · jacobtomlinson · Mar 14, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/ci/environment-docs.yaml b/ci/environment-docs.yaml
@@ -15,7 +15,7 @@ dependencies:
   - numba
   - numpy
   - psutil
-  - python=3.10
+  - python=3.11
   - sortedcontainers
   - scikit-learn >=1.2.0
   - scipy

diff --git a/ci/environment-latest.yaml b/ci/environment-latest.yaml
@@ -1 +1 @@
-environment-3.10.yaml
+environment-3.11.yaml
diff --git a/dask_ml/compose/__init__.py b/dask_ml/compose/__init__.py
@@ -2,6 +2,7 @@
 
 These estimators are useful for working with heterogenous tabular data.
 """
+
 from ._column_transformer import ColumnTransformer, make_column_transformer
 
 __all__ = ["ColumnTransformer", "make_column_transformer"]
diff --git a/dask_ml/decomposition/extmath.py b/dask_ml/decomposition/extmath.py
@@ -1,6 +1,7 @@
 """
 Extended math utilities.
 """
+
 # Authors: Gael Varoquaux
 #          Alexandre Gramfort
 #          Alexandre T. Passos

diff --git a/dask_ml/feature_extraction/__init__.py b/dask_ml/feature_extraction/__init__.py
@@ -1,6 +1,7 @@
 """
 Utilities for extracting features from data.
 """
+
 from . import text
 
 __all__ = ["text"]
diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
@@ -1,6 +1,7 @@
 """
 Utilities to build feature vectors from text documents.
 """
+
 import itertools
 
 import dask

diff --git a/dask_ml/linear_model/__init__.py b/dask_ml/linear_model/__init__.py
@@ -1,6 +1,7 @@
 """The ``dask_ml.linear_model`` module implements linear models for
 classification and regression.
 """
+
 from .glm import LinearRegression, LogisticRegression, PoissonRegression
 
 __all__ = [

diff --git a/dask_ml/linear_model/utils.py b/dask_ml/linear_model/utils.py
@@ -1,5 +1,6 @@
 """
 """
+
 import dask.array as da
 import dask.dataframe as dd
 import numpy as np

diff --git a/dask_ml/metrics/pairwise.py b/dask_ml/metrics/pairwise.py
@@ -1,6 +1,7 @@
 """
 Daskified versions of sklearn.metrics.pairwise
 """
+
 import warnings
 from typing import Any, Callable, Dict, Optional, Tuple, Union
 
@@ -188,7 +189,7 @@ def sigmoid_kernel(
     "rbf": rbf_kernel,
     "linear": linear_kernel,
     "polynomial": polynomial_kernel,
-    "sigmoid": sigmoid_kernel
+    "sigmoid": sigmoid_kernel,
     # TODO:
     # - cosine_similarity
     # - laplacian

diff --git a/dask_ml/model_selection/__init__.py b/dask_ml/model_selection/__init__.py
@@ -3,6 +3,7 @@
 These estimators will operate in parallel. Their scalability depends
 on the underlying estimators being used.
 """
+
 from ._hyperband import HyperbandSearchCV
 from ._incremental import IncrementalSearchCV, InverseDecaySearchCV
 from ._search import GridSearchCV, RandomizedSearchCV, check_cv, compute_n_splits

diff --git a/dask_ml/model_selection/_split.py b/dask_ml/model_selection/_split.py
@@ -1,5 +1,6 @@
 """Utilities for splitting datasets.
 """
+
 import itertools
 import logging
 import numbers
@@ -436,9 +437,11 @@ def train_test_split(
     if da.Array in types and types & {dd.Series, dd.DataFrame}:
         if convert_mixed_types:
             arrays = tuple(
-                x.to_dask_array(lengths=True)
-                if isinstance(x, (dd.Series, dd.DataFrame))
-                else x
+                (
+                    x.to_dask_array(lengths=True)
+                    if isinstance(x, (dd.Series, dd.DataFrame))
+                    else x
+                )
                 for x in arrays
             )
         else:

diff --git a/dask_ml/preprocessing/__init__.py b/dask_ml/preprocessing/__init__.py
@@ -1,5 +1,6 @@
 """Utilties for Preprocessing data.
 """
+
 from ._block_transformer import BlockTransformer
 from ._encoders import OneHotEncoder
 from .data import (

diff --git a/dask_ml/wrappers.py b/dask_ml/wrappers.py
@@ -1,4 +1,5 @@
 """Meta-estimators for parallelizing estimators using the scikit-learn API."""
+
 import logging
 import warnings
 

diff --git a/dask_ml/xgboost.py b/dask_ml/xgboost.py
@@ -4,4 +4,5 @@
 will be setup in distributed mode alongside your existing
 ``dask.distributed`` cluster.
 """
+
 from dask_xgboost import *  # noqa
diff --git a/tests/compose/test_column_transformer.py b/tests/compose/test_column_transformer.py
@@ -17,7 +17,8 @@
 
 
 @pytest.mark.skip(
-    reason="ValueError: Specifying the columns using strings is only supported for dataframes."
+    reason="ValueError: Specifying the columns using strings is only "
+    + "supported for dataframes."
 )
 def test_column_transformer():
     # Ordering of make_column_transformer was changed from
@@ -61,7 +62,8 @@ def test_column_transformer():
 
 
 @pytest.mark.skip(
-    reason="ValueError: Specifying the columns using strings is only supported for dataframes."
+    reason="ValueError: Specifying the columns using strings is only "
+    + "supported for dataframes."
 )
 def test_column_transformer_unk_chunksize():
     names = ["a", "b", "c"]
@@ -93,7 +95,8 @@ def test_column_transformer_unk_chunksize():
 
 
 @pytest.mark.skip(
-    reason="ValueError: Specifying the columns using strings is only supported for dataframes."
+    reason="ValueError: Specifying the columns using strings is only "
+    + "supported for dataframes."
 )
 def test_sklearn_col_trans_disallows_hstack_then_block():
     # Test that sklearn ColumnTransformer (to which dask-ml ColumnTransformer

diff --git a/tests/linear_model/test_glm.py b/tests/linear_model/test_glm.py
@@ -236,7 +236,8 @@ def test_model_coef_dask_numpy(est, data, request):
 
 
 # fmt: off
-@pytest.mark.skip(reason="AssertionError: Not equal to tolerance rtol=0.001, atol=0.0002")
+@pytest.mark.skip(
+    reason="AssertionError: Not equal to tolerance rtol=0.001, atol=0.0002")
 @pytest.mark.parametrize("solver", ["newton", "lbfgs"])
 @pytest.mark.parametrize("fit_intercept", [True, False])
 @pytest.mark.parametrize(

diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
@@ -149,7 +149,8 @@ def test_log_loss_shape(yhat):
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The `needs_threshold` and `needs_proba` parameter are deprecated"
+    reason="FutureWarning: The `needs_threshold` and `needs_proba` "
+    + "parameter are deprecated"
 )
 @pytest.mark.parametrize("y", [[0, 1, 1, 0], [0, 1, 2, 0]])
 def test_log_loss_scoring(y):

diff --git a/tests/metrics/test_regression.py b/tests/metrics/test_regression.py
@@ -65,7 +65,8 @@ def test_mse_squared(squared):
 
 
 @pytest.mark.skip(
-    reason="InvalidParameterError: The 'multioutput' parameter of mean_squared_error must be a string among..."
+    reason="InvalidParameterError: The 'multioutput' parameter of mean_squared_error "
+    + "must be a string among..."
 )
 @pytest.mark.parametrize("multioutput", ["uniform_average", None])
 def test_regression_metrics_unweighted_average_multioutput(metric_pairs, multioutput):

diff --git a/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py b/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py
@@ -120,7 +120,8 @@ def test_hyperparameter_searcher_with_fit_params(cls, kwargs):
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
 def test_grid_search_no_score():
@@ -156,7 +157,8 @@ def test_grid_search_no_score():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_grid_search_score_method():
     X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0)
@@ -187,7 +189,8 @@ def test_grid_search_score_method():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_grid_search_groups():
     # Check if ValueError (when groups is None) propagates to dcv.GridSearchCV
@@ -272,7 +275,8 @@ def test_return_train_score_warn():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
 def test_classes__property():
@@ -415,7 +419,8 @@ def test_grid_search_bad_param_grid():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_grid_search_sparse():
     # Test that grid search works with both dense and sparse matrices
@@ -439,7 +444,8 @@ def test_grid_search_sparse():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_grid_search_sparse_scoring():
     X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
@@ -1010,7 +1016,8 @@ def test_search_cv_results_none_param():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
 def test_grid_search_correct_score_results():
@@ -1221,7 +1228,8 @@ def test_grid_search_failing_classifier_raise():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_search_train_scores_set_to_false():
     X = np.arange(6).reshape(6, -1)

diff --git a/tests/model_selection/test_incremental.py b/tests/model_selection/test_incremental.py
@@ -45,7 +45,8 @@
 
 
 @pytest.mark.skip(
-    reason="yaml.representer.RepresenterError: ('cannot represent an object', 0.05006666249977221)"
+    reason="yaml.representer.RepresenterError: "
+    + "('cannot represent an object', 0.05006666249977221)"
 )
 @gen_cluster(client=True, timeout=1000)
 async def test_basic(c, s, a, b):

diff --git a/tests/model_selection/test_split.py b/tests/model_selection/test_split.py
@@ -14,7 +14,8 @@
 
 
 @pytest.mark.skip(
-    reason="InvalidParameterError: The 'shuffle' parameter of train_test_split must be an instance of 'bool' or an instance of 'numpy.bool_'"
+    reason="InvalidParameterError: The 'shuffle' parameter of train_test_split "
+    + "must be an instance of 'bool' or an instance of 'numpy.bool_'"
 )
 def test_20_newsgroups():
     data = fetch_20newsgroups()
@@ -246,7 +247,8 @@ def test_split_mixed():
 
 
 @pytest.mark.skip(
-    reason="InvalidParameterError: The 'shuffle' parameter of train_test_split must be an instance of 'bool' or an instance of 'numpy.bool_'"
+    reason="InvalidParameterError: The 'shuffle' parameter of train_test_split must "
+    + "be an instance of 'bool' or an instance of 'numpy.bool_'"
 )
 def test_split_3d_data():
     X_3d = np.arange(1.0, 5001.0).reshape((100, 10, 5))

diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
@@ -128,7 +128,8 @@ def test_inverse_transform(self):
         assert_eq_ar(result, X)
 
     @pytest.mark.skip(
-        reason=" TypeError: MinMaxScaler.__init__() got an unexpected keyword argument 'columns'"
+        reason=" TypeError: MinMaxScaler.__init__() got an unexpected keyword "
+        + "argument 'columns'"
     )
     @pytest.mark.xfail(reason="removed columns")
     def test_df_inverse_transform(self):
@@ -139,7 +140,8 @@ def test_df_inverse_transform(self):
         assert_eq_df(result, df2)
 
     @pytest.mark.skip(
-        reason="AssertionError: found values in 'a' and 'b' which differ by more than the allowed amount"
+        reason="AssertionError: found values in 'a' and 'b' which differ by more "
+        + "than the allowed amount"
     )
     def test_df_values(self):
         est1 = dpp.MinMaxScaler()
@@ -158,7 +160,8 @@ def test_df_values(self):
         assert_eq_ar(result_ar, result_df)
 
     @pytest.mark.skip(
-        reason=" TypeError: MinMaxScaler.__init__() got an unexpected keyword argument 'columns'"
+        reason=" TypeError: MinMaxScaler.__init__() got an unexpected keyword "
+        + "argument 'columns'"
     )
     @pytest.mark.xfail(reason="removed columns")
     def test_df_column_slice(self):
@@ -211,7 +214,8 @@ def test_inverse_transform(self):
         assert_eq_ar(result, X)
 
     @pytest.mark.skip(
-        reason="DeprecationWarning: np.find_common_type is deprecated.  Please use `np.result_type` or `np.promote_types`"
+        reason="DeprecationWarning: np.find_common_type is deprecated.  Please use "
+        + "`np.result_type` or `np.promote_types`"
     )
     def test_df_values(self):
         est1 = dpp.RobustScaler()
@@ -354,7 +358,8 @@ def test_raises(self):
 
 class TestDummyEncoder:
     @pytest.mark.skip(
-        reason='AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="A_a") are different'
+        reason="AssertionError: Attributes of "
+        + 'DataFrame.iloc[:, 1] (column name="A_a") are different'
     )
     @pytest.mark.parametrize("daskify", [False, True])
     @pytest.mark.parametrize("values", [True, False])
@@ -634,7 +639,8 @@ def test_transformer_params(self):
 
     mark = pytest.mark.xfail(
         DASK_EXPR_ENABLED,
-        reason="dask-expr: NotImplementedError in assert_eq_df(res_df.iloc[:, 1:], frame, check_dtype=False)",
+        reason="dask-expr: NotImplementedError in "
+        + "assert_eq_df(res_df.iloc[:, 1:], frame, check_dtype=False)",
     )
 
     @pytest.mark.parametrize("daskify", [pytest.param(True, marks=mark), False])

diff --git a/tests/test_incremental_pca.py b/tests/test_incremental_pca.py
@@ -1,4 +1,5 @@
 """Tests for Incremental PCA."""
+
 import numpy as np
 import pytest
 from dask import array as da
@@ -143,7 +144,8 @@ def test_incremental_pca_inverse():
 
 @pytest.mark.skip(reason="AssertionError: Regex pattern did not match.")
 @pytest.mark.skip(
-    reason="InvalidParameterError: The 'min_batch_size' parameter of gen_batches must be an int in the range [0, inf). Got -1 instead."
+    reason="InvalidParameterError: The 'min_batch_size' parameter of "
+    + "gen_batches must be an int in the range [0, inf). Got -1 instead."
 )
 def test_incremental_pca_validation():
     # Test that n_components is >=1 and <= n_features.