From 17ac72cc7d903c77329fef2f685adb84b1fd6df8 Mon Sep 17 00:00:00 2001
From: Mike McCarty
Date: Tue, 12 Mar 2024 12:22:19 -0400
Subject: [PATCH 1/2] updates to envs

---
 ci/environment-docs.yaml   | 2 +-
 ci/environment-latest.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/environment-docs.yaml b/ci/environment-docs.yaml
index 491f05757..c0410e747 100644
--- a/ci/environment-docs.yaml
+++ b/ci/environment-docs.yaml
@@ -15,7 +15,7 @@ dependencies:
   - numba
   - numpy
   - psutil
-  - python=3.10
+  - python=3.11
   - sortedcontainers
   - scikit-learn >=1.2.0
   - scipy
diff --git a/ci/environment-latest.yaml b/ci/environment-latest.yaml
index bcb087fd3..64f4c2f18 120000
--- a/ci/environment-latest.yaml
+++ b/ci/environment-latest.yaml
@@ -1 +1 @@
-environment-3.10.yaml
\ No newline at end of file
+environment-3.11.yaml
\ No newline at end of file

From db0934ffebee39d38bb65e5aa393e43ee43662bb Mon Sep 17 00:00:00 2001
From: Mike McCarty
Date: Tue, 12 Mar 2024 12:22:32 -0400
Subject: [PATCH 2/2] linting fixes

---
 dask_ml/compose/__init__.py                |  1 +
 dask_ml/decomposition/extmath.py           |  1 +
 dask_ml/feature_extraction/__init__.py     |  1 +
 dask_ml/feature_extraction/text.py         |  1 +
 dask_ml/linear_model/__init__.py           |  1 +
 dask_ml/linear_model/utils.py              |  1 +
 dask_ml/metrics/pairwise.py                |  3 ++-
 dask_ml/model_selection/__init__.py        |  1 +
 dask_ml/model_selection/_split.py          |  9 ++++---
 dask_ml/preprocessing/__init__.py          |  1 +
 dask_ml/wrappers.py                        |  1 +
 dask_ml/xgboost.py                         |  1 +
 tests/compose/test_column_transformer.py   |  9 ++++---
 tests/linear_model/test_glm.py             |  3 ++-
 tests/metrics/test_metrics.py              |  3 ++-
 tests/metrics/test_regression.py           |  3 ++-
 .../test_model_selection_sklearn.py        | 24 ++++++++++++-------
 tests/model_selection/test_incremental.py  |  3 ++-
 tests/model_selection/test_split.py        |  6 +++--
 tests/preprocessing/test_data.py           | 18 +++++++++-----
 tests/test_incremental_pca.py              |  4 +++-
 tests/test_kmeans.py                       | 10 +++++---
 tests/test_normalize.py                    |  3 ++-
 tests/test_pca.py                          |  6 +++--
 tests/test_svd.py                          |  1 +
 25 files changed, 81 insertions(+), 34 deletions(-)

diff --git a/dask_ml/compose/__init__.py b/dask_ml/compose/__init__.py
index 016d2fe1a..984a4e8b5 100644
--- a/dask_ml/compose/__init__.py
+++ b/dask_ml/compose/__init__.py
@@ -2,6 +2,7 @@
 
 These estimators are useful for working with heterogenous tabular data.
 """
+
 from ._column_transformer import ColumnTransformer, make_column_transformer
 
 __all__ = ["ColumnTransformer", "make_column_transformer"]
diff --git a/dask_ml/decomposition/extmath.py b/dask_ml/decomposition/extmath.py
index 623659e85..c91746185 100644
--- a/dask_ml/decomposition/extmath.py
+++ b/dask_ml/decomposition/extmath.py
@@ -1,6 +1,7 @@
 """
 Extended math utilities.
 """
+
 # Authors: Gael Varoquaux
 #          Alexandre Gramfort
 #          Alexandre T. Passos
diff --git a/dask_ml/feature_extraction/__init__.py b/dask_ml/feature_extraction/__init__.py
index bbfbdafc0..21fa63d04 100644
--- a/dask_ml/feature_extraction/__init__.py
+++ b/dask_ml/feature_extraction/__init__.py
@@ -1,6 +1,7 @@
 """
 Utilities for extracting features from data.
 """
+
 from . import text
 
 __all__ = ["text"]
diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
index a647ddee7..9710d03a6 100644
--- a/dask_ml/feature_extraction/text.py
+++ b/dask_ml/feature_extraction/text.py
@@ -1,6 +1,7 @@
 """
 Utilities to build feature vectors from text documents.
""" + import itertools import dask diff --git a/dask_ml/linear_model/__init__.py b/dask_ml/linear_model/__init__.py index d65118bda..de64b9d29 100644 --- a/dask_ml/linear_model/__init__.py +++ b/dask_ml/linear_model/__init__.py @@ -1,6 +1,7 @@ """The ``dask_ml.linear_model`` module implements linear models for classification and regression. """ + from .glm import LinearRegression, LogisticRegression, PoissonRegression __all__ = [ diff --git a/dask_ml/linear_model/utils.py b/dask_ml/linear_model/utils.py index 83b32b96a..761841468 100644 --- a/dask_ml/linear_model/utils.py +++ b/dask_ml/linear_model/utils.py @@ -1,5 +1,6 @@ """ """ + import dask.array as da import dask.dataframe as dd import numpy as np diff --git a/dask_ml/metrics/pairwise.py b/dask_ml/metrics/pairwise.py index ddbc3f877..6b39a254c 100644 --- a/dask_ml/metrics/pairwise.py +++ b/dask_ml/metrics/pairwise.py @@ -1,6 +1,7 @@ """ Daskified versions of sklearn.metrics.pairwise """ + import warnings from typing import Any, Callable, Dict, Optional, Tuple, Union @@ -188,7 +189,7 @@ def sigmoid_kernel( "rbf": rbf_kernel, "linear": linear_kernel, "polynomial": polynomial_kernel, - "sigmoid": sigmoid_kernel + "sigmoid": sigmoid_kernel, # TODO: # - cosine_similarity # - laplacian diff --git a/dask_ml/model_selection/__init__.py b/dask_ml/model_selection/__init__.py index 1bd2a6240..1ae5fcb41 100644 --- a/dask_ml/model_selection/__init__.py +++ b/dask_ml/model_selection/__init__.py @@ -3,6 +3,7 @@ These estimators will operate in parallel. Their scalability depends on the underlying estimators being used. """ + from ._hyperband import HyperbandSearchCV from ._incremental import IncrementalSearchCV, InverseDecaySearchCV from ._search import GridSearchCV, RandomizedSearchCV, check_cv, compute_n_splits diff --git a/dask_ml/model_selection/_split.py b/dask_ml/model_selection/_split.py index 7ccd368a7..bc2ae064b 100644 --- a/dask_ml/model_selection/_split.py +++ b/dask_ml/model_selection/_split.py @@ -1,5 +1,6 @@ """Utilities for splitting datasets. """ + import itertools import logging import numbers @@ -436,9 +437,11 @@ def train_test_split( if da.Array in types and types & {dd.Series, dd.DataFrame}: if convert_mixed_types: arrays = tuple( - x.to_dask_array(lengths=True) - if isinstance(x, (dd.Series, dd.DataFrame)) - else x + ( + x.to_dask_array(lengths=True) + if isinstance(x, (dd.Series, dd.DataFrame)) + else x + ) for x in arrays ) else: diff --git a/dask_ml/preprocessing/__init__.py b/dask_ml/preprocessing/__init__.py index d2ebb19d5..0abe6b632 100644 --- a/dask_ml/preprocessing/__init__.py +++ b/dask_ml/preprocessing/__init__.py @@ -1,5 +1,6 @@ """Utilties for Preprocessing data. """ + from ._block_transformer import BlockTransformer from ._encoders import OneHotEncoder from .data import ( diff --git a/dask_ml/wrappers.py b/dask_ml/wrappers.py index 672f89e2f..26f0bd5c4 100644 --- a/dask_ml/wrappers.py +++ b/dask_ml/wrappers.py @@ -1,4 +1,5 @@ """Meta-estimators for parallelizing estimators using the scikit-learn API.""" + import logging import warnings diff --git a/dask_ml/xgboost.py b/dask_ml/xgboost.py index 86e841db3..d24009a5f 100644 --- a/dask_ml/xgboost.py +++ b/dask_ml/xgboost.py @@ -4,4 +4,5 @@ will be setup in distributed mode alongside your existing ``dask.distributed`` cluster. 
""" + from dask_xgboost import * # noqa diff --git a/tests/compose/test_column_transformer.py b/tests/compose/test_column_transformer.py index 05c98b93b..493178be8 100644 --- a/tests/compose/test_column_transformer.py +++ b/tests/compose/test_column_transformer.py @@ -17,7 +17,8 @@ @pytest.mark.skip( - reason="ValueError: Specifying the columns using strings is only supported for dataframes." + reason="ValueError: Specifying the columns using strings is only " + + "supported for dataframes." ) def test_column_transformer(): # Ordering of make_column_transformer was changed from @@ -61,7 +62,8 @@ def test_column_transformer(): @pytest.mark.skip( - reason="ValueError: Specifying the columns using strings is only supported for dataframes." + reason="ValueError: Specifying the columns using strings is only " + + "supported for dataframes." ) def test_column_transformer_unk_chunksize(): names = ["a", "b", "c"] @@ -93,7 +95,8 @@ def test_column_transformer_unk_chunksize(): @pytest.mark.skip( - reason="ValueError: Specifying the columns using strings is only supported for dataframes." + reason="ValueError: Specifying the columns using strings is only " + + "supported for dataframes." ) def test_sklearn_col_trans_disallows_hstack_then_block(): # Test that sklearn ColumnTransformer (to which dask-ml ColumnTransformer diff --git a/tests/linear_model/test_glm.py b/tests/linear_model/test_glm.py index 79a498e6a..1de8de496 100644 --- a/tests/linear_model/test_glm.py +++ b/tests/linear_model/test_glm.py @@ -236,7 +236,8 @@ def test_model_coef_dask_numpy(est, data, request): # fmt: off -@pytest.mark.skip(reason="AssertionError: Not equal to tolerance rtol=0.001, atol=0.0002") +@pytest.mark.skip( + reason="AssertionError: Not equal to tolerance rtol=0.001, atol=0.0002") @pytest.mark.parametrize("solver", ["newton", "lbfgs"]) @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize( diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 99dbfd4db..e4a79462f 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -149,7 +149,8 @@ def test_log_loss_shape(yhat): @pytest.mark.skip( - reason="FutureWarning: The `needs_threshold` and `needs_proba` parameter are deprecated" + reason="FutureWarning: The `needs_threshold` and `needs_proba` " + + "parameter are deprecated" ) @pytest.mark.parametrize("y", [[0, 1, 1, 0], [0, 1, 2, 0]]) def test_log_loss_scoring(y): diff --git a/tests/metrics/test_regression.py b/tests/metrics/test_regression.py index eca59b3ee..af775e168 100644 --- a/tests/metrics/test_regression.py +++ b/tests/metrics/test_regression.py @@ -65,7 +65,8 @@ def test_mse_squared(squared): @pytest.mark.skip( - reason="InvalidParameterError: The 'multioutput' parameter of mean_squared_error must be a string among..." + reason="InvalidParameterError: The 'multioutput' parameter of mean_squared_error " + + "must be a string among..." 
 )
 @pytest.mark.parametrize("multioutput", ["uniform_average", None])
 def test_regression_metrics_unweighted_average_multioutput(metric_pairs, multioutput):
diff --git a/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py b/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py
index 84eea2b9f..9bc8e37be 100644
--- a/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py
+++ b/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py
@@ -120,7 +120,8 @@ def test_hyperparameter_searcher_with_fit_params(cls, kwargs):
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
 def test_grid_search_no_score():
@@ -156,7 +157,8 @@ def test_grid_search_no_score():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_grid_search_score_method():
     X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0)
@@ -187,7 +189,8 @@ def test_grid_search_score_method():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_grid_search_groups():
     # Check if ValueError (when groups is None) propagates to dcv.GridSearchCV
@@ -272,7 +275,8 @@ def test_return_train_score_warn():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
 def test_classes__property():
@@ -415,7 +419,8 @@ def test_grid_search_bad_param_grid():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_grid_search_sparse():
     # Test that grid search works with both dense and sparse matrices
@@ -439,7 +444,8 @@ def test_grid_search_sparse():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_grid_search_sparse_scoring():
     X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
@@ -1010,7 +1016,8 @@ def test_search_cv_results_none_param():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
 def test_grid_search_correct_score_results():
@@ -1221,7 +1228,8 @@ def test_grid_search_failing_classifier_raise():
 
 
 @pytest.mark.skip(
-    reason="FutureWarning: The default value of `dual` will change from `True` to `'auto'`"
+    reason="FutureWarning: The default value of `dual` will change from "
+    + "`True` to `'auto'`"
 )
 def test_search_train_scores_set_to_false():
     X = np.arange(6).reshape(6, -1)
diff --git a/tests/model_selection/test_incremental.py b/tests/model_selection/test_incremental.py
index 9e7c92eb0..f696ac2e1 100644
--- a/tests/model_selection/test_incremental.py
+++ b/tests/model_selection/test_incremental.py
@@ -45,7 +45,8 @@
 
 
 @pytest.mark.skip(
-    reason="yaml.representer.RepresenterError: ('cannot represent an object', 0.05006666249977221)"
+    reason="yaml.representer.RepresenterError: "
+    + "('cannot represent an object', 0.05006666249977221)"
 )
 @gen_cluster(client=True, timeout=1000)
 async def test_basic(c, s, a, b):
diff --git a/tests/model_selection/test_split.py b/tests/model_selection/test_split.py
index d1301e9ef..21ccb13f9 100644
--- a/tests/model_selection/test_split.py
+++ b/tests/model_selection/test_split.py
@@ -14,7 +14,8 @@
 
 
 @pytest.mark.skip(
-    reason="InvalidParameterError: The 'shuffle' parameter of train_test_split must be an instance of 'bool' or an instance of 'numpy.bool_'"
+    reason="InvalidParameterError: The 'shuffle' parameter of train_test_split "
+    + "must be an instance of 'bool' or an instance of 'numpy.bool_'"
 )
 def test_20_newsgroups():
     data = fetch_20newsgroups()
@@ -246,7 +247,8 @@ def test_split_mixed():
 
 
 @pytest.mark.skip(
-    reason="InvalidParameterError: The 'shuffle' parameter of train_test_split must be an instance of 'bool' or an instance of 'numpy.bool_'"
+    reason="InvalidParameterError: The 'shuffle' parameter of train_test_split must "
+    + "be an instance of 'bool' or an instance of 'numpy.bool_'"
 )
 def test_split_3d_data():
     X_3d = np.arange(1.0, 5001.0).reshape((100, 10, 5))
diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index c49f7f6c2..711245b41 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -128,7 +128,8 @@ def test_inverse_transform(self):
         assert_eq_ar(result, X)
 
     @pytest.mark.skip(
-        reason=" TypeError: MinMaxScaler.__init__() got an unexpected keyword argument 'columns'"
+        reason=" TypeError: MinMaxScaler.__init__() got an unexpected keyword "
+        + "argument 'columns'"
     )
     @pytest.mark.xfail(reason="removed columns")
     def test_df_inverse_transform(self):
@@ -139,7 +140,8 @@ def test_df_inverse_transform(self):
         assert_eq_df(result, df2)
 
     @pytest.mark.skip(
-        reason="AssertionError: found values in 'a' and 'b' which differ by more than the allowed amount"
+        reason="AssertionError: found values in 'a' and 'b' which differ by more "
+        + "than the allowed amount"
     )
     def test_df_values(self):
         est1 = dpp.MinMaxScaler()
@@ -158,7 +160,8 @@ def test_df_values(self):
         assert_eq_ar(result_ar, result_df)
 
     @pytest.mark.skip(
-        reason=" TypeError: MinMaxScaler.__init__() got an unexpected keyword argument 'columns'"
+        reason=" TypeError: MinMaxScaler.__init__() got an unexpected keyword "
+        + "argument 'columns'"
     )
     @pytest.mark.xfail(reason="removed columns")
     def test_df_column_slice(self):
@@ -211,7 +214,8 @@ def test_inverse_transform(self):
         assert_eq_ar(result, X)
 
     @pytest.mark.skip(
-        reason="DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`"
+        reason="DeprecationWarning: np.find_common_type is deprecated. Please use "
+        + "`np.result_type` or `np.promote_types`"
     )
     def test_df_values(self):
         est1 = dpp.RobustScaler()
@@ -354,7 +358,8 @@ def test_raises(self):
 
 class TestDummyEncoder:
     @pytest.mark.skip(
-        reason='AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="A_a") are different'
+        reason="AssertionError: Attributes of "
+        + 'DataFrame.iloc[:, 1] (column name="A_a") are different'
     )
     @pytest.mark.parametrize("daskify", [False, True])
     @pytest.mark.parametrize("values", [True, False])
@@ -634,7 +639,8 @@ def test_transformer_params(self):
 
     mark = pytest.mark.xfail(
         DASK_EXPR_ENABLED,
-        reason="dask-expr: NotImplementedError in assert_eq_df(res_df.iloc[:, 1:], frame, check_dtype=False)",
+        reason="dask-expr: NotImplementedError in "
+        + "assert_eq_df(res_df.iloc[:, 1:], frame, check_dtype=False)",
     )
 
     @pytest.mark.parametrize("daskify", [pytest.param(True, marks=mark), False])
diff --git a/tests/test_incremental_pca.py b/tests/test_incremental_pca.py
index f769dcf56..ea55914b1 100644
--- a/tests/test_incremental_pca.py
+++ b/tests/test_incremental_pca.py
@@ -1,4 +1,5 @@
 """Tests for Incremental PCA."""
+
 import numpy as np
 import pytest
 from dask import array as da
@@ -143,7 +144,8 @@ def test_incremental_pca_inverse():
 
 @pytest.mark.skip(reason="AssertionError: Regex pattern did not match.")
 @pytest.mark.skip(
-    reason="InvalidParameterError: The 'min_batch_size' parameter of gen_batches must be an int in the range [0, inf). Got -1 instead."
+    reason="InvalidParameterError: The 'min_batch_size' parameter of "
+    + "gen_batches must be an int in the range [0, inf). Got -1 instead."
 )
 def test_incremental_pca_validation():
     # Test that n_components is >=1 and <= n_features.
diff --git a/tests/test_kmeans.py b/tests/test_kmeans.py
index 214eabcf1..3aecfe009 100644
--- a/tests/test_kmeans.py
+++ b/tests/test_kmeans.py
@@ -2,6 +2,7 @@
 Mostly just smoke tests, and verifying that the parallel
 implementation is the same as the serial.
""" + import warnings import dask.array as da @@ -88,7 +89,8 @@ def test_basic(self, Xl_blobs_easy): assert_eq(yhat_a.compute(), yhat_b) @pytest.mark.skip( - reason="TypeError: _kmeans_plusplus() missing 1 required positional argument: 'random_state'" + reason="TypeError: _kmeans_plusplus() missing 1 required positional " + + "argument: 'random_state'" ) def test_fit_given_init(self): X, y = sklearn.datasets.make_blobs(n_samples=1000, n_features=4, random_state=1) @@ -104,7 +106,8 @@ def test_fit_given_init(self): assert abs(skkm.inertia_ - dkkm.inertia_) < 0.001 @pytest.mark.skip( - reason="TypeError: _kmeans_plusplus() missing 1 required positional argument: 'sample_weight'" + reason="TypeError: _kmeans_plusplus() missing 1 required positional " + + "argument: 'sample_weight'" ) def test_kmeanspp_init(self, Xl_blobs_easy): X, y = Xl_blobs_easy @@ -118,7 +121,8 @@ def test_kmeanspp_init(self, Xl_blobs_easy): assert dkkm.init == "k-means++" @pytest.mark.skip( - reason="TypeError: _kmeans_plusplus() missing 1 required positional argument: 'sample_weight'" + reason="TypeError: _kmeans_plusplus() missing 1 required positional " + + "argument: 'sample_weight'" ) def test_kmeanspp_init_random_state(self, Xl_blobs_easy): X, y = Xl_blobs_easy diff --git a/tests/test_normalize.py b/tests/test_normalize.py index efa754b63..2e2c2f68b 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -19,7 +19,8 @@ def test_normalize_estimator(): @pytest.mark.skip( - reason="AssertionError: assert 'a2cbc95dc37b...02448a0397857' == '711147ceec8c...bc712ab6dfbce'" + reason="AssertionError: assert " + + "'a2cbc95dc37b...02448a0397857' == '711147ceec8c...bc712ab6dfbce'" ) def test_normalize_estimator_cv(): param_grid = {"C": [0.01]} diff --git a/tests/test_pca.py b/tests/test_pca.py index b7b008c4a..df81c6769 100644 --- a/tests/test_pca.py +++ b/tests/test_pca.py @@ -421,7 +421,8 @@ def test_randomized_pca_check_projection(): @pytest.mark.skip( - reason="TypeError: Got an unsupported type (). Dask-ML's PCA only support Dask Arrays or DataFrames." + reason="TypeError: Got an unsupported type (). Dask-ML's PCA " + + "only support Dask Arrays or DataFrames." ) @pytest.mark.xfail(reason="chunks") def test_randomized_pca_check_list(): @@ -464,7 +465,8 @@ def test_randomized_pca_inverse(): @pytest.mark.skip( - reason="TypeError: Got an unsupported type (). Dask-ML's PCA only support Dask Arrays or DataFrames." + reason="TypeError: Got an unsupported type (). Dask-ML's " + + "PCA only support Dask Arrays or DataFrames." ) @pytest.mark.xfail(reason="MLE") def test_pca_dim(): diff --git a/tests/test_svd.py b/tests/test_svd.py index 72ea44dc7..a2da0b87e 100644 --- a/tests/test_svd.py +++ b/tests/test_svd.py @@ -1,4 +1,5 @@ """Test truncated SVD transformer.""" + import dask.array as da import numpy as np import pytest