Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dask] [python-package] include support for column array as label #3943

Merged
merged 16 commits into from
Feb 24, 2021
Merged
24 changes: 20 additions & 4 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,21 @@ def is_numpy_1d_array(data):
return isinstance(data, np.ndarray) and len(data.shape) == 1


def is_numpy_column_array(data):
    """Check whether data is a column numpy array.

    Parameters
    ----------
    data : object
        Candidate to inspect.

    Returns
    -------
    bool
        True only when ``data`` is a ``numpy.ndarray`` with exactly two
        dimensions whose second dimension has length 1; False otherwise.
    """
    # Non-arrays (lists, Series, ...) are rejected before touching ``.shape``.
    if not isinstance(data, np.ndarray):
        return False
    shape = data.shape
    return len(shape) == 2 and shape[1] == 1


def numpy_1d_array_to_dtype(array, dtype):
    """Convert 1d array to dtype.

    Parameters
    ----------
    array : numpy array
        Array to convert; only its ``dtype`` attribute and ``astype`` method
        are used here.
    dtype : numpy dtype
        Target element type.

    Returns
    -------
    numpy array
        ``array`` itself when it already has the requested dtype, otherwise
        the result of ``array.astype(dtype=dtype, copy=False)``.
    """
    # Hand back the very same object when no conversion is required.
    if array.dtype == dtype:
        return array
    return array.astype(dtype=dtype, copy=False)


def is_1d_list(data):
    """Check whether data is a 1-D list.

    An empty list counts as 1-D. For a non-empty list only the first element
    is inspected (via ``is_numeric``); later elements are not checked.

    Parameters
    ----------
    data : object
        Candidate to inspect.

    Returns
    -------
    bool
        False when ``data`` is not a ``list``; True for an empty list;
        otherwise the result of ``is_numeric(data[0])``.
    """
    return isinstance(data, list) and (not data or is_numeric(data[0]))
Expand All @@ -134,10 +149,11 @@ def is_1d_list(data):
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""Convert data to numpy 1-D array."""
if is_numpy_1d_array(data):
if data.dtype == dtype:
return data
else:
return data.astype(dtype=dtype, copy=False)
return numpy_1d_array_to_dtype(data, dtype)
elif is_numpy_column_array(data):
_log_warning('Converting column-vector to 1d array')
array = data.ravel()
return numpy_1d_array_to_dtype(array, dtype)
jmoralez marked this conversation as resolved.
Show resolved Hide resolved
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif isinstance(data, pd_Series):
Expand Down
31 changes: 31 additions & 0 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from lightgbm.compat import PANDAS_INSTALLED, pd_Series

from .utils import load_breast_cancer

Expand Down Expand Up @@ -375,3 +376,33 @@ def test_choose_param_value():
"num_trees": 81
}
assert original_params == expected_params


@pytest.mark.parametrize(
    'collection',
    ['1d_np', '2d_np', 'pd_float', 'pd_str', '1d_list', '2d_list']
)
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
def test_list_to_1d_numpy(collection, dtype):
    """Check ``list_to_1d_numpy`` on accepted and rejected label containers.

    Expected outcomes per case:

    * ``2d_list`` (list of lists) raises ``TypeError``.
    * ``pd_str`` (object-dtype Series) raises ``ValueError``.
    * ``2d_np`` (column array) emits a 'column-vector' ``UserWarning`` and is
      still converted.
    * Every converted input comes back with 10 elements and the requested
      dtype.
    """
    # Inputs are built lazily inside the test body. Constructing ``pd_Series``
    # objects in the parametrize list would run at collection time, before any
    # skip marker is honoured.
    # NOTE(review): presumably ``pd_Series`` is a placeholder class when pandas
    # is missing, so an eager call would break collection -- confirm against
    # lightgbm.compat.
    builders = {
        '1d_np': lambda: np.random.rand(10),
        '2d_np': lambda: np.random.rand(10, 1),
        'pd_float': lambda: pd_Series(np.random.rand(10)),
        'pd_str': lambda: pd_Series(['a', 'b']),
        '1d_list': lambda: [1] * 10,
        '2d_list': lambda: [[1], [2]],
    }
    # Only the pandas-backed cases need pandas; the rest run regardless.
    if collection.startswith('pd') and not PANDAS_INSTALLED:
        pytest.skip('pandas is not installed')
    y = builders[collection]()

    if collection == '2d_list':
        with pytest.raises(TypeError):
            lgb.basic.list_to_1d_numpy(y)
        return
    if collection == 'pd_str':
        with pytest.raises(ValueError):
            lgb.basic.list_to_1d_numpy(y)
        return

    if collection == '2d_np':
        # Column arrays are accepted, but the caller must be warned.
        with pytest.warns(UserWarning, match='column-vector'):
            result = lgb.basic.list_to_1d_numpy(y, dtype=dtype)
    else:
        result = lgb.basic.list_to_1d_numpy(y, dtype=dtype)
    assert result.size == 10
    assert result.dtype == dtype
39 changes: 39 additions & 0 deletions tests/python_package_test/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -1084,6 +1084,45 @@ def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods):
assert dask_params[param].default == sklearn_params[param].default, error_msg


@pytest.mark.parametrize('task', tasks)
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
    task,
    client,
    listen_port
):
    """Fit a Dask estimator on dataframe features with an (n, 1) label array.

    For each task the label collection is converted to a Dask array, reshaped
    into a single column, and passed to ``fit``; the test passes when the
    model reports ``fitted_``.
    """
    if task == 'ranking':
        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(
            output='dataframe',
            group=None
        )
        model_factory = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, dw = _create_data(
            objective=task,
            output='dataframe',
        )
        # Only the ranker takes query groups.
        dg = None
        if task == 'classification':
            model_factory = lgb.DaskLGBMClassifier
        elif task == 'regression':
            model_factory = lgb.DaskLGBMRegressor
    # Turn the label collection into a Dask array, then into one column.
    dy = dy.to_dask_array(lengths=True)
    dy_col_array = dy.reshape(-1, 1)
    assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1

    params = {
        'n_estimators': 1,
        'num_leaves': 3,
        'random_state': 0,
        'local_listen_port': listen_port,
        'time_out': 5
    }
    model = model_factory(**params)
    model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
    assert model.fitted_
    client.close(timeout=CLIENT_CLOSE_TIMEOUT)


def sklearn_checks_to_run():
check_names = [
"check_estimator_get_tags_default_keys",
Expand Down
35 changes: 34 additions & 1 deletion tests/python_package_test/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import lightgbm as lgb

from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud
from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking

sk_version = parse_version(sk_version)
if sk_version < parse_version("0.23"):
Expand Down Expand Up @@ -1192,3 +1192,36 @@ def test_parameters_default_constructible(estimator):
name, Estimator = estimator.__class__.__name__, estimator.__class__
# Test that estimators are default-constructible
check_parameters_default_constructible(name, Estimator)


@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression'])
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
    """Check that a column-array label trains the same model as a 1-D label.

    Each estimator is fitted twice on the same dataframe, once with the label
    as loaded and once reshaped to a single column, and the two sets of
    predictions must agree.
    """
    pd = pytest.importorskip("pandas")
    # Extra fit arguments; only the ranking task adds any.
    fit_kwargs = {}
    if task == 'ranking':
        X, y, g = make_ranking()
        # NOTE(review): presumably this turns per-row query ids into
        # per-query group sizes -- confirm against make_ranking.
        fit_kwargs['group'] = np.bincount(g)
        model_factory = lgb.LGBMRanker
    elif task == 'classification':
        X, y = load_iris(return_X_y=True)
        model_factory = lgb.LGBMClassifier
    elif task == 'regression':
        X, y = load_boston(return_X_y=True)
        model_factory = lgb.LGBMRegressor
    X = pd.DataFrame(X)
    y_col_array = y.reshape(-1, 1)
    params = {
        'n_estimators': 1,
        'num_leaves': 3,
        'random_state': 0
    }
    # Both fits run inside the warning check; the block must see a
    # 'column-vector' UserWarning at least once.
    with pytest.warns(UserWarning, match='column-vector'):
        model_1d = model_factory(**params).fit(X, y, **fit_kwargs)
        model_2d = model_factory(**params).fit(X, y_col_array, **fit_kwargs)

    preds_1d = model_1d.predict(X)
    preds_2d = model_2d.predict(X)
    assert np.allclose(preds_1d, preds_2d)
jmoralez marked this conversation as resolved.
Show resolved Hide resolved