From f37b868b2e5a3bb6c09cc89171ada3eb9a9eda13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 30 May 2022 15:49:39 -0500 Subject: [PATCH 1/2] dont copy dataframe on rename --- python-package/lightgbm/basic.py | 2 +- tests/python_package_test/test_basic.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 5e609bca45f9..05a4eb4c69c3 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -537,7 +537,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica if len(data.shape) != 2 or data.shape[0] < 1: raise ValueError('Input data must be 2 dimensional and non empty.') if feature_name == 'auto' or feature_name is None: - data = data.rename(columns=str) + data = data.rename(columns=str, copy=False) cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] if pandas_categorical is None: # train dataset diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index d290bcb7216c..4d409ca96310 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -648,9 +648,7 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype): pd = pytest.importorskip('pandas') X = np.random.rand(10, 2).astype(dtype) df = pd.DataFrame(X) - # feature names are required to not make a copy (rename makes a copy) - feature_name = ['x1', 'x2'] - built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + built_data = lgb.basic._data_from_pandas(df, 'auto', None, None)[0] assert built_data.dtype == dtype assert np.shares_memory(X, built_data) @@ -659,7 +657,7 @@ def test_categorical_code_conversion_doesnt_modify_original_data(): pd = pytest.importorskip('pandas') X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) df = pd.DataFrame(X.copy(), columns=['x1'], dtype='category') - data = lgb.basic._data_from_pandas(df, ['x1'], None, None)[0] + data = lgb.basic._data_from_pandas(df, 'auto', None, None)[0] # check that the original data wasn't modified np.testing.assert_equal(df['x1'], X[:, 0]) # check that the built data has the codes From 072ea474877b1b365c5e1e6f4b89228aeaf3507a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 31 May 2022 23:05:34 -0500 Subject: [PATCH 2/2] test with feature_name and 'auto' --- tests/python_package_test/test_basic.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 4d409ca96310..e9b47a820757 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -644,20 +644,22 @@ def test_custom_objective_safety(): @pytest.mark.parametrize('dtype', [np.float32, np.float64]) -def test_no_copy_when_single_float_dtype_dataframe(dtype): +@pytest.mark.parametrize('feature_name', [['x1', 'x2'], 'auto']) +def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): pd = pytest.importorskip('pandas') X = np.random.rand(10, 2).astype(dtype) df = pd.DataFrame(X) - built_data = lgb.basic._data_from_pandas(df, 'auto', None, None)[0] + built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] assert built_data.dtype == dtype assert np.shares_memory(X, built_data) -def test_categorical_code_conversion_doesnt_modify_original_data(): +@pytest.mark.parametrize('feature_name', [['x1'], 'auto']) +def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): pd = pytest.importorskip('pandas') X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) df = pd.DataFrame(X.copy(), columns=['x1'], dtype='category') - data = lgb.basic._data_from_pandas(df, 'auto', None, None)[0] + data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] # check that the original data wasn't modified np.testing.assert_equal(df['x1'], X[:, 0]) # check that the built data has the codes