From 155f10f2584467c4db2e02fab0d0230159140a2b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 11 Feb 2021 19:03:22 -0800 Subject: [PATCH 01/35] fix issues with updating to latest pandas --- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/core/column/categorical.py | 6 ++++ python/cudf/cudf/core/column/column.py | 4 +-- python/cudf/cudf/core/column/datetime.py | 33 ++++++++++++++------- python/cudf/cudf/core/column/timedelta.py | 15 ++++------ python/cudf/cudf/tests/test_dataframe.py | 8 ++++- python/cudf/cudf/tests/test_datetime.py | 10 +++++-- python/cudf/cudf/tests/test_reductions.py | 22 +++++++++++--- python/cudf/cudf/tests/test_replace.py | 29 ++++++++++++------ python/cudf/cudf/tests/test_string.py | 2 +- python/cudf/cudf/tests/test_timedelta.py | 14 ++------- 11 files changed, 93 insertions(+), 51 deletions(-) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e18a204eedb..de19acf9ba4 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -6,3 +6,4 @@ PANDAS_VERSION = version.parse(pd.__version__) PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0") PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1") +PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2.0") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 01c8dfb5f1b..a1d4a2668dd 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1204,6 +1204,12 @@ def fillna( raise ValueError(err_msg) from err else: fill_value = column.as_column(fill_value, nan_as_null=False) + if isinstance(fill_value, CategoricalColumn): + if self.dtype != fill_value.dtype: + raise ValueError( + "Cannot set a Categorical with another, " + "without identical categories" + ) # TODO: only required if fill_value has a subset of the # categories: fill_value = fill_value.cat()._set_categories( diff --git 
a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d615a7cfae4..2344b785dbf 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1236,7 +1236,7 @@ def sum( def product( self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 ): - raise TypeError(f"cannot perform prod with type {self.dtype}") + raise TypeError(f"cannot perform product with type {self.dtype}") def mean(self, skipna: bool = None, dtype: Dtype = None): raise TypeError(f"cannot perform mean with type {self.dtype}") @@ -1248,7 +1248,7 @@ def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): raise TypeError(f"cannot perform var with type {self.dtype}") def kurtosis(self, skipna: bool = None): - raise TypeError(f"cannot perform kurt with type {self.dtype}") + raise TypeError(f"cannot perform kurtosis with type {self.dtype}") def skew(self, skipna: bool = None): raise TypeError(f"cannot perform skew with type {self.dtype}") diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 6029052c1d3..1d706687489 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -13,11 +13,17 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DatetimeLikeScalar, Dtype, DtypeObj, ScalarLike +from cudf.core._compat import PANDAS_GE_120 from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import is_scalar from cudf.utils.utils import _fillna_natwise +if PANDAS_GE_120: + _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format +else: + _guess_datetime_format = pd.core.tools.datetimes._guess_datetime_format + # nanoseconds per time_unit _numpy_to_pandas_conversion = { "ns": 1, @@ -235,6 +241,19 @@ def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: unit=self.time_unit, ) + def std( + self, skipna: bool = None, ddof: int = 
1, dtype: Dtype = np.float64 + ) -> pd.Timedelta: + return pd.Timedelta( + self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype) + * _numpy_to_pandas_conversion[self.time_unit], + ) + + def median(self, skipna: bool = None) -> pd.Timestamp: + return pd.Timestamp( + self.as_numerical.median(skipna=skipna), unit=self.time_unit + ) + def quantile( self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> ColumnBase: @@ -375,7 +394,7 @@ def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes cares for `ms` and `ns` """ - fmt = pd.core.tools.datetimes._guess_datetime_format(element, **kwargs) + fmt = _guess_datetime_format(element, **kwargs) if fmt is not None: return fmt @@ -389,15 +408,11 @@ def infer_format(element: str, **kwargs) -> str: second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1) subsecond_fmt = ".%" + str(len(second_parts[0])) + "f" - first_part = pd.core.tools.datetimes._guess_datetime_format( - element_parts[0], **kwargs - ) + first_part = _guess_datetime_format(element_parts[0], **kwargs) # For the case where first_part is '00:00:03' if first_part is None: tmp = "1970-01-01 " + element_parts[0] - first_part = pd.core.tools.datetimes._guess_datetime_format( - tmp, **kwargs - ).split(" ", 1)[1] + first_part = _guess_datetime_format(tmp, **kwargs).split(" ", 1)[1] if first_part is None: raise ValueError("Unable to infer the timestamp format from the data") @@ -411,9 +426,7 @@ def infer_format(element: str, **kwargs) -> str: if len(second_part) > 1: # Only infer if second_parts is not an empty string. 
- second_part = pd.core.tools.datetimes._guess_datetime_format( - second_part, **kwargs - ) + second_part = _guess_datetime_format(second_part, **kwargs) else: second_part = "" diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 75509df4ec6..17222f16673 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -380,15 +380,12 @@ def quantile( def sum( self, skipna: bool = None, dtype: Dtype = None, min_count=0 ) -> pd.Timedelta: - if len(self) == 0: - return pd.Timedelta(None, unit=self.time_unit) - else: - return pd.Timedelta( - self.as_numerical.sum( - skipna=skipna, dtype=dtype, min_count=min_count - ), - unit=self.time_unit, - ) + return pd.Timedelta( + self.as_numerical.sum( + skipna=skipna, dtype=dtype, min_count=min_count + ), + unit=self.time_unit, + ) def std( self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d8005911fcd..78105561729 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -72,7 +72,13 @@ def test_init_via_list_of_empty_tuples(rows): pdf = pd.DataFrame(data) gdf = gd.DataFrame(data) - assert_eq(pdf, gdf, check_like=True) + assert_eq( + pdf, + gdf, + check_like=True, + check_column_type=False, + check_index_type=False, + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 044c8bd5954..cffe640d1f9 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1182,7 +1182,7 @@ def test_datetime_stats(data, dtype, stat): assert_eq(expected, actual) -@pytest.mark.parametrize("op", ["max", "min"]) +@pytest.mark.parametrize("op", ["max", "min", "std", "median"]) @pytest.mark.parametrize( "data", [ @@ -1201,10 +1201,14 @@ def test_datetime_reductions(data, op, dtype): 
actual = getattr(sr, op)() expected = getattr(psr, op)() - if np.isnat(expected.to_numpy()) and np.isnat(actual): + if ( + expected is pd.NaT + and actual is pd.NaT + or (np.isnat(expected.to_numpy()) and np.isnat(actual)) + ): assert True else: - assert_eq(expected.to_numpy(), actual) + assert_eq(expected, actual) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index faf895b8f42..323f8c62892 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -3,6 +3,7 @@ from __future__ import division, print_function import random +import re from itertools import product import numpy as np @@ -166,15 +167,20 @@ def test_date_minmax(): @pytest.mark.parametrize( - "op", - ["sum", "product", "std", "var", "median", "kurt", "kurtosis", "skew"], + "op", ["sum", "product", "var", "kurt", "kurtosis", "skew"], ) def test_datetime_unsupported_reductions(op): gsr = cudf.Series([1, 2, 3, None], dtype="datetime64[ns]") psr = gsr.to_pandas() utils.assert_exceptions_equal( - lfunc=getattr(psr, op), rfunc=getattr(gsr, op), + lfunc=getattr(psr, op), + rfunc=getattr(gsr, op), + expected_error_message=re.escape( + "cannot perform " + + ("kurtosis" if op == "kurt" else op) + + " with type datetime64[ns]" + ), ) @@ -183,7 +189,15 @@ def test_timedelta_unsupported_reductions(op): gsr = cudf.Series([1, 2, 3, None], dtype="timedelta64[ns]") psr = gsr.to_pandas() - utils.assert_exceptions_equal(getattr(psr, op), getattr(gsr, op)) + utils.assert_exceptions_equal( + lfunc=getattr(psr, op), + rfunc=getattr(gsr, op), + expected_error_message=re.escape( + "cannot perform " + + ("kurtosis" if op == "kurt" else op) + + " with type timedelta64[ns]" + ), + ) @pytest.mark.parametrize("op", ["sum", "product", "std", "var"]) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index f4713b19015..a7f4d1a527a 100644 --- 
a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -333,7 +333,7 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): @pytest.mark.parametrize( - "psr", + "psr_data", [ pd.Series(["a", "b", "a", None, "c", None], dtype="category"), pd.Series( @@ -373,8 +373,8 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_categorical(psr, fill_value, inplace): - +def test_fillna_categorical(psr_data, fill_value, inplace): + psr = psr_data.copy() gsr = Series.from_pandas(psr) if isinstance(fill_value, pd.Series): @@ -382,14 +382,25 @@ def test_fillna_categorical(psr, fill_value, inplace): else: fill_value_cudf = fill_value - expected = psr.fillna(fill_value, inplace=inplace) - got = gsr.fillna(fill_value_cudf, inplace=inplace) + if ( + isinstance(fill_value_cudf, cudf.Series) + and gsr.dtype != fill_value_cudf.dtype + ): + assert_exceptions_equal( + lfunc=psr.fillna, + rfunc=gsr.fillna, + lfunc_args_and_kwargs=([fill_value], {"inplace": inplace}), + rfunc_args_and_kwargs=([fill_value_cudf], {"inplace": inplace}), + ) + else: + expected = psr.fillna(fill_value, inplace=inplace) + got = gsr.fillna(fill_value_cudf, inplace=inplace) - if inplace: - expected = psr - got = gsr + if inplace: + expected = psr + got = gsr - assert_eq(expected, got) + assert_eq(expected, got) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 080420c8f75..f2748f5053c 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -2843,7 +2843,7 @@ def test_string_product(): lfunc=psr.product, rfunc=sr.product, expected_error_message=re.escape( - f"cannot perform prod with type {sr.dtype}" + f"cannot perform product with type {sr.dtype}" ), ) diff --git a/python/cudf/cudf/tests/test_timedelta.py 
b/python/cudf/cudf/tests/test_timedelta.py index 3b625a5ad85..0b886dcef9e 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -597,18 +597,8 @@ def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): [1000000, 200000, 3000000], [1000000, 200000, None], [], - pytest.param( - [None], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35644" - ), - ), - pytest.param( - [None, None, None, None, None], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35644" - ), - ), + [None], + [None, None, None, None, None], [12, 12, 22, 343, 4353534, 435342], np.array([10, 20, 30, None, 100]), cp.asarray([10, 20, 30, 100]), From 454ecf5c2d5f6e24d01433617e93cd305eed78a1 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 12 Feb 2021 12:32:57 -0800 Subject: [PATCH 02/35] remove xfails and fix issues --- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 20 +-- python/cudf/cudf/tests/test_dropna.py | 30 ++-- python/cudf/cudf/tests/test_index.py | 23 --- python/cudf/cudf/tests/test_json.py | 9 +- python/cudf/cudf/tests/test_replace.py | 12 +- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_string.py | 177 ++++++++--------------- python/cudf/cudf/tests/test_timedelta.py | 28 +--- 9 files changed, 89 insertions(+), 214 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 21f504ea684..aa5172a9a89 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5189,7 +5189,7 @@ def _get_cols_list(parent_obj, others): ] return cols_list - elif others is not None: + elif others is not None and not isinstance(others, StringMethods): if ( parent_index is not None and isinstance(others, cudf.Series) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 
78105561729..a93796e46f7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1860,6 +1860,9 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count): psr = pd.DataFrame(data) gsr = gd.DataFrame(data) + if psr.shape[0] * psr.shape[1] < min_count: + pytest.xfail("https://github.com/pandas-dev/pandas/issues/39738") + assert_eq( getattr(psr, ops)(skipna=skipna, min_count=min_count), getattr(gsr, ops)(skipna=skipna, min_count=min_count), @@ -4294,16 +4297,11 @@ def test_isin_dataframe(data, values): rfunc_args_and_kwargs=([values],), ) else: - try: - expected = pdf.isin(values) - except ValueError as e: - if str(e) == "Lengths must match.": - # xref https://github.com/pandas-dev/pandas/issues/34256 - pytest.xfail( - "https://github.com/pandas-dev/pandas/issues/34256" - ) + expected = pdf.isin(values) + if isinstance(values, (pd.DataFrame, pd.Series)): values = gd.from_pandas(values) + got = gdf.isin(values) assert_eq(got, expected) @@ -4907,17 +4905,13 @@ def test_rowwise_ops_datetime_dtypes_2(data, op, skipna): ], ) def test_rowwise_ops_datetime_dtypes_pdbug(data): - """ - Pandas bug: https://github.com/pandas-dev/pandas/issues/36907 - """ pdf = pd.DataFrame(data) gdf = gd.from_pandas(pdf) expected = pdf.max(axis=1, skipna=False) got = gdf.max(axis=1, skipna=False) - with pytest.raises(AssertionError, match="numpy array are different"): - assert_eq(got, expected) + assert_eq(got, expected) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 08378361188..ddd569acf0d 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -40,14 +40,12 @@ def test_dropna_series(data, nulls, inplace): if gsr.null_count == len(gsr): check_dtype = False + expected = psr.dropna() + actual = gsr.dropna() + if inplace: - psr.dropna() - gsr.dropna() expected = psr actual = gsr - else: - expected = psr.dropna() - actual = 
gsr.dropna() assert_eq(expected, actual, check_dtype=check_dtype) @@ -71,14 +69,12 @@ def test_dropna_dataframe(data, how, axis, inplace): pdf = pd.DataFrame(data) gdf = cudf.from_pandas(pdf) + expected = pdf.dropna(axis=axis, how=how, inplace=inplace) + actual = gdf.dropna(axis=axis, how=how, inplace=inplace) + if inplace: - pdf.dropna(axis=axis, how=how, inplace=inplace) - gdf.dropna(axis=axis, how=how, inplace=inplace) expected = pdf actual = gdf - else: - expected = pdf.dropna(axis=axis, how=how, inplace=inplace) - actual = gdf.dropna(axis=axis, how=how, inplace=inplace) assert_eq(expected, actual) @@ -192,18 +188,14 @@ def test_dropna_thresh_cols(thresh, subset, inplace): ) gdf = cudf.from_pandas(pdf) + expected = pdf.dropna( + axis=1, thresh=thresh, subset=subset, inplace=inplace + ) + actual = gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) + if inplace: - pdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) - gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) expected = pdf actual = gdf - else: - expected = pdf.dropna( - axis=1, thresh=thresh, subset=subset, inplace=inplace - ) - actual = gdf.dropna( - axis=1, thresh=thresh, subset=subset, inplace=inplace - ) assert_eq( expected, actual, diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f806b0a912c..38d3bb0542d 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -798,14 +798,6 @@ def test_index_difference(data, other, sort): gd_data = cudf.core.index.as_index(data) gd_other = cudf.core.index.as_index(other) - if ( - gd_data.dtype.kind == "f" - and gd_other.dtype.kind != "f" - or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f") - ): - pytest.xfail( - "Bug in Pandas: https://github.com/pandas-dev/pandas/issues/35217" - ) expected = pd_data.difference(pd_other, sort=sort) actual = gd_data.difference(gd_other, sort=sort) assert_eq(expected, actual) @@ -864,12 +856,6 @@ 
def test_index_equals(data, other): gd_data = cudf.core.index.as_index(data) gd_other = cudf.core.index.as_index(other) - if ( - gd_data.dtype.kind == "f" or gd_other.dtype.kind == "f" - ) and cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): - pytest.xfail( - "Bug in Pandas: https://github.com/pandas-dev/pandas/issues/35217" - ) expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) assert_eq(expected, actual) @@ -916,15 +902,6 @@ def test_index_categories_equal(data, other): gd_data = cudf.core.index.as_index(data).astype("category") gd_other = cudf.core.index.as_index(other) - if ( - gd_data.dtype.kind == "f" - and gd_other.dtype.kind != "f" - or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f") - ): - pytest.xfail( - "Bug in Pandas: https://github.com/pandas-dev/pandas/issues/35217" - ) - expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index e032309bdbd..fe365f4e120 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -133,14 +133,7 @@ def test_json_writer(tmpdir, pdf, gdf): assert os.path.exists(pdf_series_fname) assert os.path.exists(gdf_series_fname) - try: - # xref 'https://github.com/pandas-dev/pandas/pull/33373') - expect_series = pd.read_json(pdf_series_fname, typ="series") - except TypeError as e: - if str(e) == " is not convertible to datetime": - continue - else: - raise e + expect_series = pd.read_json(pdf_series_fname, typ="series") got_series = pd.read_json(gdf_series_fname, typ="series") assert_eq(expect_series, got_series) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index a7f4d1a527a..e7baa4ee926 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -374,7 +374,7 @@ def test_fillna_method_numerical(data, container, data_dtype, method, 
inplace): ) @pytest.mark.parametrize("inplace", [True, False]) def test_fillna_categorical(psr_data, fill_value, inplace): - psr = psr_data.copy() + psr = psr_data.copy(deep=True) gsr = Series.from_pandas(psr) if isinstance(fill_value, pd.Series): @@ -404,7 +404,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace): @pytest.mark.parametrize( - "psr", + "psr_data", [ pd.Series(pd.date_range("2010-01-01", "2020-01-10", freq="1y")), pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), @@ -486,7 +486,8 @@ def test_fillna_categorical(psr_data, fill_value, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_datetime(psr, fill_value, inplace): +def test_fillna_datetime(psr_data, fill_value, inplace): + psr = psr_data.copy(deep=True) gsr = cudf.from_pandas(psr) if isinstance(fill_value, pd.Series): @@ -645,7 +646,7 @@ def test_fillna_dataframe(df, value, inplace): @pytest.mark.parametrize( - "psr", + "ps_data", [ pd.Series(["a", "b", "c", "d"]), pd.Series([None] * 4, dtype="object"), @@ -666,7 +667,8 @@ def test_fillna_dataframe(df, value, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_string(psr, fill_value, inplace): +def test_fillna_string(ps_data, fill_value, inplace): + psr = ps_data.copy(deep=True) gsr = cudf.from_pandas(psr) if isinstance(fill_value, pd.Series): diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 1b628142939..abe641c1943 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -114,7 +114,7 @@ def test_series_set_item(psr, arg): ], ) def test_setitem_dataframe_series_inplace(df): - pdf = df + pdf = df.copy(deep=True) gdf = cudf.from_pandas(pdf) pdf["a"].replace(1, 500, inplace=True) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index f2748f5053c..997249e3140 100644 --- a/python/cudf/cudf/tests/test_string.py +++ 
b/python/cudf/cudf/tests/test_string.py @@ -340,18 +340,8 @@ def _cat_convert_seq_to_cudf(others): ("f", "g", "h", "i", "j"), pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pytest.param( - pd.Index(["f", "g", "h", "i", "j"]), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), - pytest.param( - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), ( np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), @@ -376,36 +366,26 @@ def _cat_convert_seq_to_cudf(others): pd.Series(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), ), - pytest.param( - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), - pytest.param( - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", 
"i", "j"]), ), + [ + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ], [ pd.Series(["hello", "world", "abc", "xyz", "pqr"]), pd.Series(["abc", "xyz", "hello", "pqr", "world"]), @@ -507,18 +487,8 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): ("f", "g", "h", "i", "j"), pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pytest.param( - pd.Index(["f", "g", "h", "i", "j"]), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), - pytest.param( - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), ( np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), @@ -531,36 +501,26 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["f", "g", "h", "i", "j"]), ], - pytest.param( - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), - pytest.param( - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - 
pd.Index(["f", "g", "h", "i", "j"]), - ], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), ), + [ + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ], [ pd.Series( ["hello", "world", "abc", "xyz", "pqr"], @@ -611,16 +571,7 @@ def test_string_index_str_cat(data, others, sep, na_rep, name): @pytest.mark.parametrize( - "data", - [ - pytest.param( - ["a", None, "c", None, "e"], - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/5862" - ), - ), - ["a", "b", "c", "d", "a"], - ], + "data", [["a", None, "c", None, "e"], ["a", "b", "c", "d", "a"]], ) @pytest.mark.parametrize( "others", @@ -628,18 +579,8 @@ def test_string_index_str_cat(data, others, sep, na_rep, name): None, ["f", "g", "h", "i", "j"], pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pytest.param( - pd.Index(["f", "g", "h", "i", "j"]), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), - pytest.param( - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), [ np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), @@ -732,16 +673,20 @@ def test_string_index_duplicate_str_cat(data, others, sep, na_rep, name): ) 
-@pytest.mark.xfail(raises=ValueError) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) -@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) -def test_string_cat_str(ps_gs, sep, na_rep): - ps, gs = ps_gs - - got = gs.str.cat(gs.str, sep=sep, na_rep=na_rep) - expect = ps.str.cat(ps.str, sep=sep, na_rep=na_rep) - - assert_eq(expect, got) +def test_string_cat_str_error(): + gs = cudf.Series(["a", "v", "s"]) + # https://github.com/pandas-dev/pandas/issues/28277 + # ability to pass StringMethods is being removed in future. + with pytest.raises( + TypeError, + match=re.escape( + "others must be Series, Index, DataFrame, np.ndarrary " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ), + ): + gs.str.cat(gs.str) @pytest.mark.xfail(raises=(NotImplementedError, AttributeError)) @@ -847,10 +792,6 @@ def test_string_upper(ps_gs): @pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) @pytest.mark.parametrize("expand", [True, False, None]) def test_string_split(data, pat, n, expand): - - if data in (["a b", " c ", " d", "e ", "f"],) and pat is None: - pytest.xfail("None pattern split algorithm not implemented yet") - ps = pd.Series(data, dtype="str") gs = Series(data, dtype="str") diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 0b886dcef9e..85a4b6f70b6 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -434,19 +434,7 @@ def test_timedelta_dataframe_ops(df, op): ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) @pytest.mark.parametrize( - "op", - [ - "add", - "sub", - "truediv", - "mod", - pytest.param( - "floordiv", - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" - ), - ), - ], + "op", ["add", "sub", "truediv", "mod", "floordiv"], ) def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): gsr = 
cudf.Series(data=data, dtype=dtype) @@ -534,19 +522,7 @@ def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) @pytest.mark.parametrize( - "op", - [ - "add", - "sub", - "truediv", - "mod", - pytest.param( - "floordiv", - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" - ), - ), - ], + "op", ["add", "sub", "truediv", "mod", "floordiv"], ) def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): gpu_scalar = cudf.Scalar(cpu_scalar) From 303c77da5e23032441a605654529af1793f77143 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 22 Feb 2021 11:03:02 -0800 Subject: [PATCH 03/35] fix isin and misc tests --- python/cudf/cudf/core/column/categorical.py | 46 +++++++++++++++++++++ python/cudf/cudf/core/column/column.py | 33 +++++++-------- python/cudf/cudf/core/column/datetime.py | 31 ++++++++++++++ python/cudf/cudf/core/column/numerical.py | 29 +++++++++++++ python/cudf/cudf/core/column/timedelta.py | 35 ++++++++++++++++ python/cudf/cudf/tests/test_csv.py | 1 - python/cudf/cudf/tests/test_dataframe.py | 22 ++++------ python/cudf/cudf/tests/test_datetime.py | 3 -- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_indexing.py | 1 - python/cudf/cudf/tests/test_joining.py | 4 -- python/cudf/cudf/tests/test_numerical.py | 10 +---- python/cudf/cudf/tests/test_ops.py | 1 + python/cudf/cudf/tests/test_repr.py | 3 +- python/cudf/cudf/tests/test_reshape.py | 5 +-- python/cudf/cudf/tests/test_setitem.py | 19 ++++++--- python/cudf/cudf/tests/test_sorting.py | 5 ++- python/cudf/cudf/tests/test_timedelta.py | 3 ++ 18 files changed, 189 insertions(+), 64 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a1d4a2668dd..7c4fa42b93a 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -9,6 +9,7 @@ Dict, 
Mapping, Optional, + Sequence, Tuple, Union, cast, @@ -867,6 +868,35 @@ def set_base_data(self, value): else: super().set_base_data(value) + def isin(self, values: Sequence) -> ColumnBase: + if cudf.utils.dtypes.is_scalar(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{type(values).__name__}]" + ) + + lhs = self + rhs = None + + try: + # We need to convert values to same type as self, + # hence passing dtype=self.dtype + rhs = cudf.core.column.as_column(values, dtype=self.dtype) + + if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: + return cudf.core.column.full(len(self), False, dtype="bool") + + # Short-circuit if rhs is all null. + if lhs.null_count == 0 and (rhs.null_count == len(rhs)): + return cudf.core.column.full(len(self), False, dtype="bool") + except ValueError: + # pandas functionally returns all False when cleansing via + # typecasting fails + return cudf.core.column.full(len(self), False, dtype="bool") + + res = lhs._obtain_isin_result(rhs) + return res + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) self._codes = None @@ -936,6 +966,22 @@ def unary_operator(self, unaryop: str): ) def __setitem__(self, key, value): + if cudf.utils.dtypes.is_scalar(value): + new_values = [value] + else: + new_values = value + + to_add_categories = cudf.Index(new_values).difference(self.categories) + + if ( + len(to_add_categories) + and not to_add_categories.isna()._values.all() + ): + raise ValueError( + "Cannot setitem on a Categorical with a new " + "category, set the categories first" + ) + if cudf.utils.dtypes.is_scalar(value): value = self._encode(value) if value is not None else value else: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2344b785dbf..7daf8143338 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -855,9 +855,14 @@ def isin(self, values: 
Sequence) -> ColumnBase: rhs = None try: - # We need to convert values to same type as self, - # hence passing dtype=self.dtype - rhs = as_column(values, dtype=self.dtype) + rhs = as_column(values, nan_as_null=False) + if lhs.null_count == len(lhs): + lhs = lhs.astype(rhs.dtype) + elif rhs.null_count == len(rhs): + rhs = rhs.astype(lhs.dtype) + + if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: + return full(len(self), False, dtype="bool") # Short-circuit if rhs is all null. if lhs.null_count == 0 and (rhs.null_count == len(rhs)): @@ -867,28 +872,18 @@ def isin(self, values: Sequence) -> ColumnBase: # typecasting fails return full(len(self), False, dtype="bool") - # If categorical, combine categories first - if is_categorical_dtype(lhs): - lhs_cats = lhs.cat().categories._values - rhs_cats = rhs.cat().categories._values - - if not np.issubdtype(rhs_cats.dtype, lhs_cats.dtype): - # If they're not the same dtype, short-circuit if the values - # list doesn't have any nulls. If it does have nulls, make - # the values list a Categorical with a single null - if not rhs.has_nulls: - return full(len(self), False, dtype="bool") - rhs = as_column(pd.Categorical.from_codes([-1], categories=[])) - rhs = rhs.cat().set_categories(lhs_cats).astype(self.dtype) - - ldf = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))}) + res = lhs._obtain_isin_result(rhs) + + return res + + def _obtain_isin_result(self, rhs): + ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) rdf = cudf.DataFrame( {"x": rhs, "bool": full(len(rhs), True, dtype="bool")} ) res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order") res = res.drop_duplicates(subset="orig_order", ignore_index=True) res = res._data["bool"].fillna(False) - return res def as_mask(self) -> Buffer: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1d706687489..c72f8f641c4 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ 
b/python/cudf/cudf/core/column/datetime.py @@ -335,6 +335,37 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: def is_unique(self) -> bool: return self.as_numerical.is_unique + def isin(self, values: Sequence) -> ColumnBase: + if cudf.utils.dtypes.is_scalar(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{type(values).__name__}]" + ) + + lhs = self + rhs = None + + try: + rhs = cudf.core.column.as_column(values) + + if rhs.dtype.kind in {"f", "i", "u"}: + return cudf.core.column.full(len(self), False, dtype="bool") + rhs = rhs.astype(self.dtype) + + if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: + return cudf.core.column.full(len(self), False, dtype="bool") + + # Short-circuit if rhs is all null. + if lhs.null_count == 0 and (rhs.null_count == len(rhs)): + return cudf.core.column.full(len(self), False, dtype="bool") + except ValueError: + # pandas functionally returns all False when cleansing via + # typecasting fails + return cudf.core.column.full(len(self), False, dtype="bool") + + res = lhs._obtain_isin_result(rhs) + return res + def can_cast_safely(self, to_dtype: Dtype) -> bool: if np.issubdtype(to_dtype, np.datetime64): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 0a8d93c913b..3ff03147583 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -248,6 +248,35 @@ def std( ) -> float: return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) + def isin(self, values: Sequence) -> ColumnBase: + if cudf.utils.dtypes.is_scalar(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{type(values).__name__}]" + ) + + lhs = self + rhs = None + + try: + rhs = as_column(values, nan_as_null=False) + if isinstance(rhs, NumericalColumn): + rhs = rhs.astype(dtype=self.dtype) + + if not (rhs.null_count 
== len(rhs)) and lhs.dtype != rhs.dtype: + return cudf.core.column.full(len(self), False, dtype="bool") + + # Short-circuit if rhs is all null. + if lhs.null_count == 0 and (rhs.null_count == len(rhs)): + return cudf.core.column.full(len(self), False, dtype="bool") + except ValueError: + # pandas functionally returns all False when cleansing via + # typecasting fails + return cudf.core.column.full(len(self), False, dtype="bool") + + res = lhs._obtain_isin_result(rhs) + return res + def sum_of_squares(self, dtype: Dtype = None) -> float: return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 17222f16673..7107b66d26c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -367,6 +367,41 @@ def median(self, skipna: bool = None) -> pd.Timedelta: self.as_numerical.median(skipna=skipna), unit=self.time_unit ) + def isin(self, values: Sequence) -> ColumnBase: + if cudf.utils.dtypes.is_scalar(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{type(values).__name__}]" + ) + + lhs = self + rhs = None + + try: + # We need to convert values to same type as self, + # hence passing dtype=self.dtype + rhs = cudf.core.column.as_column(values) + + if rhs.dtype.kind in {"f", "i", "u"}: + return cudf.core.column.full(len(self), False, dtype="bool") + + rhs = rhs.astype(self.dtype) + + if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: + return cudf.core.column.full(len(self), False, dtype="bool") + + # Short-circuit if rhs is all null. 
+ if lhs.null_count == 0 and (rhs.null_count == len(rhs)): + return cudf.core.column.full(len(self), False, dtype="bool") + except ValueError: + # pandas functionally returns all False when cleansing via + # typecasting fails + return cudf.core.column.full(len(self), False, dtype="bool") + + res = lhs._obtain_isin_result(rhs) + + return res + def quantile( self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> "column.ColumnBase": diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 23a950bb72d..31d502e4a23 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1838,7 +1838,6 @@ def test_csv_reader_timedetla_dtypes(dtype): assert_eq(expected, actual) -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/6719") @pytest.mark.parametrize( "dtype", sorted(list(cudf.utils.dtypes.DATETIME_TYPES)) ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a93796e46f7..96e77bd6823 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1502,7 +1502,6 @@ def test_dataframe_cupy_array_wrong_index(): gd.DataFrame(d_ary, index="a") -@pytest.mark.xfail(reason="constructor does not coerce index inputs") def test_index_in_dataframe_constructor(): a = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) b = gd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) @@ -3990,10 +3989,12 @@ def test_value_counts(): def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) psr = pd.Series(data, index=index) - gsr = gd.Series.from_pandas(psr) + gsr = gd.Series.from_pandas(psr, nan_as_null=False) - got = gsr.isin(values) expected = psr.isin(values) + print(expected) + got = gsr.isin(values) + assert_eq(got, expected) @@ -4066,15 +4067,7 @@ def test_isin_datetime(data, values): ["this", "is"], [None, None, None], ["12", "14", "19"], - pytest.param( - [12, 14, 19], 
- marks=[ - pytest.mark.xfail( - reason="pandas's failure here seems like a bug " - "given the reverse succeeds" - ) - ], - ), + [12, 14, 19], ["is", "this", "is", "this", "is"], ], ) @@ -8094,9 +8087,8 @@ def test_agg_for_dataframes(data, aggs): pdf = pd.DataFrame(data) gdf = gd.DataFrame(data) - expect = pdf.agg(aggs) - got = gdf.agg(aggs) - + expect = pdf.agg(aggs).sort_index() + got = gdf.agg(aggs).sort_index() assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index cffe640d1f9..7e545022eb8 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -413,9 +413,6 @@ def test_datetime_to_arrow(dtype): def test_datetime_unique(data, nulls): psr = pd.Series(data) - print(data) - print(nulls) - if len(data) > 0: if nulls == "some": p = np.random.randint(0, len(data), 2) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 38d3bb0542d..9e401316e19 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -565,7 +565,7 @@ def test_empty_df_head_tail_index(n): ( pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) != "a", - "h", + "a", None, ), ( diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 5229881df25..b4558cec01f 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -973,7 +973,6 @@ def test_series_setitem_datetime(): assert_eq(psr, gsr) -@pytest.mark.xfail(reason="Pandas will coerce to object datatype here") def test_series_setitem_datetime_coerced(): psr = pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]") gsr = cudf.from_pandas(psr) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index d99897584ec..d7735f9029f 100644 --- 
a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -540,10 +540,6 @@ def test_empty_joins(how, left_empty, right_empty): assert len(expected) == len(result) -@pytest.mark.xfail( - reason="left_on/right_on produces undefined results with 0" - "index and is disabled" -) def test_merge_left_index_zero(): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) right = pd.DataFrame( diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index a2afa9f0a97..17f73121b1c 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -87,9 +87,6 @@ def test_can_cast_safely_mixed_kind(): assert not data.can_cast_safely(to_dtype) -@pytest.mark.xfail( - reason="cuDF null <-> pd.NA compatibility not yet supported" -) def test_to_pandas_nullable_integer(): gsr_not_null = Series([1, 2, 3]) gsr_has_null = Series([1, 2, None]) @@ -98,12 +95,9 @@ def test_to_pandas_nullable_integer(): psr_has_null = pd.Series([1, 2, None], dtype="Int64") assert_eq(gsr_not_null.to_pandas(), psr_not_null) - assert_eq(gsr_has_null.to_pandas(), psr_has_null) + assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) -@pytest.mark.xfail( - reason="cuDF null <-> pd.NA compatibility not yet supported" -) def test_to_pandas_nullable_bool(): gsr_not_null = Series([True, False, True]) gsr_has_null = Series([True, False, None]) @@ -112,7 +106,7 @@ def test_to_pandas_nullable_bool(): psr_has_null = pd.Series([True, False, None], dtype="boolean") assert_eq(gsr_not_null.to_pandas(), psr_not_null) - assert_eq(gsr_has_null.to_pandas(), psr_has_null) + assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) def test_can_cast_safely_has_nulls(): diff --git a/python/cudf/cudf/tests/test_ops.py b/python/cudf/cudf/tests/test_ops.py index 888380bc559..981b0e833a0 100644 --- a/python/cudf/cudf/tests/test_ops.py +++ b/python/cudf/cudf/tests/test_ops.py @@ -27,6 +27,7 @@ def 
test_sqrt_integer(): def math_op_test( dtype, fn, nelem=128, test_df=False, positive_only=False, check_dtype=True ): + np.random.seed(0) randvals = gen_rand(dtype, nelem, positive_only=positive_only) h_series = pd.Series(randvals.astype(dtype)) d_series = cudf.Series(h_series) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 8c09dc91253..16c24d5afaa 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1169,8 +1169,7 @@ def test_timedelta_index_repr(index, expected_repr): def test_mulitIndex_repr(pmi, max_seq_items): pd.set_option("display.max_seq_items", max_seq_items) gmi = cudf.from_pandas(pmi) - print(gmi) - print(pmi) + assert gmi.__repr__() == pmi.__repr__() pd.reset_option("display.max_seq_items") diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 315762c931f..030cbe7977d 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -71,9 +71,7 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( - "dtype", - list(NUMERIC_TYPES + DATETIME_TYPES) - + [pytest.param("str", marks=pytest.mark.xfail())], + "dtype", list(NUMERIC_TYPES + DATETIME_TYPES) + ["str"], ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_df_stack(nulls, num_cols, num_rows, dtype): @@ -102,7 +100,6 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): ) assert_eq(expect, got) - pass @pytest.mark.parametrize("num_rows", [1, 2, 10, 1000]) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index abe641c1943..ba0509b16d4 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.tests.utils import 
assert_eq, assert_exceptions_equal @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) @@ -19,10 +19,7 @@ def test_dataframe_setitem_bool_mask_scaler(df, arg, value): assert_eq(df, gdf) -# pandas incorrectly adds nulls with dataframes -# but works fine with scalers -@pytest.mark.xfail() -def test_dataframe_setitem_scaler_bool_inconsistency(): +def test_dataframe_setitem_scaler_bool(): df = pd.DataFrame({"a": [1, 2, 3]}) df[[True, False, True]] = pd.DataFrame({"a": [-1, -2]}) @@ -184,3 +181,15 @@ def test_column_set_equal_length_object_by_mask(): data[bool_col] = replace_data assert_eq(cudf.Series(data), cudf.Series([100, 0, 300, 1, 500])) + + +def test_categorical_setitem_invalid(): + ps = pd.Series([1, 2, 3], dtype="category") + gs = cudf.Series([1, 2, 3], dtype="category") + + assert_exceptions_equal( + lfunc=ps.__setitem__, + rfunc=gs.__setitem__, + lfunc_args_and_kwargs=([0, 5], {}), + rfunc_args_and_kwargs=([0, 5], {}), + ) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 8bab802d89c..7c4cfee3f75 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -140,7 +140,10 @@ def test_series_nsmallest(data, n): sr = Series(data) psr = pd.Series(data) assert_eq(sr.nsmallest(n), psr.nsmallest(n)) - assert_eq(sr.nsmallest(n, keep="last"), psr.nsmallest(n, keep="last")) + assert_eq( + sr.nsmallest(n, keep="last").sort_index(), + psr.nsmallest(n, keep="last").sort_index(), + ) assert_exceptions_equal( lfunc=psr.nsmallest, diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 85a4b6f70b6..df3d04d36ba 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -420,6 +420,7 @@ def test_timedelta_dataframe_ops(df, op): np.timedelta64(4, "s"), np.timedelta64(456, "D"), np.timedelta64(46, "h"), + # TODO: PREM FIX THIS pytest.param( np.timedelta64("nat"), marks=pytest.mark.xfail( 
@@ -508,6 +509,7 @@ def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): datetime.timedelta(seconds=768), datetime.timedelta(microseconds=7), np.timedelta64(4, "s"), + # TODO: PREM Fix this pytest.param( np.timedelta64("nat"), marks=pytest.mark.xfail( @@ -772,6 +774,7 @@ def test_timedelta_datetime_index_ops_misc( "add", "sub", "truediv", + # TODO: PREM FIX THIS pytest.param( "floordiv", marks=pytest.mark.xfail( From 18d1fb39d3e44ccaa44dcfd5472a0dac66a306e8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 22 Feb 2021 12:32:59 -0800 Subject: [PATCH 04/35] remove redundant code --- python/cudf/cudf/core/column/categorical.py | 33 +++--------------- python/cudf/cudf/core/column/column.py | 36 +++++++++++++------- python/cudf/cudf/core/column/datetime.py | 10 ++---- python/cudf/cudf/core/column/numerical.py | 37 ++++++--------------- python/cudf/cudf/core/column/timedelta.py | 12 ++----- 5 files changed, 46 insertions(+), 82 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7c4fa42b93a..98c0f1dcbbd 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -9,7 +9,6 @@ Dict, Mapping, Optional, - Sequence, Tuple, Union, cast, @@ -868,34 +867,12 @@ def set_base_data(self, value): else: super().set_base_data(value) - def isin(self, values: Sequence) -> ColumnBase: - if cudf.utils.dtypes.is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a [{type(values).__name__}]" - ) - + def _process_values_for_isin(self, values): lhs = self - rhs = None - - try: - # We need to convert values to same type as self, - # hence passing dtype=self.dtype - rhs = cudf.core.column.as_column(values, dtype=self.dtype) - - if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: - return cudf.core.column.full(len(self), False, dtype="bool") - - # Short-circuit if rhs is all 
null. - if lhs.null_count == 0 and (rhs.null_count == len(rhs)): - return cudf.core.column.full(len(self), False, dtype="bool") - except ValueError: - # pandas functionally returns all False when cleansing via - # typecasting fails - return cudf.core.column.full(len(self), False, dtype="bool") - - res = lhs._obtain_isin_result(rhs) - return res + # We need to convert values to same type as self, + # hence passing dtype=self.dtype + rhs = cudf.core.column.as_column(values, dtype=self.dtype) + return lhs, rhs def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7daf8143338..a8aef7a4d35 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -855,18 +855,10 @@ def isin(self, values: Sequence) -> ColumnBase: rhs = None try: - rhs = as_column(values, nan_as_null=False) - if lhs.null_count == len(lhs): - lhs = lhs.astype(rhs.dtype) - elif rhs.null_count == len(rhs): - rhs = rhs.astype(lhs.dtype) - - if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: - return full(len(self), False, dtype="bool") - - # Short-circuit if rhs is all null. 
- if lhs.null_count == 0 and (rhs.null_count == len(rhs)): - return full(len(self), False, dtype="bool") + lhs, rhs = self._process_values_for_isin(values) + res = lhs._isin_earlystop(rhs) + if res is not None: + return res except ValueError: # pandas functionally returns all False when cleansing via # typecasting fails @@ -876,6 +868,26 @@ def isin(self, values: Sequence) -> ColumnBase: return res + def _process_values_for_isin(self, values): + lhs = self + rhs = as_column(values, nan_as_null=False) + if lhs.null_count == len(lhs): + lhs = lhs.astype(rhs.dtype) + elif rhs.null_count == len(rhs): + rhs = rhs.astype(lhs.dtype) + return lhs, rhs + + def _isin_earlystop(self, rhs): + if self.dtype != rhs.dtype: + if self.null_count and rhs.null_count: + return self.isna() + else: + return cudf.core.column.full(len(self), False, dtype="bool") + elif self.null_count == 0 and (rhs.null_count == len(rhs)): + return cudf.core.column.full(len(self), False, dtype="bool") + else: + return None + def _obtain_isin_result(self, rhs): ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) rdf = cudf.DataFrame( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c72f8f641c4..865094d24b1 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -351,13 +351,9 @@ def isin(self, values: Sequence) -> ColumnBase: if rhs.dtype.kind in {"f", "i", "u"}: return cudf.core.column.full(len(self), False, dtype="bool") rhs = rhs.astype(self.dtype) - - if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: - return cudf.core.column.full(len(self), False, dtype="bool") - - # Short-circuit if rhs is all null. 
- if lhs.null_count == 0 and (rhs.null_count == len(rhs)): - return cudf.core.column.full(len(self), False, dtype="bool") + res = lhs._isin_earlystop(rhs) + if res is not None: + return res except ValueError: # pandas functionally returns all False when cleansing via # typecasting fails diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 3ff03147583..4e07dd531ae 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -248,34 +248,19 @@ def std( ) -> float: return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) - def isin(self, values: Sequence) -> ColumnBase: - if cudf.utils.dtypes.is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a [{type(values).__name__}]" - ) - + def _process_values_for_isin(self, values): lhs = self - rhs = None + rhs = as_column(values, nan_as_null=False) - try: - rhs = as_column(values, nan_as_null=False) - if isinstance(rhs, NumericalColumn): - rhs = rhs.astype(dtype=self.dtype) - - if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: - return cudf.core.column.full(len(self), False, dtype="bool") - - # Short-circuit if rhs is all null. 
- if lhs.null_count == 0 and (rhs.null_count == len(rhs)): - return cudf.core.column.full(len(self), False, dtype="bool") - except ValueError: - # pandas functionally returns all False when cleansing via - # typecasting fails - return cudf.core.column.full(len(self), False, dtype="bool") - - res = lhs._obtain_isin_result(rhs) - return res + if isinstance(rhs, NumericalColumn): + rhs = rhs.astype(dtype=self.dtype) + + if lhs.null_count == len(lhs): + lhs = lhs.astype(rhs.dtype) + elif rhs.null_count == len(rhs): + rhs = rhs.astype(lhs.dtype) + + return lhs, rhs def sum_of_squares(self, dtype: Dtype = None) -> float: return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 7107b66d26c..99032185022 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -378,21 +378,15 @@ def isin(self, values: Sequence) -> ColumnBase: rhs = None try: - # We need to convert values to same type as self, - # hence passing dtype=self.dtype rhs = cudf.core.column.as_column(values) if rhs.dtype.kind in {"f", "i", "u"}: return cudf.core.column.full(len(self), False, dtype="bool") rhs = rhs.astype(self.dtype) - - if not (rhs.null_count == len(rhs)) and lhs.dtype != rhs.dtype: - return cudf.core.column.full(len(self), False, dtype="bool") - - # Short-circuit if rhs is all null. 
- if lhs.null_count == 0 and (rhs.null_count == len(rhs)): - return cudf.core.column.full(len(self), False, dtype="bool") + res = lhs._isin_earlystop(rhs) + if res is not None: + return res except ValueError: # pandas functionally returns all False when cleansing via # typecasting fails From 01afeceb2b4b9af57bdd63b9e50e8962b6afb767 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 22 Feb 2021 15:54:51 -0800 Subject: [PATCH 05/35] fix more issues --- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/tests/test_timedelta.py | 20 ++++---------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 99032185022..dcffdd4b282 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -127,7 +127,7 @@ def _binary_op_floordiv( common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") if isinstance(rhs, cudf.Scalar): - if rhs.is_valid: + if rhs.is_valid(): rhs = cudf.Scalar( np.timedelta64(rhs.value) .astype(common_dtype) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index df3d04d36ba..90850ff0648 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -420,13 +420,7 @@ def test_timedelta_dataframe_ops(df, op): np.timedelta64(4, "s"), np.timedelta64(456, "D"), np.timedelta64(46, "h"), - # TODO: PREM FIX THIS - pytest.param( - np.timedelta64("nat"), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" - ), - ), + np.timedelta64("nat"), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), @@ -509,16 +503,11 @@ def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): datetime.timedelta(seconds=768), datetime.timedelta(microseconds=7), np.timedelta64(4, "s"), - # TODO: PREM Fix this - 
pytest.param( - np.timedelta64("nat"), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" - ), - ), + np.timedelta64("nat", "s"), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), + np.timedelta64("nat", "ns"), np.timedelta64(1, "ns"), ], ) @@ -774,7 +763,6 @@ def test_timedelta_datetime_index_ops_misc( "add", "sub", "truediv", - # TODO: PREM FIX THIS pytest.param( "floordiv", marks=pytest.mark.xfail( @@ -841,7 +829,7 @@ def test_timedelta_index_ops_with_scalars(data, other_scalars, dtype, op): pytest.param( "floordiv", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35529" + reason="https://github.com/rapidsai/cudf/issues/5938" ), ), ], From c7c47b5d18e37ced9df9e1a0cba3d8a54d279057 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 22 Feb 2021 20:00:19 -0800 Subject: [PATCH 06/35] fix lots of deprecated warnings --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/multiindex.py | 24 ++++- python/cudf/cudf/tests/test_categorical.py | 4 +- python/cudf/cudf/tests/test_concat.py | 4 +- python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 94 +++++++++++-------- python/cudf/cudf/tests/test_datetime.py | 8 +- python/cudf/cudf/tests/test_dropna.py | 2 +- python/cudf/cudf/tests/test_duplicates.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 6 +- python/cudf/cudf/tests/test_index.py | 4 +- python/cudf/cudf/tests/test_repr.py | 5 +- python/cudf/cudf/tests/test_rolling.py | 12 +-- python/cudf/cudf/tests/test_series.py | 13 ++- python/cudf/cudf/tests/test_stats.py | 6 +- python/cudf/cudf/utils/dtypes.py | 7 ++ python/dask_cudf/dask_cudf/tests/test_core.py | 6 +- 17 files changed, 125 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3e7e6625abe..697524dddd8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -690,7 +690,7 
@@ def __getitem__(self, arg): elif can_convert_to_column(arg): mask = arg if is_list_like(mask): - mask = pd.Series(mask) + mask = pd.Series(mask, dtype=None if len(mask) else "float64") if mask.dtype == "bool": return self._apply_boolean_mask(mask) else: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4e82a1f72b0..f252ed1a9aa 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -15,6 +15,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import column from cudf.core.frame import Frame from cudf.core.index import Index, as_index @@ -485,7 +486,28 @@ def __repr__(self): ) ) ) - preprocess = preprocess.to_pandas(nullable=True) + + if PANDAS_GE_120: + # TODO: Remove this whole `if` block, + # this is a workaround for the following issue: + # https://github.com/pandas-dev/pandas/issues/39984 + temp_df = preprocess._source_data + + preprocess_pdf = pd.DataFrame() + for col in temp_df.columns: + if temp_df[col].dtype.kind == "f": + preprocess_pdf[col] = temp_df[col].to_pandas( + nullable=False + ) + else: + preprocess_pdf[col] = temp_df[col].to_pandas( + nullable=True + ) + + preprocess_pdf.columns = preprocess.names + preprocess = pd.MultiIndex.from_frame(preprocess_pdf) + else: + preprocess = preprocess.to_pandas(nullable=True) preprocess.values[:] = tuples_list else: preprocess = preprocess.to_pandas(nullable=True) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 2d8130e6cb1..67fd07dfcd8 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -493,7 +493,7 @@ def test_categorical_dataframe_slice_copy(): pd.Series([1, 2, 3, 89], dtype="float64"), pd.Series([1, 2.5, 3.001, 89], dtype="float64"), pd.Series([None, None, None]), - pd.Series([]), + pd.Series([], 
dtype="float64"), ], ) @pytest.mark.parametrize( @@ -526,7 +526,7 @@ def test_categorical_typecast(data, cat_type): pd.Series([1, 2, 3, 89], dtype="float64"), pd.Series([1, 2.5, 3.001, 89], dtype="float64"), pd.Series([None, None, None]), - pd.Series([]), + pd.Series([], dtype="float64"), ], ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index f8a7099f1bf..3739e226cc6 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -372,8 +372,8 @@ def test_concat_mixed_input(): [ [pd.Series([1, 2, 3]), pd.DataFrame({"a": [1, 2]})], [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})], - [pd.Series([]), pd.DataFrame({"a": []})], - [pd.Series([]), pd.DataFrame({"a": [1, 2]})], + [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})], + [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})], [pd.Series([1, 2, 3.0, 1.2], name="abc"), pd.DataFrame({"a": [1, 2]})], [ pd.Series( diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 31d502e4a23..d972d2ad11c 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1815,7 +1815,7 @@ def test_csv_reader_dtypes(dtype): @pytest.mark.parametrize( - "dtype", ["Int64", "UInt32", {"a": "UInt64", "b": "float64", "c": "Int32"}] + "dtype", ["Int64", "UInt32", {"a": "UInt64", "b": "Float64", "c": "Int32"}] ) def test_csv_reader_nullable_dtypes(dtype): buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n4,13,114\n" diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 96e77bd6823..ecd1f42f4a7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1281,8 +1281,10 @@ def test_concat_different_column_dataframe(df1_d, df2_d): assert_eq(got, expect, check_dtype=False) -@pytest.mark.parametrize("ser_1", [pd.Series([1, 2, 3]), pd.Series([])]) 
-@pytest.mark.parametrize("ser_2", [pd.Series([])]) +@pytest.mark.parametrize( + "ser_1", [pd.Series([1, 2, 3]), pd.Series([], dtype="float64")] +) +@pytest.mark.parametrize("ser_2", [pd.Series([], dtype="float64")]) def test_concat_empty_series(ser_1, ser_2): got = gd.concat([gd.Series(ser_1), gd.Series(ser_2)]) expect = pd.concat([ser_1, ser_2]) @@ -1689,7 +1691,7 @@ def test_series_shape(): def test_series_shape_empty(): - ps = pd.Series() + ps = pd.Series(dtype="float64") cs = gd.Series([]) assert ps.shape == cs.shape @@ -2285,7 +2287,7 @@ def test_series_all_null(num_elements, null_type): data = [null_type] * num_elements # Typecast Pandas because None will return `object` dtype - expect = pd.Series(data).astype("float64") + expect = pd.Series(data, dtype="float64") got = gd.Series(data) assert_eq(expect, got) @@ -3245,7 +3247,7 @@ def test_ndim(): assert pdf.ndim == gdf.ndim assert pdf.x.ndim == gdf.x.ndim - s = pd.Series() + s = pd.Series(dtype="float64") gs = gd.Series() assert s.ndim == gs.ndim @@ -3486,7 +3488,7 @@ def test_as_column_types(): col = column.as_column(gd.Series([])) assert_eq(col.dtype, np.dtype("float64")) gds = gd.Series(col) - pds = pd.Series(pd.Series([])) + pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) @@ -3521,7 +3523,7 @@ def test_as_column_types(): assert_eq(pds, gds) - pds = pd.Series([]) + pds = pd.Series([], dtype="float64") gds = gd.Series(column.as_column(pds)) assert_eq(pds, gds) @@ -3857,7 +3859,7 @@ def test_create_dataframe_column(): ], ) def test_series_values_host_property(data): - pds = pd.Series(data) + pds = pd.Series(data, dtype=None if len(data) else "float64") gds = gd.Series(data) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -3880,7 +3882,7 @@ def test_series_values_host_property(data): ], ) def test_series_values_property(data): - pds = pd.Series(data) + pds = pd.Series(data, dtype=None if len(data) else "float64") gds = gd.Series(data) gds_vals = gds.values assert 
isinstance(gds_vals, cupy.ndarray) @@ -3988,11 +3990,10 @@ def test_value_counts(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = pd.Series(data, index=index) + psr = pd.Series(data, index=index, dtype=None if len(data) else "float64") gsr = gd.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) - print(expected) got = gsr.isin(values) assert_eq(got, expected) @@ -4043,7 +4044,7 @@ def test_isin_numeric(data, values): ], ) def test_isin_datetime(data, values): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "datetime64[ns]") gsr = gd.Series.from_pandas(psr) got = gsr.isin(values) @@ -4072,7 +4073,7 @@ def test_isin_datetime(data, values): ], ) def test_isin_string(data, values): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "float64") gsr = gd.Series.from_pandas(psr) got = gsr.isin(values) @@ -4101,7 +4102,7 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "float64") gsr = gd.Series.from_pandas(psr) got = gsr.isin(values) @@ -4135,7 +4136,7 @@ def test_isin_categorical(data, values): ], ) def test_isin_index(data, values): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "float64") gsr = gd.Series.from_pandas(psr) got = gsr.index.isin(values) @@ -6671,10 +6672,10 @@ def test_dataframe_keys(df): ["abc", "def", "ghi", "xyz", "pqr", "abc"], index=[1, 2, 3, 4, 5, 10], ), - pd.Series(index=["a", "b", "c", "d", "e", "f"]), - pd.Series(index=[10, 11, 12]), - pd.Series(), - pd.Series([]), + pd.Series(index=["a", "b", "c", "d", "e", "f"], dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), + pd.Series(dtype="float64"), + pd.Series([], dtype="float64"), ], ) def test_series_keys(ps): @@ -7248,9 +7249,9 @@ def test_dataframe_size(df): @pytest.mark.parametrize( "ps", [ - pd.Series(), - 
pd.Series(index=[100, 10, 1, 0]), - pd.Series([]), + pd.Series(dtype="float64"), + pd.Series(index=[100, 10, 1, 0], dtype="float64"), + pd.Series([], dtype="float64"), pd.Series(["a", "b", "c", "d"]), pd.Series(["a", "b", "c", "d"], index=[0, 1, 10, 11]), ], @@ -7292,13 +7293,16 @@ def test_dataframe_init_with_columns(data, columns): "data, ignore_dtype", [ ([pd.Series([1, 2, 3])], False), - ([pd.Series(index=[1, 2, 3])], False), - ([pd.Series(name="empty series name")], False), - ([pd.Series([1]), pd.Series([]), pd.Series([3])], False), + ([pd.Series(index=[1, 2, 3], dtype="float64")], False), + ([pd.Series(name="empty series name", dtype="float64")], False), + ( + [pd.Series([1]), pd.Series([], dtype="float64"), pd.Series([3])], + False, + ), ( [ pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([]), + pd.Series([], dtype="float64"), pd.Series([3], name="series that is named"), ], False, @@ -7315,16 +7319,16 @@ def test_dataframe_init_with_columns(data, columns): ( [ pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([]), - pd.Series(index=[10, 11, 12]), + pd.Series([], dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), ], False, ), ( [ pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], name="abc"), - pd.Series(index=[10, 11, 12]), + pd.Series([], name="abc", dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), ], False, ), @@ -7357,17 +7361,21 @@ def test_dataframe_init_from_series_list(data, ignore_dtype, columns): "data, ignore_dtype, index", [ ([pd.Series([1, 2, 3])], False, ["a", "b", "c"]), - ([pd.Series(index=[1, 2, 3])], False, ["a", "b"]), - ([pd.Series(name="empty series name")], False, ["index1"]), + ([pd.Series(index=[1, 2, 3], dtype="float64")], False, ["a", "b"]), ( - [pd.Series([1]), pd.Series([]), pd.Series([3])], + [pd.Series(name="empty series name", dtype="float64")], + False, + ["index1"], + ), + ( + [pd.Series([1]), pd.Series([], dtype="float64"), pd.Series([3])], False, 
["0", "2", "1"], ), ( [ pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([]), + pd.Series([], dtype="float64"), pd.Series([3], name="series that is named"), ], False, @@ -7390,8 +7398,8 @@ def test_dataframe_init_from_series_list(data, ignore_dtype, columns): ( [ pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([]), - pd.Series(index=[10, 11, 12]), + pd.Series([], dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), ], False, ["a", "b", "c"], @@ -7399,8 +7407,8 @@ def test_dataframe_init_from_series_list(data, ignore_dtype, columns): ( [ pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], name="abc"), - pd.Series(index=[10, 11, 12]), + pd.Series([], name="abc", dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), ], False, ["a", "v", "z"], @@ -7440,7 +7448,7 @@ def test_dataframe_init_from_series_list_with_index( ( [ pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([]), + pd.Series([], dtype="float64"), pd.Series([3], name="series that is named"), ], ["_", "+"], @@ -7864,6 +7872,10 @@ def test_dataframe_error_equality(df1, df2, op): ], dtype="object", ), + "c": gd.Series( + [0.1, None, 0.2, None, 3, 4, 1000, None], + dtype="float64", + ), } ), pd.DataFrame( @@ -7885,6 +7897,10 @@ def test_dataframe_error_equality(df1, df2, op): ], dtype=pd.StringDtype(), ), + "c": pd.Series( + [0.1, None, 0.2, None, 3, 4, 1000, None], + dtype=pd.Float64Dtype(), + ), } ), ), diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7e545022eb8..b59f76bd8bf 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -411,7 +411,7 @@ def test_datetime_to_arrow(dtype): "nulls", ["none", pytest.param("some", marks=pytest.mark.xfail)] ) def test_datetime_unique(data, nulls): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "datetime64[ns]") if len(data) > 0: if nulls == "some": @@ -435,7 
+435,7 @@ def test_datetime_unique(data, nulls): ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_nunique(data, nulls): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "datetime64[ns]") if len(data) > 0: if nulls == "some": @@ -537,7 +537,7 @@ def test_datetime_dataframe(): [ None, [], - pd.Series([]), + pd.Series([], dtype="float64"), pd.Index([]), pd.Series([1, 2, 3]), pd.Series([0, 1, -1]), @@ -670,7 +670,7 @@ def test_to_datetime_not_implemented(): [ 1, [], - pd.Series([]), + pd.Series([], dtype="float64"), pd.Index([]), pd.Series([1, 2, 3]), pd.Series([1, 2.4, 3]), diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index ddd569acf0d..0363534cdd5 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -21,7 +21,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "float64") if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 29f1c31a1ee..9331fe5900f 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -56,7 +56,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = Series(data) + pds = Series(data, dtype=None if len(data) else "float64") gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 294443500a9..bc3af8581b5 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1496,7 +1496,8 @@ def test_groupby_apply_return_series_dataframe(cust_func): @pytest.mark.parametrize( - "pdf", [pd.DataFrame(), 
pd.DataFrame({"a": []}), pd.Series([])] + "pdf", + [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], ) def test_groupby_no_keys(pdf): gdf = cudf.from_pandas(pdf) @@ -1509,7 +1510,8 @@ def test_groupby_no_keys(pdf): @pytest.mark.parametrize( - "pdf", [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([])] + "pdf", + [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], ) def test_groupby_apply_no_keys(pdf): gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 9e401316e19..b59e352ff87 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -960,7 +960,9 @@ def test_index_equal_misc(data, other): actual = gd_data.equals(np.array(gd_other)) assert_eq(expected, actual) - expected = pd_data.equals(pd.Series(pd_other)) + expected = pd_data.equals( + pd.Series(pd_other, dtype=None if len(pd_other) else "float64") + ) actual = gd_data.equals(cudf.Series(gd_other)) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 16c24d5afaa..a440c50d48f 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -158,7 +158,7 @@ def test_integer_dataframe(x): @settings(deadline=None) def test_integer_series(x): sr = cudf.Series(x) - ps = pd.Series(x) + ps = pd.Series(x, dtype=None if len(x) else "float64") assert sr.__repr__() == ps.__repr__() @@ -175,7 +175,7 @@ def test_float_dataframe(x): @settings(deadline=None) def test_float_series(x): sr = cudf.Series(x, nan_as_null=False) - ps = pd.Series(x) + ps = pd.Series(x, dtype=None if len(x) else "float64") assert sr.__repr__() == ps.__repr__() @@ -261,6 +261,7 @@ def test_generic_index(length, dtype): psr = pd.Series( range(length), index=np.random.randint(0, high=100, size=length).astype(dtype), + dtype="float64" if length == 0 else None, ) gsr = cudf.Series.from_pandas(psr) diff 
--git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 1ae5bab0da4..794d3be889a 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -37,7 +37,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = pd.Series(data, index=index) + psr = pd.Series(data, index=index, dtype=None if len(data) else "float64") gsr = cudf.Series(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): @@ -99,13 +99,7 @@ def test_rolling_dataframe_basic(data, agg, nulls, center): pytest.param("min"), pytest.param("max"), pytest.param("mean"), - pytest.param( - "count", # Does not follow similar conventions as - # with non-offset columns - marks=pytest.mark.xfail( - reason="Differs from pandas behaviour here" - ), - ), + pytest.param("count"), ], ) def test_rolling_with_offset(agg): @@ -218,7 +212,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = pd.Series(data, index=index) + psr = pd.Series(data, index=index, dtype=None if len(data) else "float64") gsr = cudf.from_pandas(psr) def some_func(A): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a19b88caf4c..3ffd7786057 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -28,7 +28,7 @@ def _series_na_data(): pd.Series([0, 1, 2, 3, 4]), pd.Series(["a", "b", "u", "h", "d"]), pd.Series([None, None, np.nan, None, np.inf, -np.inf]), - pd.Series([]), + pd.Series([], dtype="float64"), pd.Series( [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] ), @@ -383,7 +383,7 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = pd.Series(data) + psr = pd.Series(data, 
dtype=None if len(data) else "float64") gsr = cudf.Series(data) assert_eq(psr.size, gsr.size) @@ -490,7 +490,7 @@ def test_series_factorize(data, na_sentinel): @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_series_datetime_value_counts(data, nulls, normalize, dropna): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "datetime64[ns]") if len(data) > 0: if nulls == "one": @@ -733,7 +733,8 @@ def test_series_notnull_notna(ps, nan_as_null): "sr1", [pd.Series([10, 11, 12], index=["a", "b", "z"]), pd.Series(["a"])] ) @pytest.mark.parametrize( - "sr2", [pd.Series([]), pd.Series(["a", "a", "c", "z", "A"])] + "sr2", + [pd.Series([], dtype="float64"), pd.Series(["a", "a", "c", "z", "A"])], ) @pytest.mark.parametrize( "op", @@ -852,6 +853,10 @@ def test_series_memory_usage(): dtype=pd.StringDtype(), ), ), + ( + cudf.Series([1, 2, None, 10.2, None], dtype="float32",), + pd.Series([1, 2, None, 10.2, None], dtype=pd.Float32Dtype(),), + ), ], ) def test_series_to_pandas_nullable_dtypes(sr, expected_psr): diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index c06fdd4a48e..2f0b51ba377 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -204,7 +204,7 @@ def test_approx_quantiles_int(): @pytest.mark.parametrize("q", [[], 0.5, 1, 0.234, [0.345], [0.243, 0.5, 1]]) def test_misc_quantiles(data, q): - pdf_series = pd.Series(data) + pdf_series = pd.Series(data, dtype=None if len(data) else "float64") gdf_series = Series(data) expected = pdf_series.quantile(q) @@ -434,13 +434,13 @@ def test_df_corr(): ) @pytest.mark.parametrize("skipna", [True, False, None]) def test_nans_stats(data, ops, skipna): - psr = pd.Series(data) + psr = pd.Series(data, dtype=None if len(data) else "float64") gsr = Series(data) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - psr = pd.Series(data) + psr = 
pd.Series(data, dtype=None if len(data) else "float64") gsr = Series(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. So only diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 274285990a6..20c86b2a4b7 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -14,6 +14,7 @@ import cudf from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar +from cudf.core._compat import PANDAS_GE_120 _NA_REP = "" _np_pa_dtypes = { @@ -73,6 +74,12 @@ pd.StringDtype(): np.dtype("object"), } +if PANDAS_GE_120: + cudf_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype() + cudf_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype() + pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = np.dtype("float32") + pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = np.dtype("float64") + SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"} UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"} INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 10719794843..548aca53fd5 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -10,10 +10,10 @@ from dask.dataframe.core import make_meta, meta_nonempty from dask.utils import M -import cudf - import dask_cudf as dgd +import cudf + def test_from_cudf(): np.random.seed(0) @@ -658,7 +658,7 @@ def test_make_meta_backends(index): @pytest.mark.parametrize( "data", [ - pd.Series([]), + pd.Series([], dtype="float64"), pd.DataFrame({"abc": [], "xyz": []}), pd.Series([1, 2, 10, 11]), pd.DataFrame({"abc": [1, 2, 10, 11], "xyz": [100, 12, 120, 1]}), From aea33134d5fde46bb0cbeffde7bfb382b63241b3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 23 Feb 2021 10:11:18 -0800 Subject: [PATCH 07/35] fix multiple warnings 
--- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/frame.py | 10 +- python/cudf/cudf/testing/testing.py | 230 +++++++++++++----- python/cudf/cudf/tests/test_dataframe.py | 10 +- python/cudf/cudf/tests/test_hdf.py | 16 +- python/dask_cudf/dask_cudf/tests/test_core.py | 2 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 8 +- .../dask_cudf/tests/test_reductions.py | 8 +- 8 files changed, 194 insertions(+), 94 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 697524dddd8..7912a20f740 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -585,7 +585,9 @@ def deserialize(cls, header, frames): def dtypes(self): """Return the dtypes in this object.""" return pd.Series( - [x.dtype for x in self._data.columns], index=self._data.names + [x.dtype for x in self._data.columns], + index=self._data.names, + dtype=None if len(self._data.names) else "float64", ) @property diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e763a164003..cf956ec2654 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,7 +3,6 @@ import copy import functools -import operator import warnings from collections import OrderedDict, abc as abc from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload @@ -27,7 +26,6 @@ min_scalar_type, ) - T = TypeVar("T", bound="Frame") if TYPE_CHECKING: @@ -340,9 +338,11 @@ def _concat( np.intersect1d, all_columns_list ) # get column names not present in all objs - non_intersecting_columns = ( - functools.reduce(operator.or_, (obj.columns for obj in objs)) - ^ intersecting_columns + union_of_columns = objs[0].columns + for obj in objs[1:]: + union_of_columns = union_of_columns.union(obj.columns) + non_intersecting_columns = union_of_columns.symmetric_difference( + intersecting_columns ) names = OrderedDict.fromkeys(intersecting_columns).keys() diff --git 
a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 2048e574acc..2e49ee0bc43 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -8,6 +8,7 @@ import pandas as pd import cudf +from cudf.core._compat import PANDAS_GE_110 from cudf.utils.dtypes import is_categorical_dtype @@ -91,6 +92,8 @@ def assert_column_equal( check_datetimelike_compat=False, check_categorical=True, check_category_order=True, + rtol=1e-05, + atol=1e-08, obj="ColumnBase", ): """ @@ -122,6 +125,10 @@ def assert_column_equal( Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals + rtol : float, default 1e-5 + Relative tolerance. Only used when `check_exact` is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when `check_exact` is False. obj : str, default ‘ColumnBase’ Specify object name being compared, internally used to show appropriate assertion message. @@ -165,6 +172,8 @@ def assert_column_equal( exact=check_dtype, check_exact=True, check_categorical=False, + rtol=rtol, + atol=atol, ) assert_column_equal( left.codes, @@ -173,6 +182,8 @@ def assert_column_equal( check_exact=True, check_categorical=False, check_category_order=False, + rtol=rtol, + atol=atol, ) if left.ordered != right.ordered: @@ -220,6 +231,8 @@ def assert_index_equal( check_less_precise: Union[bool, int] = False, check_exact: bool = True, check_categorical: bool = True, + rtol: float = 1e-5, + atol: float = 1e-8, obj: str = "Index", ): """ @@ -247,6 +260,10 @@ def assert_index_equal( Whether to compare number exactly. check_categorical : bool, default True Whether to compare internal Categorical exactly. + rtol : float, default 1e-5 + Relative tolerance. Only used when `check_exact` is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when `check_exact` is False. 
obj : str, default ‘Index’ Specify object name being compared, internally used to show appropriate assertion message. @@ -304,15 +321,27 @@ def assert_index_equal( llevel = cudf.Index(left._columns[level], name=left.names[level]) rlevel = cudf.Index(right._columns[level], name=right.names[level]) mul_obj = f"MultiIndex level [{level}]" - assert_index_equal( - llevel, - rlevel, - exact=check_exact, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - obj=mul_obj, - ) + if PANDAS_GE_110: + assert_index_equal( + llevel, + rlevel, + exact=check_exact, + check_names=check_names, + check_exact=check_exact, + rtol=rtol, + atol=atol, + obj=mul_obj, + ) + else: + assert_index_equal( + llevel, + rlevel, + exact=check_exact, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + obj=mul_obj, + ) return assert_column_equal( @@ -343,6 +372,8 @@ def assert_series_equal( check_datetimelike_compat=False, check_categorical=True, check_category_order=True, + rtol=1e-5, + atol=1e-8, obj="Series", ): """ @@ -380,6 +411,10 @@ def assert_series_equal( Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals + rtol : float, default 1e-5 + Relative tolerance. Only used when `check_exact` is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when `check_exact` is False. obj : str, default ‘Series’ Specify object name being compared, internally used to show appropriate assertion message. 
@@ -423,28 +458,55 @@ def assert_series_equal( raise_assert_detail(obj, "Series length are different", msg1, msg2) # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.index", - ) + if PANDAS_GE_110: + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.index", + ) + else: + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) - assert_column_equal( - left._column, - right._column, - check_dtype=check_dtype, - check_column_type=check_series_type, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - check_category_order=check_category_order, - ) + if PANDAS_GE_110: + assert_column_equal( + left._column, + right._column, + check_dtype=check_dtype, + check_column_type=check_series_type, + check_exact=check_exact, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + check_category_order=check_category_order, + rtol=rtol, + atol=atol, + ) + else: + assert_column_equal( + left._column, + right._column, + check_dtype=check_dtype, + check_column_type=check_series_type, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + check_category_order=check_category_order, + ) # metadata comparison if check_names and (left.name != right.name): @@ -460,13 +522,14 @@ def assert_frame_equal( 
check_index_type="equiv", check_column_type="equiv", check_frame_type=True, - check_less_precise=False, - by_blocks=False, check_names=True, + by_blocks=False, check_exact=False, check_datetimelike_compat=False, check_categorical=True, check_like=False, + rtol=1e-5, + atol=1e-8, obj="DataFrame", ): """ @@ -493,8 +556,6 @@ def assert_frame_equal( and similar to pandas. check_frame_type : bool, default True Whether to check the DataFrame class is identical. - check_less_precise : bool or int, default False - Not yet supported check_names : bool, default True Whether to check that the names attribute for both the index and column attributes of the DataFrame is identical. @@ -512,6 +573,10 @@ def assert_frame_equal( If True, ignore the order of index & columns. Note: index labels must match their respective rows (same as in columns) - same labels must be with the same data. + rtol : float, default 1e-5 + Relative tolerance. Only used when `check_exact` is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when `check_exact` is False. obj : str, default ‘DataFrame’ Specify object name being compared, internally used to show appropriate assertion message. 
@@ -568,40 +633,73 @@ def assert_frame_equal( left, right = left.reindex(index=right.index), right right = right[list(left._data.names)] - if check_less_precise: - raise NotImplementedError("check_less_precise is not yet supported") - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.index", - ) - - pd.testing.assert_index_equal( - left.columns, - right.columns, - exact=check_column_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.columns", - ) + if PANDAS_GE_110: + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.index", + ) + else: + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) - for col in left.columns: - assert_column_equal( - left._data[col], - right._data[col], - check_dtype=check_dtype, - check_less_precise=check_less_precise, + if PANDAS_GE_110: + pd.testing.assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.columns", + ) + else: + pd.testing.assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, - obj=f'Column name="{col}"', + obj=f"{obj}.columns", ) + + for col in left.columns: + if PANDAS_GE_110: + assert_column_equal( + 
left._data[col], + right._data[col], + check_dtype=check_dtype, + check_exact=check_exact, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f'Column name="{col}"', + ) + else: + assert_column_equal( + left._data[col], + right._data[col], + check_dtype=check_dtype, + check_exact=check_exact, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + obj=f'Column name="{col}"', + ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ecd1f42f4a7..64fc6fb1ab9 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -792,7 +792,7 @@ def test_dataframe_to_string(): def test_dataframe_to_string_wide(monkeypatch): - monkeypatch.setenv("COLUMNS", 79) + monkeypatch.setenv("COLUMNS", "79") # Test basic df = gd.DataFrame() for i in range(100): @@ -3340,7 +3340,9 @@ def test_all(data): # Pandas treats `None` in object type columns as True for some reason, so # replacing with `False` if np.array(data).ndim <= 1: - pdata = pd.Series(data).replace([None], False) + pdata = pd.Series( + data, dtype=None if len(data) else "float64" + ).replace([None], False) gdata = gd.Series.from_pandas(pdata) else: pdata = pd.DataFrame(data, columns=["a", "b"]).replace([None], False) @@ -3393,7 +3395,7 @@ def test_all(data): @pytest.mark.parametrize("axis", [0, 1]) def test_any(data, axis): if np.array(data).ndim <= 1: - pdata = pd.Series(data) + pdata = pd.Series(data, dtype=None if len(data) else "float64") gdata = gd.Series.from_pandas(pdata) if axis == 1: @@ -4591,7 +4593,7 @@ def test_rowwise_ops(data, op, skipna): expected = getattr(pdf, op)(axis=1, skipna=skipna) got = getattr(gdf, op)(axis=1, skipna=skipna) - assert_eq(expected, got, check_less_precise=7) + assert_eq(expected, got, check_exact=False) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_hdf.py 
b/python/cudf/cudf/tests/test_hdf.py index d5b18a08281..f908d5f51f5 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. import os from string import ascii_letters @@ -14,7 +14,7 @@ import tables # noqa F401 except ImportError: pytest.skip( - "PyTables is not installed and is required for HDF " "reading/writing", + "PyTables is not installed and is required for HDF reading/writing", allow_module_level=True, ) @@ -34,7 +34,7 @@ def pdf(request): nrows=nrows, ncols=ncols, data_gen_f=lambda r, c: r, r_idx_type="i" ) # Delete the name of the column index, and rename the row index - del test_pdf.columns.name + test_pdf.columns.name = None test_pdf.index.name = "test_index" # Cast all the column dtypes to objects, rename them, and then cast to @@ -94,14 +94,16 @@ def test_hdf_reader(hdf_files, columns): expect_df = pd.read_hdf(hdf_df_file, columns=columns) got_df = cudf.read_hdf(hdf_df_file, columns=columns) - assert_eq(expect_df, got_df, check_categorical=False) + assert_eq( + expect_df, got_df, check_categorical=False, check_index_type=False + ) for column in hdf_series.keys(): expect_series = pd.read_hdf(hdf_series[column]) got_series = cudf.read_hdf(hdf_series[column]) - assert_eq(expect_series, got_series) + assert_eq(expect_series, got_series, check_index_type=False) @pytest.mark.parametrize("format", ["fixed", "table"]) @@ -130,7 +132,7 @@ def test_hdf_writer(tmpdir, pdf, gdf, complib, format): expect = pd.read_hdf(pdf_df_fname) got = pd.read_hdf(gdf_df_fname) - assert_eq(expect, got) + assert_eq(expect, got, check_index_type=False) for column in pdf.columns: pdf_series_fname = tmpdir.join(column + "_" + "pdf_series.hdf") @@ -149,4 +151,4 @@ def test_hdf_writer(tmpdir, pdf, gdf, complib, format): expect_series = pd.read_hdf(pdf_series_fname) got_series = pd.read_hdf(gdf_series_fname) - assert_eq(expect_series, got_series) + 
assert_eq(expect_series, got_series, check_index_type=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 548aca53fd5..ba1fb1882b1 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -717,7 +717,7 @@ def test_dataframe_describe(): ddf = dgd.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) + dd.assert_eq(ddf.describe(), pddf.describe(), check_exact=False) def test_index_map_partitions(): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 42ca4702987..22ba604d682 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -125,12 +125,8 @@ def test_groupby_std(func): @pytest.mark.parametrize( "func", [ - pytest.param( - lambda df: df.groupby(["a", "b"]).x.sum(), marks=pytest.mark.xfail - ), - pytest.param( - lambda df: df.groupby(["a", "b"]).sum(), marks=pytest.mark.xfail - ), + pytest.param(lambda df: df.groupby(["a", "b"]).x.sum()), + pytest.param(lambda df: df.groupby(["a", "b"]).sum()), pytest.param( lambda df: df.groupby(["a", "b"]).agg({"x", "sum"}), marks=pytest.mark.xfail, diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 61a7ae8af1c..b9f5df6e96f 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -2,12 +2,12 @@ import pandas as pd import pytest -import dask.dataframe as dd - -import cudf as gd +from dask import dataframe as dd import dask_cudf as dgd +import cudf as gd + def _make_random_frame(nelem, npartitions=2): df = pd.DataFrame( @@ -75,4 +75,4 @@ def test_rowwise_reductions(data, op): expected = getattr(pddf, op)(axis=1) got = getattr(pddf, op)(axis=1) - 
dd.assert_eq(expected.compute(), got.compute(), check_less_precise=7) + dd.assert_eq(expected.compute(), got.compute(), check_exact=False) From 9fdbfe79c2cbdb5cc7f40a038966dc7cb41e42cd Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 23 Feb 2021 10:15:32 -0800 Subject: [PATCH 08/35] unpin pandas --- conda/recipes/cudf/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index c5f7bd34c25..bf6519bfa4e 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -35,7 +35,7 @@ requirements: - protobuf - python - typing_extensions - - pandas >=1.0,<1.2.0dev0 + - pandas >=1.0 - cupy >7.1.0,<9.0.0a0 - numba >=0.49.0 - numpy From 27a782b96cb06f7f51db7adf07eab20d1a33d66e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 23 Feb 2021 11:19:08 -0800 Subject: [PATCH 09/35] cleanup --- conda/environments/cudf_dev_cuda10.1.yml | 2 +- conda/environments/cudf_dev_cuda10.2.yml | 2 +- conda/environments/cudf_dev_cuda11.0.yml | 2 +- python/cudf/cudf/core/column/categorical.py | 5 ++++- python/cudf/cudf/core/column/column.py | 21 +++++++++++++++++---- python/cudf/cudf/core/column/numerical.py | 10 ++++++---- 6 files changed, 30 insertions(+), 12 deletions(-) diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 69d729aea0c..d8655bea3aa 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49.0,!=0.51.0 - numpy - - pandas>=1.0,<1.2.0dev0 + - pandas>=1.0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 68c2ffc6aee..f6113921323 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49,!=0.51.0 - numpy - - 
pandas>=1.0,<1.2.0dev0 + - pandas>=1.0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 4070802e8a8..20481eea580 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49,!=0.51.0 - numpy - - pandas>=1.0,<1.2.0dev0 + - pandas>=1.0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 98c0f1dcbbd..99da0f9970c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -9,6 +9,7 @@ Dict, Mapping, Optional, + Sequence, Tuple, Union, cast, @@ -867,7 +868,9 @@ def set_base_data(self, value): else: super().set_base_data(value) - def _process_values_for_isin(self, values): + def _process_values_for_isin( + self, values: Sequence + ) -> Tuple[ColumnBase, ColumnBase]: lhs = self # We need to convert values to same type as self, # hence passing dtype=self.dtype diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c5b3965d98e..a47174a9f20 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -49,12 +49,12 @@ get_time_unit, is_categorical_dtype, is_decimal_dtype, + is_interval_dtype, is_list_dtype, is_numerical_dtype, is_scalar, is_string_dtype, is_struct_dtype, - is_interval_dtype, min_signed_type, min_unsigned_type, np_to_pa_dtype, @@ -876,7 +876,12 @@ def isin(self, values: Sequence) -> ColumnBase: return res - def _process_values_for_isin(self, values): + def _process_values_for_isin( + self, values: Sequence + ) -> Tuple[ColumnBase, ColumnBase]: + """ + Helper function for `isin` which pre-process `values` based on `self`. 
+ """ lhs = self rhs = as_column(values, nan_as_null=False) if lhs.null_count == len(lhs): @@ -885,7 +890,11 @@ def _process_values_for_isin(self, values): rhs = rhs.astype(lhs.dtype) return lhs, rhs - def _isin_earlystop(self, rhs): + def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]: + """ + Helper function for `isin` which determines possibility of + early-stopping or not. + """ if self.dtype != rhs.dtype: if self.null_count and rhs.null_count: return self.isna() @@ -896,7 +905,11 @@ def _isin_earlystop(self, rhs): else: return None - def _obtain_isin_result(self, rhs): + def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: + """ + Helper function for `isin` which merges `self` & `rhs` + to determine what values of `rhs` exist in `self`. + """ ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) rdf = cudf.DataFrame( {"x": rhs, "bool": full(len(rhs), True, dtype="bool")} diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 4e07dd531ae..711815ba687 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations from numbers import Number -from typing import Any, Callable, Sequence, Union, cast +from typing import Any, Callable, Sequence, Tuple, Union, cast import numpy as np import pandas as pd @@ -248,15 +248,17 @@ def std( ) -> float: return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) - def _process_values_for_isin(self, values): - lhs = self + def _process_values_for_isin( + self, values: Sequence + ) -> Tuple[ColumnBase, ColumnBase]: + lhs = cast("cudf.core.column.ColumnBase", self) rhs = as_column(values, nan_as_null=False) if isinstance(rhs, NumericalColumn): rhs = rhs.astype(dtype=self.dtype) if lhs.null_count == len(lhs): - lhs = lhs.astype(rhs.dtype) + lhs = cast("cudf.core.column.ColumnBase", lhs.astype(rhs.dtype)) elif rhs.null_count == len(rhs): rhs = 
rhs.astype(lhs.dtype) From 3cde2efd63273cf7c075ddd8e17dce093122fe5c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 23 Feb 2021 11:19:52 -0800 Subject: [PATCH 10/35] cleanup --- python/cudf/cudf/core/column/numerical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 711815ba687..f9b695e9ce3 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -258,7 +258,7 @@ def _process_values_for_isin( rhs = rhs.astype(dtype=self.dtype) if lhs.null_count == len(lhs): - lhs = cast("cudf.core.column.ColumnBase", lhs.astype(rhs.dtype)) + lhs = lhs.astype(rhs.dtype) elif rhs.null_count == len(rhs): rhs = rhs.astype(lhs.dtype) From 9a3b51ac13c5e0a999ba6de2d77149e774e43ac3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 23 Feb 2021 11:33:29 -0800 Subject: [PATCH 11/35] copyright --- conda/recipes/cudf/meta.yaml | 2 +- python/cudf/cudf/core/_compat.py | 2 +- python/cudf/cudf/core/column/column.py | 1 + python/cudf/cudf/core/column/datetime.py | 1 + python/cudf/cudf/core/column/timedelta.py | 1 + python/cudf/cudf/core/frame.py | 1 + python/cudf/cudf/core/multiindex.py | 3 ++- python/cudf/cudf/testing/testing.py | 2 +- python/cudf/cudf/tests/test_categorical.py | 2 +- python/cudf/cudf/tests/test_concat.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 3 ++- python/cudf/cudf/tests/test_dropna.py | 3 ++- python/cudf/cudf/tests/test_duplicates.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_indexing.py | 2 ++ python/cudf/cudf/tests/test_json.py | 2 +- python/cudf/cudf/tests/test_numerical.py | 2 ++ python/cudf/cudf/tests/test_ops.py | 2 ++ python/cudf/cudf/tests/test_reductions.py | 2 +- python/cudf/cudf/tests/test_repr.py | 3 ++- python/cudf/cudf/tests/test_reshape.py | 2 ++ python/cudf/cudf/tests/test_rolling.py | 2 ++ 
python/cudf/cudf/tests/test_series.py | 1 + python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_sorting.py | 2 +- python/cudf/cudf/tests/test_stats.py | 2 +- python/cudf/cudf/tests/test_string.py | 3 ++- python/cudf/cudf/tests/test_timedelta.py | 1 + python/dask_cudf/dask_cudf/tests/test_core.py | 2 ++ python/dask_cudf/dask_cudf/tests/test_groupby.py | 2 ++ python/dask_cudf/dask_cudf/tests/test_reductions.py | 2 ++ 32 files changed, 45 insertions(+), 18 deletions(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index bf6519bfa4e..58a1c2d4e8c 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index de19acf9ba4..e8b0259c142 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import pandas as pd from packaging import version diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a47174a9f20..28dd521b37c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,5 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. + from __future__ import annotations import builtins diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 865094d24b1..d32b3c2f8e2 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,5 @@ # Copyright (c) 2019-2021, NVIDIA CORPORATION. 
+ from __future__ import annotations import datetime as dt diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index dcffdd4b282..82ce1f5f7a0 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. + from __future__ import annotations import datetime as dt diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cf956ec2654..e8858936e83 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. + from __future__ import annotations import copy diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f252ed1a9aa..19c5b827d50 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. + import itertools import numbers import pickle diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 2e49ee0bc43..2f9a78aab78 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 67fd07dfcd8..d4dca164992 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import operator diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 3739e226cc6..d0e31a82b28 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. import re diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index b59f76bd8bf..1d313c9f464 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. + import datetime import datetime as dt import operator diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 0363534cdd5..92e70543cbe 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. + import numpy as np import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 9331fe5900f..b4a45ed001b 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import numpy as np import pytest diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index bc3af8581b5..3542a5af537 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import itertools diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index b59e352ff87..adb6bb33763 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. """ Test related to Index diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index b4558cec01f..6e33b1421c8 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + from itertools import combinations import cupy diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index fe365f4e120..791598110df 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. import copy import itertools diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 17f73121b1c..a70dd7f4024 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import numpy as np import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_ops.py b/python/cudf/cudf/tests/test_ops.py index 981b0e833a0..8cdef19d9ba 100644 --- a/python/cudf/cudf/tests/test_ops.py +++ b/python/cudf/cudf/tests/test_ops.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ import numpy as np import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 323f8c62892..80a2e89bf46 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from __future__ import division, print_function diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index a440c50d48f..1dd3a5c1c8c 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. + import textwrap import cupy as cp diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 030cbe7977d..5e90c2348e4 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import re import numpy as np diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 794d3be889a..c701e863c35 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import math import numpy as np diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 3ffd7786057..b6210be62f3 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ import operator import re from string import ascii_letters, digits diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index ba0509b16d4..2d4791f541c 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. import numpy as np import pandas as pd diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 7c4cfee3f75..e30194e9eda 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. from itertools import product diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 2f0b51ba377..e8483e44462 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. import re diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 997249e3140..f98f897ef72 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1,4 +1,5 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. + import re from contextlib import ExitStack as does_not_raise from sys import getsizeof diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 90850ff0648..d55bc533ba8 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ import datetime import operator import re diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index ba1fb1882b1..aebdb9fe5b9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import random import cupy as cp diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 22ba604d682..2bb80b85568 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + import numpy as np import pandas as pd import pytest diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index b9f5df6e96f..4da81e4f86c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ import numpy as np import pandas as pd import pytest From 7a534b072c034a292232b96be8b443893f7c6d30 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 23 Feb 2021 16:19:37 -0800 Subject: [PATCH 12/35] pin pandas upper bound version --- conda/environments/cudf_dev_cuda10.1.yml | 2 +- conda/environments/cudf_dev_cuda10.2.yml | 2 +- conda/environments/cudf_dev_cuda11.0.yml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index d8655bea3aa..993f64261d4 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49.0,!=0.51.0 - numpy - - pandas>=1.0 + - pandas>=1.0,<1.3.0dev0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index f6113921323..e69289c51da 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49,!=0.51.0 - numpy - - pandas>=1.0 + - pandas>=1.0,<1.3.0dev0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 20481eea580..aea87ba423c 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.6,<3.8 - numba>=0.49,!=0.51.0 - numpy - - pandas>=1.0 + - pandas>=1.0,<1.3.0dev0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 58a1c2d4e8c..9afc7094f27 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -35,7 +35,7 @@ requirements: - protobuf - python - typing_extensions - - pandas >=1.0 + - pandas >=1.0,<1.3.0dev0 - cupy 
>7.1.0,<9.0.0a0 - numba >=0.49.0 - numpy From 81d9b5d853337d30ad4b7b25cef4538daccc2b00 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 23 Feb 2021 16:31:31 -0800 Subject: [PATCH 13/35] use only minor version --- python/cudf/cudf/core/_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e8b0259c142..0fedfcabb46 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -6,4 +6,4 @@ PANDAS_VERSION = version.parse(pd.__version__) PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0") PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1") -PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2.0") +PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2") From c5b83a21efc5e51fb550d114c559b5dfdfd0f9c0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 08:33:04 -0800 Subject: [PATCH 14/35] use functools for finding union --- python/cudf/cudf/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e8858936e83..dedefeaf9a2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -339,9 +339,9 @@ def _concat( np.intersect1d, all_columns_list ) # get column names not present in all objs - union_of_columns = objs[0].columns - for obj in objs[1:]: - union_of_columns = union_of_columns.union(obj.columns) + union_of_columns = functools.reduce( + pd.Index.union, [obj.columns for obj in objs] + ) non_intersecting_columns = union_of_columns.symmetric_difference( intersecting_columns ) From 5e6855ddf80ee27b7c395276e1c132988776ffee Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 10:10:19 -0800 Subject: [PATCH 15/35] add utility for creating a pandas series and refactor imports in test_dataframe --- python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/tests/test_dataframe.py | 1388 
+++++++++++---------- python/cudf/cudf/tests/test_datetime.py | 8 +- python/cudf/cudf/tests/test_dropna.py | 2 +- python/cudf/cudf/tests/test_duplicates.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_repr.py | 4 +- python/cudf/cudf/tests/test_rolling.py | 4 +- python/cudf/cudf/tests/test_series.py | 6 +- python/cudf/cudf/tests/test_stats.py | 7 +- python/cudf/cudf/utils/utils.py | 18 +- 11 files changed, 739 insertions(+), 710 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index cea22725002..a22fdf65f9f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -584,10 +584,8 @@ def deserialize(cls, header, frames): @property def dtypes(self): """Return the dtypes in this object.""" - return pd.Series( - [x.dtype for x in self._data.columns], - index=self._data.names, - dtype=None if len(self._data.names) else "float64", + return cudf.utils.utils.create_pandas_series( + data=[x.dtype for x in self._data.columns], index=self._data.names, ) @property @@ -692,7 +690,7 @@ def __getitem__(self, arg): elif can_convert_to_column(arg): mask = arg if is_list_like(mask): - mask = pd.Series(mask, dtype=None if len(mask) else "float64") + mask = cudf.utils.utils.create_pandas_series(data=mask) if mask.dtype == "bool": return self._apply_boolean_mask(mask) else: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f95913e5cee..d792c62a247 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,11 +1,14 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import array as arr +import datetime import io import operator import random import re +import string import textwrap +from copy import copy import cupy import numpy as np @@ -14,7 +17,7 @@ import pytest from numba import cuda -import cudf as gd +import cudf from cudf.core._compat import PANDAS_GE_110 from cudf.core.column import column from cudf.tests import utils @@ -37,7 +40,7 @@ def test_init_via_list_of_tuples(): ] pdf = pd.DataFrame(data) - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) assert_eq(pdf, gdf) @@ -70,7 +73,7 @@ def test_init_via_list_of_empty_tuples(rows): data = [()] * rows pdf = pd.DataFrame(data) - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) assert_eq( pdf, @@ -109,15 +112,15 @@ def test_init_via_list_of_empty_tuples(rows): ) def test_init_from_series_align(dict_of_series): pdf = pd.DataFrame(dict_of_series) - gdf = gd.DataFrame(dict_of_series) + gdf = cudf.DataFrame(dict_of_series) assert_eq(pdf, gdf) for key in dict_of_series: if isinstance(dict_of_series[key], pd.Series): - dict_of_series[key] = gd.Series(dict_of_series[key]) + dict_of_series[key] = cudf.Series(dict_of_series[key]) - gdf = gd.DataFrame(dict_of_series) + gdf = cudf.DataFrame(dict_of_series) assert_eq(pdf, gdf) @@ -145,7 +148,7 @@ def test_init_from_series_align(dict_of_series): ) def test_init_from_series_align_nonunique(dict_of_series, expectation): with expectation: - gdf = gd.DataFrame(dict_of_series) + gdf = cudf.DataFrame(dict_of_series) if expectation == does_not_raise(): pdf = pd.DataFrame(dict_of_series) @@ -160,10 +163,10 @@ def test_init_unaligned_with_index(): }, index=[7, 8, 9], ) - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { - "a": gd.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), - "b": gd.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), + "a": cudf.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), + "b": cudf.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), }, index=[7, 8, 9], ) @@ -174,7 +177,7 @@ def test_init_unaligned_with_index(): def test_series_basic(): # Make 
series from buffer a1 = np.arange(10, dtype=np.float64) - series = gd.Series(a1) + series = cudf.Series(a1) assert len(series) == 10 np.testing.assert_equal(series.to_array(), np.hstack([a1])) @@ -183,8 +186,8 @@ def test_series_from_cupy_scalars(): data = [0.1, 0.2, 0.3] data_np = np.array(data) data_cp = cupy.array(data) - s_np = gd.Series([data_np[0], data_np[2]]) - s_cp = gd.Series([data_cp[0], data_cp[2]]) + s_np = cudf.Series([data_np[0], data_np[2]]) + s_cp = cudf.Series([data_cp[0], data_cp[2]]) assert_eq(s_np, s_cp) @@ -196,7 +199,7 @@ def test_append_index(a, b): df["a"] = a df["b"] = b - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = a gdf["b"] = b @@ -218,17 +221,17 @@ def test_series_init_none(): # test for creating empty series # 1: without initializing - sr1 = gd.Series() + sr1 = cudf.Series() got = sr1.to_string() - print(got) + expect = "Series([], dtype: float64)" # values should match despite whitespace difference assert got.split() == expect.split() # 2: Using `None` as an initializer - sr2 = gd.Series(None) + sr2 = cudf.Series(None) got = sr2.to_string() - print(got) + expect = "Series([], dtype: float64)" # values should match despite whitespace difference assert got.split() == expect.split() @@ -236,7 +239,7 @@ def test_series_init_none(): def test_dataframe_basic(): np.random.seed(0) - df = gd.DataFrame() + df = cudf.DataFrame() # Populate with cuda memory df["keys"] = np.arange(10, dtype=np.float64) @@ -251,12 +254,12 @@ def test_dataframe_basic(): assert tuple(df.columns) == ("keys", "vals") # Make another dataframe - df2 = gd.DataFrame() + df2 = cudf.DataFrame() df2["keys"] = np.array([123], dtype=np.float64) df2["vals"] = np.array([321], dtype=np.float64) # Concat - df = gd.concat([df, df2]) + df = cudf.concat([df, df2]) assert len(df) == 11 hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123]) @@ -270,21 +273,19 @@ def test_dataframe_basic(): expect = np.vstack([hkeys, hvals]).T - print(expect) - print(mat) 
np.testing.assert_equal(mat, expect) # test dataframe with tuple name - df_tup = gd.DataFrame() + df_tup = cudf.DataFrame() data = np.arange(10) df_tup[(1, "foobar")] = data np.testing.assert_equal(data, df_tup[(1, "foobar")].to_array()) - df = gd.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) + df = cudf.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) pdf = pd.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) assert_eq(df, pdf) - gdf = gd.DataFrame({"id": [0, 1], "val": [None, None]}) + gdf = cudf.DataFrame({"id": [0, 1], "val": [None, None]}) gdf["val"] = gdf["val"].astype("int") assert gdf["val"].isnull().all() @@ -305,7 +306,7 @@ def test_dataframe_basic(): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_columns(pdf, columns, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(columns=columns, inplace=inplace) actual = gdf.drop(columns=columns, inplace=inplace) @@ -333,7 +334,7 @@ def test_dataframe_drop_columns(pdf, columns, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(labels=labels, axis=0, inplace=inplace) actual = gdf.drop(labels=labels, axis=0, inplace=inplace) @@ -361,7 +362,7 @@ def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_index(pdf, index, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(index=index, inplace=inplace) actual = gdf.drop(index=index, inplace=inplace) @@ -426,7 +427,7 @@ def test_dataframe_drop_index(pdf, index, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_multiindex(pdf, index, level, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = 
cudf.from_pandas(pdf) expected = pdf.drop(index=index, inplace=inplace, level=level) actual = gdf.drop(index=index, inplace=inplace, level=level) @@ -453,7 +454,7 @@ def test_dataframe_drop_multiindex(pdf, index, level, inplace): @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_labels_axis_1(pdf, labels, inplace): pdf = pdf.copy() - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.drop(labels=labels, axis=1, inplace=inplace) actual = gdf.drop(labels=labels, axis=1, inplace=inplace) @@ -466,7 +467,7 @@ def test_dataframe_drop_labels_axis_1(pdf, labels, inplace): def test_dataframe_drop_error(): - df = gd.DataFrame({"a": [1], "b": [2], "c": [3]}) + df = cudf.DataFrame({"a": [1], "b": [2], "c": [3]}) pdf = df.to_pandas() assert_exceptions_equal( @@ -511,7 +512,7 @@ def test_dataframe_drop_error(): def test_dataframe_drop_raises(): - df = gd.DataFrame( + df = cudf.DataFrame( {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] ) pdf = df.to_pandas() @@ -556,7 +557,7 @@ def test_dataframe_drop_raises(): def test_dataframe_column_add_drop_via_setitem(): - df = gd.DataFrame() + df = cudf.DataFrame() data = np.asarray(range(10)) df["a"] = data df["b"] = data @@ -573,7 +574,7 @@ def test_dataframe_column_set_via_attr(): data_0 = np.asarray([0, 2, 4, 5]) data_1 = np.asarray([1, 4, 2, 3]) data_2 = np.asarray([2, 0, 3, 0]) - df = gd.DataFrame({"a": data_0, "b": data_1, "c": data_2}) + df = cudf.DataFrame({"a": data_0, "b": data_1, "c": data_2}) for i in range(10): df.c = df.a @@ -586,7 +587,7 @@ def test_dataframe_column_set_via_attr(): def test_dataframe_column_drop_via_attr(): - df = gd.DataFrame({"a": []}) + df = cudf.DataFrame({"a": []}) with pytest.raises(AttributeError): del df.a @@ -597,7 +598,7 @@ def test_dataframe_column_drop_via_attr(): @pytest.mark.parametrize("axis", [0, "index"]) def test_dataframe_index_rename(axis): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf = 
gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) expect = pdf.rename(mapper={1: 5, 2: 6}, axis=axis) got = gdf.rename(mapper={1: 5, 2: 6}, axis=axis) @@ -621,7 +622,7 @@ def test_dataframe_index_rename(axis): def test_dataframe_MI_rename(): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} ) gdg = gdf.groupby(["a", "b"]).count() @@ -636,7 +637,7 @@ def test_dataframe_MI_rename(): @pytest.mark.parametrize("axis", [1, "columns"]) def test_dataframe_column_rename(axis): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) expect = pdf.rename(mapper=lambda name: 2 * name, axis=axis) got = gdf.rename(mapper=lambda name: 2 * name, axis=axis) @@ -659,7 +660,7 @@ def test_dataframe_pop(): pdf = pd.DataFrame( {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]} ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) # Test non-existing column error with pytest.raises(KeyError) as raises: @@ -686,7 +687,7 @@ def test_dataframe_pop(): # check empty dataframe edge case empty_pdf = pd.DataFrame(columns=["a", "b"]) - empty_gdf = gd.DataFrame(columns=["a", "b"]) + empty_gdf = cudf.DataFrame(columns=["a", "b"]) pb = empty_pdf.pop("b") gb = empty_gdf.pop("b") assert len(pb) == len(gb) @@ -695,7 +696,7 @@ def test_dataframe_pop(): @pytest.mark.parametrize("nelem", [0, 3, 100, 1000]) def test_dataframe_astype(nelem): - df = gd.DataFrame() + df = cudf.DataFrame() data = np.asarray(range(nelem), dtype=np.int32) df["a"] = data assert df["a"].dtype is np.dtype(np.int32) @@ -706,7 +707,7 @@ def test_dataframe_astype(nelem): @pytest.mark.parametrize("nelem", [0, 100]) def test_index_astype(nelem): - df = gd.DataFrame() + df = cudf.DataFrame() data = np.asarray(range(nelem), dtype=np.int32) df["a"] = data assert df.index.dtype is np.dtype(np.int64) @@ -725,13 +726,15 @@ def 
test_dataframe_to_string(): pd.options.display.max_rows = 5 pd.options.display.max_columns = 8 # Test basic - df = gd.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]}) + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) string = str(df) - print(string) + assert string.splitlines()[-1] == "[6 rows x 2 columns]" # Test skipped columns - df = gd.DataFrame( + df = cudf.DataFrame( { "a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16], @@ -740,17 +743,19 @@ def test_dataframe_to_string(): } ) string = df.to_string() - print(string) + assert string.splitlines()[-1] == "[6 rows x 4 columns]" # Test masked - df = gd.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]}) + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) data = np.arange(6) - mask = np.zeros(1, dtype=gd.utils.utils.mask_dtype) + mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) mask[0] = 0b00101101 - masked = gd.Series.from_masked_array(data, mask) + masked = cudf.Series.from_masked_array(data, mask) assert masked.null_count == 2 df["c"] = masked @@ -766,11 +771,11 @@ def test_dataframe_to_string(): # null position is correct for i in range(len(values)): if i not in validids: - assert values[i] is gd.NA + assert values[i] is cudf.NA pd.options.display.max_rows = 10 got = df.to_string() - print(got) + expect = """ a b c 0 1 11 0 @@ -787,12 +792,12 @@ def test_dataframe_to_string(): def test_dataframe_to_string_wide(monkeypatch): monkeypatch.setenv("COLUMNS", "79") # Test basic - df = gd.DataFrame() + df = cudf.DataFrame() for i in range(100): df["a{}".format(i)] = list(range(3)) pd.options.display.max_columns = 0 got = df.to_string() - print(got) + expect = """ a0 a1 a2 a3 a4 a5 a6 a7 ... a92 a93 a94 a95 a96 a97 a98 a99 0 0 0 0 0 0 0 0 0 ... 
0 0 0 0 0 0 0 0 @@ -806,9 +811,9 @@ def test_dataframe_to_string_wide(monkeypatch): def test_dataframe_empty_to_string(): # Test for printing empty dataframe - df = gd.DataFrame() + df = cudf.DataFrame() got = df.to_string() - print(got) + expect = "Empty DataFrame\nColumns: []\nIndex: []\n" # values should match despite whitespace difference assert got.split() == expect.split() @@ -816,11 +821,11 @@ def test_dataframe_empty_to_string(): def test_dataframe_emptycolumns_to_string(): # Test for printing dataframe having empty columns - df = gd.DataFrame() + df = cudf.DataFrame() df["a"] = [] df["b"] = [] got = df.to_string() - print(got) + expect = "Empty DataFrame\nColumns: [a, b]\nIndex: []\n" # values should match despite whitespace difference assert got.split() == expect.split() @@ -828,14 +833,12 @@ def test_dataframe_emptycolumns_to_string(): def test_dataframe_copy(): # Test for copying the dataframe using python copy pkg - from copy import copy - - df = gd.DataFrame() + df = cudf.DataFrame() df["a"] = [1, 2, 3] df2 = copy(df) df2["b"] = [4, 5, 6] got = df.to_string() - print(got) + expect = """ a 0 1 @@ -848,12 +851,12 @@ def test_dataframe_copy(): def test_dataframe_copy_shallow(): # Test for copy dataframe using class method - df = gd.DataFrame() + df = cudf.DataFrame() df["a"] = [1, 2, 3] df2 = df.copy() df2["b"] = [4, 2, 3] got = df.to_string() - print(got) + expect = """ a 0 1 @@ -868,7 +871,9 @@ def test_dataframe_dtypes(): dtypes = pd.Series( [np.int32, np.float32, np.float64], index=["c", "a", "b"] ) - df = gd.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.iteritems()}) + df = cudf.DataFrame( + {k: np.ones(10, dtype=v) for k, v in dtypes.iteritems()} + ) assert df.dtypes.equals(dtypes) @@ -879,7 +884,7 @@ def test_dataframe_add_col_to_object_dataframe(): data = {k: v for (k, v) in zip(cols, [["a"] for _ in cols])} - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) gdf = gdf[:0] assert gdf.dtypes.equals(df.dtypes) @@ -892,7 +897,7 @@ def 
test_dataframe_add_col_to_object_dataframe(): def test_dataframe_dir_and_getattr(): - df = gd.DataFrame( + df = cudf.DataFrame( { "a": np.ones(10), "b": np.ones(10), @@ -914,13 +919,13 @@ def test_dataframe_dir_and_getattr(): @pytest.mark.parametrize("order", ["C", "F"]) def test_empty_dataframe_as_gpu_matrix(order): - df = gd.DataFrame() + df = cudf.DataFrame() # Check fully empty dataframe. mat = df.as_gpu_matrix(order=order).copy_to_host() assert mat.shape == (0, 0) - df = gd.DataFrame() + df = cudf.DataFrame() nelem = 123 for k in "abc": df[k] = np.random.random(nelem) @@ -932,7 +937,7 @@ def test_empty_dataframe_as_gpu_matrix(order): @pytest.mark.parametrize("order", ["C", "F"]) def test_dataframe_as_gpu_matrix(order): - df = gd.DataFrame() + df = cudf.DataFrame() nelem = 123 for k in "abcd": @@ -953,7 +958,7 @@ def test_dataframe_as_gpu_matrix(order): def test_dataframe_as_gpu_matrix_null_values(): - df = gd.DataFrame() + df = cudf.DataFrame() nelem = 123 na = -10000 @@ -989,7 +994,7 @@ def test_dataframe_append_empty(): "value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], } ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) gdf["newcol"] = 100 pdf["newcol"] = 100 @@ -1006,30 +1011,30 @@ def test_dataframe_setitem_from_masked_object(): np.random.shuffle(mask) ary[mask] = np.nan - test1_null = gd.Series(ary, nan_as_null=True) + test1_null = cudf.Series(ary, nan_as_null=True) assert test1_null.nullable assert test1_null.null_count == 20 - test1_nan = gd.Series(ary, nan_as_null=False) + test1_nan = cudf.Series(ary, nan_as_null=False) assert test1_nan.null_count == 0 - test2_null = gd.DataFrame.from_pandas( + test2_null = cudf.DataFrame.from_pandas( pd.DataFrame({"a": ary}), nan_as_null=True ) assert test2_null["a"].nullable assert test2_null["a"].null_count == 20 - test2_nan = gd.DataFrame.from_pandas( + test2_nan = cudf.DataFrame.from_pandas( pd.DataFrame({"a": ary}), nan_as_null=False ) assert test2_nan["a"].null_count == 0 gpu_ary = 
cupy.asarray(ary) - test3_null = gd.Series(gpu_ary, nan_as_null=True) + test3_null = cudf.Series(gpu_ary, nan_as_null=True) assert test3_null.nullable assert test3_null.null_count == 20 - test3_nan = gd.Series(gpu_ary, nan_as_null=False) + test3_nan = cudf.Series(gpu_ary, nan_as_null=False) assert test3_nan.null_count == 0 - test4 = gd.DataFrame() + test4 = cudf.DataFrame() lst = [1, 2, None, 4, 5, 6, None, 8, 9] test4["lst"] = lst assert test4["lst"].nullable @@ -1041,7 +1046,7 @@ def test_dataframe_append_to_empty(): pdf["a"] = [] pdf["b"] = [1, 2, 3] - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = [] gdf["b"] = [1, 2, 3] @@ -1049,7 +1054,7 @@ def test_dataframe_append_to_empty(): def test_dataframe_setitem_index_len1(): - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = [1] gdf["b"] = gdf.index._values @@ -1057,7 +1062,7 @@ def test_dataframe_setitem_index_len1(): def test_assign(): - gdf = gd.DataFrame({"x": [1, 2, 3]}) + gdf = cudf.DataFrame({"x": [1, 2, 3]}) gdf2 = gdf.assign(y=gdf.x + 1) assert list(gdf.columns) == ["x"] assert list(gdf2.columns) == ["x", "y"] @@ -1067,7 +1072,7 @@ def test_assign(): @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) def test_dataframe_hash_columns(nrows): - gdf = gd.DataFrame() + gdf = cudf.DataFrame() data = np.asarray(range(nrows)) data[0] = data[-1] # make first and last the same gdf["a"] = data @@ -1085,7 +1090,7 @@ def test_dataframe_hash_columns(nrows): out_one = cupy.asnumpy(gdf.hash_columns(["a"])) # First matches last assert out_one[0] == out_one[-1] - # Equivalent to the gd.Series.hash_values() + # Equivalent to the cudf.Series.hash_values() np.testing.assert_array_equal(cupy.asnumpy(gdf.a.hash_values()), out_one) @@ -1094,7 +1099,7 @@ def test_dataframe_hash_columns(nrows): @pytest.mark.parametrize("nkeys", [1, 2]) def test_dataframe_hash_partition(nrows, nparts, nkeys): np.random.seed(123) - gdf = gd.DataFrame() + gdf = cudf.DataFrame() keycols = [] for i in range(nkeys): keyname = 
"key{}".format(i) @@ -1108,7 +1113,7 @@ def test_dataframe_hash_partition(nrows, nparts, nkeys): # Must have correct number of partitions assert len(got) == nparts # All partitions must be DataFrame type - assert all(isinstance(p, gd.DataFrame) for p in got) + assert all(isinstance(p, cudf.DataFrame) for p in got) # Check that all partitions have unique keys part_unique_keys = set() for p in got: @@ -1123,7 +1128,7 @@ def test_dataframe_hash_partition(nrows, nparts, nkeys): @pytest.mark.parametrize("nrows", [3, 10, 50]) def test_dataframe_hash_partition_masked_value(nrows): - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["key"] = np.arange(nrows) gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) @@ -1144,7 +1149,7 @@ def test_dataframe_hash_partition_masked_value(nrows): @pytest.mark.parametrize("nrows", [3, 10, 50]) def test_dataframe_hash_partition_masked_keys(nrows): - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["key"] = np.arange(nrows) gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) @@ -1167,14 +1172,14 @@ def test_dataframe_hash_partition_masked_keys(nrows): @pytest.mark.parametrize("keep_index", [True, False]) def test_dataframe_hash_partition_keep_index(keep_index): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( {"val": [1, 2, 3, 4], "key": [3, 2, 1, 4]}, index=[4, 3, 2, 1] ) - expected_df1 = gd.DataFrame( + expected_df1 = cudf.DataFrame( {"val": [1], "key": [3]}, index=[4] if keep_index else None ) - expected_df2 = gd.DataFrame( + expected_df2 = cudf.DataFrame( {"val": [2, 3, 4], "key": [2, 1, 4]}, index=[3, 2, 1] if keep_index else range(1, 4), ) @@ -1187,7 +1192,7 @@ def test_dataframe_hash_partition_keep_index(keep_index): def test_dataframe_hash_partition_empty(): - gdf = gd.DataFrame({"val": [1, 2], "key": [3, 2]}, index=["a", "b"]) + gdf = cudf.DataFrame({"val": [1, 2], "key": [3, 2]}, index=["a", "b"]) parts = gdf.iloc[:0].partition_by_hash(["key"], nparts=3) assert len(parts) == 3 for part 
in parts: @@ -1201,33 +1206,33 @@ def test_dataframe_concat_different_numerical_columns(dtype1, dtype2): df2 = pd.DataFrame(dict(x=pd.Series(np.arange(5)).astype(dtype2))) if dtype1 != dtype2 and "datetime" in dtype1 or "datetime" in dtype2: with pytest.raises(TypeError): - gd.concat([df1, df2]) + cudf.concat([df1, df2]) else: pres = pd.concat([df1, df2]) - gres = gd.concat([gd.from_pandas(df1), gd.from_pandas(df2)]) - assert_eq(gd.from_pandas(pres), gres) + gres = cudf.concat([cudf.from_pandas(df1), cudf.from_pandas(df2)]) + assert_eq(cudf.from_pandas(pres), gres) def test_dataframe_concat_different_column_types(): - df1 = gd.Series([42], dtype=np.float) - df2 = gd.Series(["a"], dtype="category") + df1 = cudf.Series([42], dtype=np.float) + df2 = cudf.Series(["a"], dtype="category") with pytest.raises(ValueError): - gd.concat([df1, df2]) + cudf.concat([df1, df2]) - df2 = gd.Series(["a string"]) + df2 = cudf.Series(["a string"]) with pytest.raises(TypeError): - gd.concat([df1, df2]) + cudf.concat([df1, df2]) @pytest.mark.parametrize( - "df_1", [gd.DataFrame({"a": [1, 2], "b": [1, 3]}), gd.DataFrame({})] + "df_1", [cudf.DataFrame({"a": [1, 2], "b": [1, 3]}), cudf.DataFrame({})] ) @pytest.mark.parametrize( - "df_2", [gd.DataFrame({"a": [], "b": []}), gd.DataFrame({})] + "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] ) def test_concat_empty_dataframe(df_1, df_2): - got = gd.concat([df_1, df_2]) + got = cudf.concat([df_1, df_2]) expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) # ignoring dtypes as pandas upcasts int to float @@ -1254,8 +1259,8 @@ def test_concat_empty_dataframe(df_1, df_2): ], ) def test_concat_different_column_dataframe(df1_d, df2_d): - got = gd.concat( - [gd.DataFrame(df1_d), gd.DataFrame(df2_d), gd.DataFrame(df1_d)], + got = cudf.concat( + [cudf.DataFrame(df1_d), cudf.DataFrame(df2_d), cudf.DataFrame(df1_d)], sort=False, ) @@ -1279,7 +1284,7 @@ def test_concat_different_column_dataframe(df1_d, df2_d): ) 
@pytest.mark.parametrize("ser_2", [pd.Series([], dtype="float64")]) def test_concat_empty_series(ser_1, ser_2): - got = gd.concat([gd.Series(ser_1), gd.Series(ser_2)]) + got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) expect = pd.concat([ser_1, ser_2]) assert_eq(got, expect) @@ -1290,49 +1295,49 @@ def test_concat_with_axis(): df2 = pd.DataFrame(dict(a=np.arange(5), b=np.arange(5))) concat_df = pd.concat([df1, df2], axis=1) - cdf1 = gd.from_pandas(df1) - cdf2 = gd.from_pandas(df2) + cdf1 = cudf.from_pandas(df1) + cdf2 = cudf.from_pandas(df2) # concat only dataframes - concat_cdf = gd.concat([cdf1, cdf2], axis=1) + concat_cdf = cudf.concat([cdf1, cdf2], axis=1) assert_eq(concat_cdf, concat_df) # concat only series concat_s = pd.concat([df1.x, df1.y], axis=1) - cs1 = gd.Series.from_pandas(df1.x) - cs2 = gd.Series.from_pandas(df1.y) - concat_cdf_s = gd.concat([cs1, cs2], axis=1) + cs1 = cudf.Series.from_pandas(df1.x) + cs2 = cudf.Series.from_pandas(df1.y) + concat_cdf_s = cudf.concat([cs1, cs2], axis=1) assert_eq(concat_cdf_s, concat_s) # concat series and dataframes s3 = pd.Series(np.random.random(5)) - cs3 = gd.Series.from_pandas(s3) + cs3 = cudf.Series.from_pandas(s3) - concat_cdf_all = gd.concat([cdf1, cs3, cdf2], axis=1) + concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1) concat_df_all = pd.concat([df1, s3, df2], axis=1) assert_eq(concat_cdf_all, concat_df_all) # concat manual multi index - midf1 = gd.from_pandas(df1) - midf1.index = gd.MultiIndex( + midf1 = cudf.from_pandas(df1) + midf1.index = cudf.MultiIndex( levels=[[0, 1, 2, 3], [0, 1]], codes=[[0, 1, 2, 3, 2], [0, 1, 0, 1, 0]] ) midf2 = midf1[2:] - midf2.index = gd.MultiIndex( + midf2.index = cudf.MultiIndex( levels=[[3, 4, 5], [2, 0]], codes=[[0, 1, 2], [1, 0, 1]] ) mipdf1 = midf1.to_pandas() mipdf2 = midf2.to_pandas() - assert_eq(gd.concat([midf1, midf2]), pd.concat([mipdf1, mipdf2])) - assert_eq(gd.concat([midf2, midf1]), pd.concat([mipdf2, mipdf1])) + assert_eq(cudf.concat([midf1, 
midf2]), pd.concat([mipdf1, mipdf2])) + assert_eq(cudf.concat([midf2, midf1]), pd.concat([mipdf2, mipdf1])) assert_eq( - gd.concat([midf1, midf2, midf1]), pd.concat([mipdf1, mipdf2, mipdf1]) + cudf.concat([midf1, midf2, midf1]), pd.concat([mipdf1, mipdf2, mipdf1]) ) # concat groupby multi index - gdf1 = gd.DataFrame( + gdf1 = cudf.DataFrame( { "x": np.random.randint(0, 10, 10), "y": np.random.randint(0, 10, 10), @@ -1346,8 +1351,8 @@ def test_concat_with_axis(): pdg1 = gdg1.to_pandas() pdg2 = gdg2.to_pandas() - assert_eq(gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2])) - assert_eq(gd.concat([gdg2, gdg1]), pd.concat([pdg2, pdg1])) + assert_eq(cudf.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2])) + assert_eq(cudf.concat([gdg2, gdg1]), pd.concat([pdg2, pdg1])) # series multi index concat gdgz1 = gdg1.z @@ -1355,15 +1360,15 @@ def test_concat_with_axis(): pdgz1 = gdgz1.to_pandas() pdgz2 = gdgz2.to_pandas() - assert_eq(gd.concat([gdgz1, gdgz2]), pd.concat([pdgz1, pdgz2])) - assert_eq(gd.concat([gdgz2, gdgz1]), pd.concat([pdgz2, pdgz1])) + assert_eq(cudf.concat([gdgz1, gdgz2]), pd.concat([pdgz1, pdgz2])) + assert_eq(cudf.concat([gdgz2, gdgz1]), pd.concat([pdgz2, pdgz1])) @pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000]) def test_nonmatching_index_setitem(nrows): np.random.seed(0) - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = np.random.randint(2147483647, size=nrows) gdf["b"] = np.random.randint(2147483647, size=nrows) gdf = gdf.set_index("b") @@ -1374,20 +1379,20 @@ def test_nonmatching_index_setitem(nrows): assert ( gdf["c"] .to_pandas() - .equals(gd.Series(test_values).set_index(gdf._index).to_pandas()) + .equals(cudf.Series(test_values).set_index(gdf._index).to_pandas()) ) def test_from_pandas(): df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) - gdf = gd.DataFrame.from_pandas(df) - assert isinstance(gdf, gd.DataFrame) + gdf = cudf.DataFrame.from_pandas(df) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) s = df.x - gs = 
gd.Series.from_pandas(s) - assert isinstance(gs, gd.Series) + gs = cudf.Series.from_pandas(s) + assert isinstance(gs, cudf.Series) assert_eq(s, gs) @@ -1397,14 +1402,14 @@ def test_from_records(dtypes): h_ary = np.ndarray(shape=(10, 4), dtype=dtypes) rec_ary = h_ary.view(np.recarray) - gdf = gd.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) + gdf = cudf.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) df = pd.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame.from_records(rec_ary) + gdf = cudf.DataFrame.from_records(rec_ary) df = pd.DataFrame.from_records(rec_ary) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) @@ -1426,9 +1431,9 @@ def test_from_records_index(columns, index): [("Rex", 9, 81.0), ("Fido", 3, 27.0)], dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")], ) - gdf = gd.DataFrame.from_records(rec_ary, columns=columns, index=index) + gdf = cudf.DataFrame.from_records(rec_ary, columns=columns, index=index) df = pd.DataFrame.from_records(rec_ary, columns=columns, index=index) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) @@ -1436,37 +1441,37 @@ def test_dataframe_construction_from_cupy_arrays(): h_ary = np.array([[1, 2, 3], [4, 5, 6]], np.int32) d_ary = cupy.asarray(h_ary) - gdf = gd.DataFrame(d_ary, columns=["a", "b", "c"]) + gdf = cudf.DataFrame(d_ary, columns=["a", "b", "c"]) df = pd.DataFrame(h_ary, columns=["a", "b", "c"]) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame(d_ary) + gdf = cudf.DataFrame(d_ary) df = pd.DataFrame(h_ary) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame(d_ary, index=["a", "b"]) + gdf = cudf.DataFrame(d_ary, index=["a", "b"]) 
df = pd.DataFrame(h_ary, index=["a", "b"]) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame(d_ary) + gdf = cudf.DataFrame(d_ary) gdf = gdf.set_index(keys=0, drop=False) df = pd.DataFrame(h_ary) df = df.set_index(keys=0, drop=False) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) - gdf = gd.DataFrame(d_ary) + gdf = cudf.DataFrame(d_ary) gdf = gdf.set_index(keys=1, drop=False) df = pd.DataFrame(h_ary) df = df.set_index(keys=1, drop=False) - assert isinstance(gdf, gd.DataFrame) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) @@ -1476,7 +1481,7 @@ def test_dataframe_cupy_wrong_dimensions(): with pytest.raises( ValueError, match="records dimension expected 1 or 2 but found: 3" ): - gd.DataFrame(d_ary) + cudf.DataFrame(d_ary) def test_dataframe_cupy_array_wrong_index(): @@ -1487,19 +1492,19 @@ def test_dataframe_cupy_array_wrong_index(): match="Length mismatch: Expected axis has 2 elements, " "new values have 1 elements", ): - gd.DataFrame(d_ary, index=["a"]) + cudf.DataFrame(d_ary, index=["a"]) with pytest.raises( ValueError, match="Length mismatch: Expected axis has 2 elements, " "new values have 1 elements", ): - gd.DataFrame(d_ary, index="a") + cudf.DataFrame(d_ary, index="a") def test_index_in_dataframe_constructor(): a = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) - b = gd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) + b = cudf.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) assert_eq(a, b) assert_eq(a.loc[4:], b.loc[4:]) @@ -1520,14 +1525,14 @@ def test_from_arrow(nelem, data_type): padf = pa.Table.from_pandas( df, preserve_index=False ).replace_schema_metadata(None) - gdf = gd.DataFrame.from_arrow(padf) - assert isinstance(gdf, gd.DataFrame) + gdf = cudf.DataFrame.from_arrow(padf) + assert isinstance(gdf, cudf.DataFrame) assert_eq(df, gdf) s = pa.Array.from_pandas(df.a) - gs = gd.Series.from_arrow(s) - assert 
isinstance(gs, gd.Series) + gs = cudf.Series.from_arrow(s) + assert isinstance(gs, cudf.Series) # For some reason PyArrow to_pandas() converts to numpy array and has # better type compatibility @@ -1543,7 +1548,7 @@ def test_to_arrow(nelem, data_type): "b": np.random.randint(0, 1000, nelem).astype(data_type), } ) - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) pa_df = pa.Table.from_pandas( df, preserve_index=False @@ -1579,8 +1584,8 @@ def test_to_from_arrow_nulls(data_type): time_unit, _ = np.datetime_data(dtype) data_type = pa.timestamp(unit=time_unit) s1 = pa.array([1, None, 3, None, 5], type=data_type) - gs1 = gd.Series.from_arrow(s1) - assert isinstance(gs1, gd.Series) + gs1 = cudf.Series.from_arrow(s1) + assert isinstance(gs1, cudf.Series) # We have 64B padded buffers for nulls whereas Arrow returns a minimal # number of bytes, so only check the first byte in this case np.testing.assert_array_equal( @@ -1590,8 +1595,8 @@ def test_to_from_arrow_nulls(data_type): assert pa.Array.equals(s1, gs1.to_arrow()) s2 = pa.array([None, None, None, None, None], type=data_type) - gs2 = gd.Series.from_arrow(s2) - assert isinstance(gs2, gd.Series) + gs2 = cudf.Series.from_arrow(s2) + assert isinstance(gs2, cudf.Series) # We have 64B padded buffers for nulls whereas Arrow returns a minimal # number of bytes, so only check the first byte in this case np.testing.assert_array_equal( @@ -1604,7 +1609,7 @@ def test_to_from_arrow_nulls(data_type): def test_to_arrow_categorical(): df = pd.DataFrame() df["a"] = pd.Series(["a", "b", "c"], dtype="category") - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) pa_df = pa.Table.from_pandas( df, preserve_index=False @@ -1624,9 +1629,9 @@ def test_to_arrow_categorical(): def test_from_arrow_missing_categorical(): pd_cat = pd.Categorical(["a", "b", "c"], categories=["a", "b"]) pa_cat = pa.array(pd_cat, from_pandas=True) - gd_cat = gd.Series(pa_cat) + gd_cat = cudf.Series(pa_cat) - assert 
isinstance(gd_cat, gd.Series) + assert isinstance(gd_cat, cudf.Series) assert_eq( pd.Series(pa_cat.to_pandas()), # PyArrow returns a pd.Categorical gd_cat.to_pandas(), @@ -1636,9 +1641,9 @@ def test_from_arrow_missing_categorical(): def test_to_arrow_missing_categorical(): pd_cat = pd.Categorical(["a", "b", "c"], categories=["a", "b"]) pa_cat = pa.array(pd_cat, from_pandas=True) - gd_cat = gd.Series(pa_cat) + gd_cat = cudf.Series(pa_cat) - assert isinstance(gd_cat, gd.Series) + assert isinstance(gd_cat, cudf.Series) assert pa.Array.equals(pa_cat, gd_cat.to_arrow()) @@ -1651,14 +1656,12 @@ def test_from_scalar_typing(data_type): .astype("datetime64[ms]") ) elif data_type.startswith("datetime64"): - from datetime import date - - scalar = np.datetime64(date.today()).astype("datetime64[ms]") + scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") data_type = "datetime64[ms]" else: scalar = np.dtype(data_type).type(np.random.randint(0, 5)) - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["a"] = [1, 2, 3, 4, 5] gdf["b"] = scalar assert gdf["b"].dtype == np.dtype(data_type) @@ -1671,35 +1674,35 @@ def test_from_python_array(data_type): data = memoryview(np_arr) data = arr.array(data.format, data) - gs = gd.Series(data) + gs = cudf.Series(data) np.testing.assert_equal(gs.to_array(), np_arr) def test_series_shape(): ps = pd.Series([1, 2, 3, 4]) - cs = gd.Series([1, 2, 3, 4]) + cs = cudf.Series([1, 2, 3, 4]) assert ps.shape == cs.shape def test_series_shape_empty(): ps = pd.Series(dtype="float64") - cs = gd.Series([]) + cs = cudf.Series([]) assert ps.shape == cs.shape def test_dataframe_shape(): pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert pdf.shape == gdf.shape def test_dataframe_shape_empty(): pdf = pd.DataFrame() - gdf = gd.DataFrame() + gdf = cudf.DataFrame() assert pdf.shape == gdf.shape @@ -1709,14 +1712,12 @@ def test_dataframe_shape_empty(): 
@pytest.mark.parametrize("dtype", dtypes) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): - pdf = pd.DataFrame() - from string import ascii_lowercase null_rep = np.nan if dtype in ["float32", "float64"] else None for i in range(num_cols): - colname = ascii_lowercase[i] + colname = string.ascii_lowercase[i] data = pd.Series(np.random.randint(0, 26, num_rows).astype(dtype)) if nulls == "some": idx = np.random.choice( @@ -1727,7 +1728,7 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): data[:] = null_rep pdf[colname] = data - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) got_function = gdf.transpose() got_property = gdf.T @@ -1742,15 +1743,14 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): @pytest.mark.parametrize("num_rows", [1, 2, 20]) def test_dataframe_transpose_category(num_cols, num_rows): pdf = pd.DataFrame() - from string import ascii_lowercase for i in range(num_cols): - colname = ascii_lowercase[i] - data = pd.Series(list(ascii_lowercase), dtype="category") + colname = string.ascii_lowercase[i] + data = pd.Series(list(string.ascii_lowercase), dtype="category") data = data.sample(num_rows, replace=True).reset_index(drop=True) pdf[colname] = data - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) got_function = gdf.transpose() got_property = gdf.T @@ -1762,7 +1762,7 @@ def test_dataframe_transpose_category(num_cols, num_rows): def test_generated_column(): - gdf = gd.DataFrame({"a": (i for i in range(5))}) + gdf = cudf.DataFrame({"a": (i for i in range(5))}) assert len(gdf) == 5 @@ -1773,7 +1773,7 @@ def pdf(): @pytest.fixture def gdf(pdf): - return gd.DataFrame.from_pandas(pdf) + return cudf.DataFrame.from_pandas(pdf) @pytest.mark.parametrize( @@ -1813,9 +1813,7 @@ def gdf(pdf): @pytest.mark.parametrize("skipna", [True, False, None]) def test_dataframe_reductions(data, func, skipna): pdf = 
pd.DataFrame(data=data) - print(func(pdf, skipna=skipna)) - gdf = gd.DataFrame.from_pandas(pdf) - print(func(gdf, skipna=skipna)) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(func(pdf, skipna=skipna), func(gdf, skipna=skipna)) @@ -1832,7 +1830,7 @@ def test_dataframe_reductions(data, func, skipna): @pytest.mark.parametrize("func", [lambda df: df.count()]) def test_dataframe_count_reduction(data, func): pdf = pd.DataFrame(data=data) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(func(pdf), func(gdf)) @@ -1852,7 +1850,7 @@ def test_dataframe_count_reduction(data, func): @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 10]) def test_dataframe_min_count_ops(data, ops, skipna, min_count): psr = pd.DataFrame(data) - gsr = gd.DataFrame(data) + gsr = cudf.DataFrame(data) if psr.shape[0] * psr.shape[1] < min_count: pytest.xfail("https://github.com/pandas-dev/pandas/issues/39738") @@ -1952,7 +1950,7 @@ def test_unary_operators(func, pdf, gdf): def test_is_monotonic(gdf): pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[3, 1, 2]) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert not gdf.index.is_monotonic assert not gdf.index.is_monotonic_increasing assert not gdf.index.is_monotonic_decreasing @@ -1965,7 +1963,7 @@ def test_iter(pdf, gdf): def test_iteritems(gdf): for k, v in gdf.iteritems(): assert k in gdf.columns - assert isinstance(v, gd.Series) + assert isinstance(v, cudf.Series) assert_eq(v, gdf[k]) @@ -1977,7 +1975,7 @@ def test_quantile(q, numeric_only): pdf = pd.DataFrame( {"date": ts, "delta": td, "val": np.random.randn(len(ts))} ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf["date"].quantile(q), gdf["date"].quantile(q)) assert_eq(pdf["delta"].quantile(q), gdf["delta"].quantile(q)) @@ -1997,7 +1995,7 @@ def test_quantile(q, numeric_only): def test_empty_quantile(): pdf = pd.DataFrame({"x": []}) - df = gd.DataFrame({"x": []}) + df = 
cudf.DataFrame({"x": []}) actual = df.quantile() expected = pdf.quantile() @@ -2006,16 +2004,16 @@ def test_empty_quantile(): def test_from_pandas_function(pdf): - gdf = gd.from_pandas(pdf) - assert isinstance(gdf, gd.DataFrame) + gdf = cudf.from_pandas(pdf) + assert isinstance(gdf, cudf.DataFrame) assert_eq(pdf, gdf) - gdf = gd.from_pandas(pdf.x) - assert isinstance(gdf, gd.Series) + gdf = cudf.from_pandas(pdf.x) + assert isinstance(gdf, cudf.Series) assert_eq(pdf.x, gdf) with pytest.raises(TypeError): - gd.from_pandas(123) + cudf.from_pandas(123) @pytest.mark.parametrize("preserve_index", [True, False]) @@ -2030,7 +2028,7 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index): assert pa.Table.equals(pdf_arrow_table, gdf_arrow_table) - gdf2 = gd.DataFrame.from_arrow(pdf_arrow_table) + gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) pdf2 = pdf_arrow_table.to_pandas() assert_eq(pdf2, gdf2) @@ -2043,11 +2041,11 @@ def test_series_hash_encode(nrows): # results in enc_with_name_arr and enc_arr to be same. # And there is no other better way to make hash return same value. # So using an integer name to get constant value back from hash. 
- s = gd.Series(data, name=1) + s = cudf.Series(data, name=1) num_features = 1000 encoded_series = s.hash_encode(num_features) - assert isinstance(encoded_series, gd.Series) + assert isinstance(encoded_series, cudf.Series) enc_arr = encoded_series.to_array() assert np.all(enc_arr >= 0) assert np.max(enc_arr) < num_features @@ -2063,10 +2061,10 @@ def test_cuda_array_interface(dtype): cupy_data = cupy.array(np_data) pd_data = pd.Series(np_data) - cudf_data = gd.Series(cupy_data) + cudf_data = cudf.Series(cupy_data) assert_eq(pd_data, cudf_data) - gdf = gd.DataFrame() + gdf = cudf.DataFrame() gdf["test"] = cupy_data pd_data.name = "test" assert_eq(pd_data, gdf["test"]) @@ -2083,7 +2081,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): pa_chunk_array = pa.chunked_array(np_list_data) expect = pd.Series(pa_chunk_array.to_pandas()) - got = gd.Series(pa_chunk_array) + got = cudf.Series(pa_chunk_array) assert_eq(expect, got) @@ -2097,15 +2095,13 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): ) expect = pa_table.to_pandas() - got = gd.DataFrame.from_arrow(pa_table) + got = cudf.DataFrame.from_arrow(pa_table) assert_eq(expect, got) @pytest.mark.skip(reason="Test was designed to be run in isolation") def test_gpu_memory_usage_with_boolmask(): - import cudf - ctx = cuda.current_context() def query_GPU_memory(note=""): @@ -2120,7 +2116,7 @@ def query_GPU_memory(note=""): colNames = ["col" + str(iCol) for iCol in range(nCols)] pandasDF = pd.DataFrame(data=dataNumpy, columns=colNames, dtype=np.float32) cudaDF = cudf.core.DataFrame.from_pandas(pandasDF) - boolmask = gd.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool")) + boolmask = cudf.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool")) memory_used = query_GPU_memory() cudaDF = cudaDF[boolmask] @@ -2163,8 +2159,8 @@ def test_dataframe_boolmask(mask_shape): pdf_mask = pd.DataFrame() for col in mask_shape[1]: pdf_mask[col] = np.random.randint(0, 2, mask_shape[0]) > 0 - gdf = 
gd.DataFrame.from_pandas(pdf) - gdf_mask = gd.DataFrame.from_pandas(pdf_mask) + gdf = cudf.DataFrame.from_pandas(pdf) + gdf_mask = cudf.DataFrame.from_pandas(pdf_mask) gdf = gdf[gdf_mask] pdf = pdf[pdf_mask] @@ -2180,7 +2176,7 @@ def test_dataframe_boolmask(mask_shape): [ [True, False, True], pytest.param( - gd.Series([True, False, True]), + cudf.Series([True, False, True]), marks=pytest.mark.xfail( reason="Pandas can't index a multiindex with a Series" ), @@ -2188,7 +2184,7 @@ def test_dataframe_boolmask(mask_shape): ], ) def test_dataframe_multiindex_boolmask(mask): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( {"w": [3, 2, 1], "x": [1, 2, 3], "y": [0, 1, 0], "z": [1, 1, 1]} ) gdg = gdf.groupby(["w", "x"]).count() @@ -2200,7 +2196,7 @@ def test_dataframe_assignment(): pdf = pd.DataFrame() for col in "abc": pdf[col] = np.array([0, 1, 1, -2, 10]) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) gdf[gdf < 0] = 999 pdf[pdf < 0] = 999 assert_eq(gdf, pdf) @@ -2212,7 +2208,7 @@ def test_1row_arrow_table(): table = pa.Table.from_batches([batch]) expect = table.to_pandas() - got = gd.DataFrame.from_arrow(table) + got = cudf.DataFrame.from_arrow(table) assert_eq(expect, got) @@ -2221,7 +2217,7 @@ def test_arrow_handle_no_index_name(pdf, gdf): pdf_arrow = pa.Table.from_pandas(pdf) assert pa.Table.equals(pdf_arrow, gdf_arrow) - got = gd.DataFrame.from_arrow(gdf_arrow) + got = cudf.DataFrame.from_arrow(gdf_arrow) expect = pdf_arrow.to_pandas() assert_eq(expect, got) @@ -2234,9 +2230,9 @@ def test_arrow_handle_no_index_name(pdf, gdf): def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): data = np.random.randint(0, 100, num_rows).astype(dtype) bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) - s = gd.Series(data) + s = cudf.Series(data) if series_bins: - s_bins = gd.Series(bins) + s_bins = cudf.Series(bins) indices = s.digitize(s_bins, right) else: indices = s.digitize(bins, right) @@ -2246,8 +2242,8 @@ 
def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): def test_series_digitize_invalid_bins(): - s = gd.Series(np.random.randint(0, 30, 80), dtype="int32") - bins = gd.Series([2, None, None, 50, 90], dtype="int32") + s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + bins = cudf.Series([2, None, None, 50, 90], dtype="int32") with pytest.raises( ValueError, match="`bins` cannot contain null entries." @@ -2262,7 +2258,7 @@ def test_pandas_non_contiguious(): for col in df.columns: assert df[col].values.flags["C_CONTIGUOUS"] is False - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) assert_eq(gdf.to_pandas(), df) @@ -2281,7 +2277,7 @@ def test_series_all_null(num_elements, null_type): # Typecast Pandas because None will return `object` dtype expect = pd.Series(data, dtype="float64") - got = gd.Series(data) + got = cudf.Series(data) assert_eq(expect, got) @@ -2289,13 +2285,13 @@ def test_series_all_null(num_elements, null_type): @pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) def test_series_all_valid_nan(num_elements): data = [np.nan] * num_elements - sr = gd.Series(data, nan_as_null=False) + sr = cudf.Series(data, nan_as_null=False) np.testing.assert_equal(sr.null_count, 0) def test_series_rename(): pds = pd.Series([1, 2, 3], name="asdf") - gds = gd.Series([1, 2, 3], name="asdf") + gds = cudf.Series([1, 2, 3], name="asdf") expect = pds.rename("new_name") got = gds.rename("new_name") @@ -2303,12 +2299,12 @@ def test_series_rename(): assert_eq(expect, got) pds = pd.Series(expect) - gds = gd.Series(got) + gds = cudf.Series(got) assert_eq(pds, gds) pds = pd.Series(expect, name="name name") - gds = gd.Series(got, name="name name") + gds = cudf.Series(got, name="name name") assert_eq(pds, gds) @@ -2329,7 +2325,7 @@ def check_frame_series_equality(left, right): check_index_equality(left, right) check_values_equality(left, right) - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "a": np.random.randint(0, 1000, 
nelem).astype(data_type), "b": np.random.randint(0, 1000, nelem).astype(data_type), @@ -2356,9 +2352,9 @@ def check_frame_series_equality(left, right): def test_tail_for_string(): - gdf = gd.DataFrame() - gdf["id"] = gd.Series(["a", "b"], dtype=np.object) - gdf["v"] = gd.Series([1, 2]) + gdf = cudf.DataFrame() + gdf["id"] = cudf.Series(["a", "b"], dtype=np.object) + gdf["v"] = cudf.Series([1, 2]) assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) @@ -2441,7 +2437,7 @@ def test_reset_index_inplace(pdf, gdf, drop): @pytest.mark.parametrize("append", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) def test_set_index(data, index, drop, append, inplace): - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() expected = pdf.set_index(index, inplace=inplace, drop=drop, append=append) @@ -2467,7 +2463,7 @@ def test_set_index(data, index, drop, append, inplace): @pytest.mark.parametrize("verify_integrity", [True]) @pytest.mark.xfail def test_set_index_verify_integrity(data, index, verify_integrity): - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) gdf.set_index(index, verify_integrity=verify_integrity) @@ -2486,7 +2482,7 @@ def test_set_index_multi(drop, nelem): } ) df["e"] = df["d"].astype("category") - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) assert_eq(gdf.set_index("a", drop=drop), gdf.set_index(["a"], drop=drop)) assert_eq( @@ -2507,7 +2503,7 @@ def test_set_index_multi(drop, nelem): def test_dataframe_reindex_0(copy): # TODO (ptaylor): pandas changes `int` dtype to `float64` # when reindexing and filling new label indices with NaN - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={ "a": "category", @@ -2524,7 +2520,7 @@ def test_dataframe_reindex_0(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_1(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": 
"category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2535,7 +2531,7 @@ def test_dataframe_reindex_1(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_2(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2549,7 +2545,7 @@ def test_dataframe_reindex_2(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_3(copy): columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2563,7 +2559,7 @@ def test_dataframe_reindex_3(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_4(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2577,7 +2573,7 @@ def test_dataframe_reindex_4(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_5(copy): columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2591,7 +2587,7 @@ def test_dataframe_reindex_5(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_6(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2605,7 +2601,7 @@ def test_dataframe_reindex_6(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_7(copy): columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = 
gdf.to_pandas() @@ -2619,7 +2615,7 @@ def test_dataframe_reindex_7(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_8(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2633,7 +2629,7 @@ def test_dataframe_reindex_8(copy): @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_9(copy): columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2648,7 +2644,7 @@ def test_dataframe_reindex_9(copy): def test_dataframe_reindex_10(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2668,7 +2664,7 @@ def test_dataframe_reindex_change_dtype(copy): kwargs = {} index = pd.date_range("12/29/2009", periods=10, freq="D") columns = ["a", "b", "c", "d", "e"] - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=6, dtypes={"a": "category", "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2684,7 +2680,7 @@ def test_dataframe_reindex_change_dtype(copy): @pytest.mark.parametrize("copy", [True, False]) def test_series_categorical_reindex(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata(nrows=6, dtypes={"a": "category"}) + gdf = cudf.datasets.randomdata(nrows=6, dtypes={"a": "category"}) pdf = gdf.to_pandas() assert_eq(pdf["a"].reindex(copy=True), gdf["a"].reindex(copy=copy)) assert_eq( @@ -2699,7 +2695,7 @@ def test_series_categorical_reindex(copy): @pytest.mark.parametrize("copy", [True, False]) def test_series_float_reindex(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata(nrows=6, dtypes={"c": float}) + gdf = 
cudf.datasets.randomdata(nrows=6, dtypes={"c": float}) pdf = gdf.to_pandas() assert_eq(pdf["c"].reindex(copy=True), gdf["c"].reindex(copy=copy)) assert_eq( @@ -2714,7 +2710,7 @@ def test_series_float_reindex(copy): @pytest.mark.parametrize("copy", [True, False]) def test_series_string_reindex(copy): index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = gd.datasets.randomdata(nrows=6, dtypes={"d": str}) + gdf = cudf.datasets.randomdata(nrows=6, dtypes={"d": str}) pdf = gdf.to_pandas() assert_eq(pdf["d"].reindex(copy=True), gdf["d"].reindex(copy=copy)) assert_eq( @@ -2743,7 +2739,7 @@ def test_to_frame(pdf, gdf): def test_dataframe_empty_sort_index(): pdf = pd.DataFrame({"x": []}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) expect = pdf.sort_index() got = gdf.sort_index() @@ -2763,7 +2759,7 @@ def test_dataframe_sort_index( {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=[3.0, 1.0, np.nan], ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) expected = pdf.sort_index( axis=axis, @@ -2819,7 +2815,7 @@ def test_dataframe_mulitindex_sort_index( "d": [1, 2, 8], } ).set_index(["b", "a", 1]) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) # ignore_index is supported in v.1.0 expected = pdf.sort_index( @@ -2857,7 +2853,7 @@ def test_dataframe_0_row_dtype(dtype): else: data = np.array([1, 2, 3, 4, 5], dtype=dtype) - expect = gd.DataFrame() + expect = cudf.DataFrame() expect["x"] = data expect["y"] = data got = expect.head(0) @@ -2865,7 +2861,7 @@ def test_dataframe_0_row_dtype(dtype): for col_name in got.columns: assert expect[col_name].dtype == got[col_name].dtype - expect = gd.Series(data) + expect = cudf.Series(data) got = expect.head(0) assert expect.dtype == got.dtype @@ -2876,7 +2872,7 @@ def test_series_list_nanasnull(nan_as_null): data = [1.0, 2.0, 3.0, np.nan, None] expect = pa.array(data, from_pandas=nan_as_null) - got = gd.Series(data, nan_as_null=nan_as_null).to_arrow() + 
got = cudf.Series(data, nan_as_null=nan_as_null).to_arrow() # Bug in Arrow 0.14.1 where NaNs aren't handled expect = expect.cast("int64", safe=False) @@ -2886,7 +2882,7 @@ def test_series_list_nanasnull(nan_as_null): def test_column_assignment(): - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=20, dtypes={"a": "category", "b": int, "c": float} ) new_cols = ["q", "r", "s"] @@ -2895,7 +2891,7 @@ def test_column_assignment(): def test_select_dtype(): - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=20, dtypes={"a": "category", "b": int, "c": float, "d": str} ) pdf = gdf.to_pandas() @@ -2953,7 +2949,9 @@ def test_select_dtype(): ), ) - gdf = gd.DataFrame({"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]}) + gdf = cudf.DataFrame( + {"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]} + ) pdf = gdf.to_pandas() assert_eq( pdf.select_dtypes(include=["object", "int", "category"]), @@ -2964,7 +2962,7 @@ def test_select_dtype(): gdf.select_dtypes(include=["object"], exclude=["category"]), ) - gdf = gd.DataFrame({"a": range(10), "b": range(10, 20)}) + gdf = cudf.DataFrame({"a": range(10), "b": range(10, 20)}) pdf = gdf.to_pandas() assert_eq( pdf.select_dtypes(include=["category"]), @@ -2998,8 +2996,8 @@ def test_select_dtype(): lfunc=pdf.select_dtypes, rfunc=gdf.select_dtypes, ) - gdf = gd.DataFrame( - {"a": gd.Series([], dtype="int"), "b": gd.Series([], dtype="str")} + gdf = cudf.DataFrame( + {"a": cudf.Series([], dtype="int"), "b": cudf.Series([], dtype="str")} ) pdf = gdf.to_pandas() assert_eq( @@ -3013,7 +3011,7 @@ def test_select_dtype(): def test_select_dtype_datetime(): - gdf = gd.datasets.timeseries( + gdf = cudf.datasets.timeseries( start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} ) gdf = gdf.reset_index() @@ -3031,7 +3029,7 @@ def test_select_dtype_datetime(): def test_select_dtype_datetime_with_frequency(): - gdf = gd.datasets.timeseries( + gdf = cudf.datasets.timeseries( start="2000-01-01", 
end="2000-01-02", freq="3600s", dtypes={"x": int} ) gdf = gdf.reset_index() @@ -3046,7 +3044,7 @@ def test_select_dtype_datetime_with_frequency(): def test_array_ufunc(): - gdf = gd.DataFrame({"x": [2, 3, 4.0], "y": [9.0, 2.5, 1.1]}) + gdf = cudf.DataFrame({"x": [2, 3, 4.0], "y": [9.0, 2.5, 1.1]}) pdf = gdf.to_pandas() assert_eq(np.sqrt(gdf), np.sqrt(pdf)) @@ -3056,7 +3054,7 @@ def test_array_ufunc(): @pytest.mark.parametrize("nan_value", [-5, -5.0, 0, 5, 5.0, None, "pandas"]) def test_series_to_gpu_array(nan_value): - s = gd.Series([0, 1, None, 3]) + s = cudf.Series([0, 1, None, 3]) np.testing.assert_array_equal( s.to_array(nan_value), s.to_gpu_array(nan_value).copy_to_host() ) @@ -3066,7 +3064,7 @@ def test_dataframe_describe_exclude(): np.random.seed(12) data_length = 10000 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) @@ -3081,7 +3079,7 @@ def test_dataframe_describe_include(): np.random.seed(12) data_length = 10000 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) @@ -3096,7 +3094,7 @@ def test_dataframe_describe_default(): np.random.seed(12) data_length = 10000 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() @@ -3110,7 +3108,7 @@ def test_series_describe_include_all(): np.random.seed(12) data_length = 10000 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) @@ -3134,7 +3132,7 @@ def test_dataframe_describe_percentiles(): data_length = 10000 sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.random.normal(10, 1, data_length) 
df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() @@ -3148,7 +3146,7 @@ def test_get_numeric_data(): pdf = pd.DataFrame( {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]} ) - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) assert_eq(pdf._get_numeric_data(), gdf._get_numeric_data()) @@ -3167,7 +3165,7 @@ def test_shift(dtype, period, data_empty): else: data = gen_rand(dtype, 100000) - gdf = gd.DataFrame({"a": gd.Series(data, dtype=dtype)}) + gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) shifted_outcome = gdf.a.shift(period).fillna(0) @@ -3192,7 +3190,7 @@ def test_diff(dtype, period, data_empty): else: data = gen_rand(dtype, 100000) - gdf = gd.DataFrame({"a": gd.Series(data, dtype=dtype)}) + gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) expected_outcome = pdf.a.diff(period) @@ -3208,7 +3206,7 @@ def test_diff(dtype, period, data_empty): @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_isnull_isna(df, nan_as_null): - gdf = gd.DataFrame.from_pandas(df, nan_as_null=nan_as_null) + gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.isnull(), gdf.isnull()) assert_eq(df.isna(), gdf.isna()) @@ -3223,7 +3221,7 @@ def test_dataframe_isnull_isna(df, nan_as_null): @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_notna_notnull(df, nan_as_null): - gdf = gd.DataFrame.from_pandas(df, nan_as_null=nan_as_null) + gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.notnull(), gdf.notnull()) assert_eq(df.notna(), gdf.notna()) @@ -3236,12 +3234,12 @@ def test_dataframe_notna_notnull(df, nan_as_null): def test_ndim(): pdf = pd.DataFrame({"x": range(5), "y": range(5, 10)}) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert pdf.ndim == gdf.ndim assert pdf.x.ndim == gdf.x.ndim 
s = pd.Series(dtype="float64") - gs = gd.Series() + gs = cudf.Series() assert s.ndim == gs.ndim @@ -3252,7 +3250,7 @@ def test_ndim(): 0, 5, pd.Series([1, 4, 3, -6], index=["w", "x", "y", "z"]), - gd.Series([-4, -2, 12], index=["x", "y", "z"]), + cudf.Series([-4, -2, 12], index=["x", "y", "z"]), {"w": -1, "x": 15, "y": 2}, ], ) @@ -3278,9 +3276,9 @@ def test_dataframe_round(decimals): "z": np.repeat([-0.6459412758761901], 10), } ) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) - if isinstance(decimals, gd.Series): + if isinstance(decimals, cudf.Series): pdecimals = decimals.to_pandas() else: pdecimals = decimals @@ -3333,13 +3331,13 @@ def test_all(data): # Pandas treats `None` in object type columns as True for some reason, so # replacing with `False` if np.array(data).ndim <= 1: - pdata = pd.Series( - data, dtype=None if len(data) else "float64" - ).replace([None], False) - gdata = gd.Series.from_pandas(pdata) + pdata = cudf.utils.utils.create_pandas_series(data=data).replace( + [None], False + ) + gdata = cudf.Series.from_pandas(pdata) else: pdata = pd.DataFrame(data, columns=["a", "b"]).replace([None], False) - gdata = gd.DataFrame.from_pandas(pdata) + gdata = cudf.DataFrame.from_pandas(pdata) # test bool_only if pdata["b"].dtype == "bool": @@ -3388,8 +3386,8 @@ def test_all(data): @pytest.mark.parametrize("axis", [0, 1]) def test_any(data, axis): if np.array(data).ndim <= 1: - pdata = pd.Series(data, dtype=None if len(data) else "float64") - gdata = gd.Series.from_pandas(pdata) + pdata = cudf.utils.utils.create_pandas_series(data=data) + gdata = cudf.Series.from_pandas(pdata) if axis == 1: with pytest.raises(NotImplementedError): @@ -3400,7 +3398,7 @@ def test_any(data, axis): assert_eq(got, expected) else: pdata = pd.DataFrame(data, columns=["a", "b"]) - gdata = gd.DataFrame.from_pandas(pdata) + gdata = cudf.DataFrame.from_pandas(pdata) # test bool_only if pdata["b"].dtype == "bool": @@ -3421,7 +3419,7 @@ def test_any(data, 
axis): @pytest.mark.parametrize("axis", [0, 1]) def test_empty_dataframe_any(axis): pdf = pd.DataFrame({}, columns=["a", "b"]) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) got = gdf.any(axis=axis) expected = pdf.any(axis=axis) assert_eq(got, expected, check_index_type=False) @@ -3432,7 +3430,7 @@ def test_dataframe_sizeof(indexed): rows = int(1e6) index = list(i for i in range(rows)) if indexed else None - gdf = gd.DataFrame({"A": [8] * rows, "B": [32] * rows}, index=index) + gdf = cudf.DataFrame({"A": [8] * rows, "B": [32] * rows}, index=index) for c in gdf._data.columns: assert gdf._index.__sizeof__() == gdf._index.__sizeof__() @@ -3449,19 +3447,19 @@ def test_dataframe_sizeof(indexed): @pytest.mark.parametrize("non_list_data", [123, "abc", "zyx", "rapids", 0.8]) def test_create_dataframe_cols_empty_data(a, b, misc_data, non_list_data): expected = pd.DataFrame({"a": a}) - actual = gd.DataFrame.from_pandas(expected) + actual = cudf.DataFrame.from_pandas(expected) expected["b"] = b actual["b"] = b assert_eq(actual, expected) expected = pd.DataFrame({"a": []}) - actual = gd.DataFrame.from_pandas(expected) + actual = cudf.DataFrame.from_pandas(expected) expected["b"] = misc_data actual["b"] = misc_data assert_eq(actual, expected) expected = pd.DataFrame({"a": a}) - actual = gd.DataFrame.from_pandas(expected) + actual = cudf.DataFrame.from_pandas(expected) expected["b"] = non_list_data actual["b"] = non_list_data assert_eq(actual, expected) @@ -3469,7 +3467,7 @@ def test_create_dataframe_cols_empty_data(a, b, misc_data, non_list_data): def test_empty_dataframe_describe(): pdf = pd.DataFrame({"a": [], "b": []}) - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expected = pdf.describe() actual = gdf.describe() @@ -3478,75 +3476,77 @@ def test_empty_dataframe_describe(): def test_as_column_types(): - from cudf.core.column import column - - col = column.as_column(gd.Series([])) + col = column.as_column(cudf.Series([])) 
assert_eq(col.dtype, np.dtype("float64")) - gds = gd.Series(col) + gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) - col = column.as_column(gd.Series([]), dtype="float32") + col = column.as_column(cudf.Series([]), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) - gds = gd.Series(col) + gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) - col = column.as_column(gd.Series([]), dtype="str") + col = column.as_column(cudf.Series([]), dtype="str") assert_eq(col.dtype, np.dtype("object")) - gds = gd.Series(col) + gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) - col = column.as_column(gd.Series([]), dtype="object") + col = column.as_column(cudf.Series([]), dtype="object") assert_eq(col.dtype, np.dtype("object")) - gds = gd.Series(col) + gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="object")) assert_eq(pds, gds) pds = pd.Series(np.array([1, 2, 3]), dtype="float32") - gds = gd.Series(column.as_column(np.array([1, 2, 3]), dtype="float32")) + gds = cudf.Series(column.as_column(np.array([1, 2, 3]), dtype="float32")) assert_eq(pds, gds) pds = pd.Series([1, 2, 3], dtype="float32") - gds = gd.Series([1, 2, 3], dtype="float32") + gds = cudf.Series([1, 2, 3], dtype="float32") assert_eq(pds, gds) pds = pd.Series([], dtype="float64") - gds = gd.Series(column.as_column(pds)) + gds = cudf.Series(column.as_column(pds)) assert_eq(pds, gds) pds = pd.Series([1, 2, 4], dtype="int64") - gds = gd.Series(column.as_column(gd.Series([1, 2, 4]), dtype="int64")) + gds = cudf.Series(column.as_column(cudf.Series([1, 2, 4]), dtype="int64")) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = gd.Series( - column.as_column(gd.Series([1.2, 18.0, 9.0]), dtype="float32") + gds = cudf.Series( + column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32") ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = 
gd.Series(column.as_column(gd.Series([1.2, 18.0, 9.0]), dtype="str")) + gds = cudf.Series( + column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") + ) assert_eq(pds, gds) pds = pd.Series(pd.Index(["1", "18", "9"]), dtype="int") - gds = gd.Series(gd.core.index.StringIndex(["1", "18", "9"]), dtype="int") + gds = cudf.Series( + cudf.core.index.StringIndex(["1", "18", "9"]), dtype="int" + ) assert_eq(pds, gds) def test_one_row_head(): - gdf = gd.DataFrame({"name": ["carl"], "score": [100]}, index=[123]) + gdf = cudf.DataFrame({"name": ["carl"], "score": [100]}, index=[123]) pdf = gdf.to_pandas() head_gdf = gdf.head() @@ -3559,7 +3559,7 @@ def test_one_row_head(): @pytest.mark.parametrize("as_dtype", NUMERIC_TYPES) def test_series_astype_numeric_to_numeric(dtype, as_dtype): psr = pd.Series([1, 2, 4, 3], dtype=dtype) - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @@ -3567,9 +3567,9 @@ def test_series_astype_numeric_to_numeric(dtype, as_dtype): @pytest.mark.parametrize("as_dtype", NUMERIC_TYPES) def test_series_astype_numeric_to_numeric_nulls(dtype, as_dtype): data = [1, 2, None, 3] - sr = gd.Series(data, dtype=dtype) + sr = cudf.Series(data, dtype=dtype) got = sr.astype(as_dtype) - expect = gd.Series([1, 2, None, 3], dtype=as_dtype) + expect = cudf.Series([1, 2, None, 3], dtype=as_dtype) assert_eq(expect, got) @@ -3587,7 +3587,7 @@ def test_series_astype_numeric_to_numeric_nulls(dtype, as_dtype): ) def test_series_astype_numeric_to_other(dtype, as_dtype): psr = pd.Series([1, 2, 3], dtype=dtype) - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @@ -3611,7 +3611,7 @@ def test_series_astype_string_to_other(as_dtype): else: data = ["1", "2", "3"] psr = pd.Series(data) - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @@ -3628,7 +3628,7 @@ def 
test_series_astype_string_to_other(as_dtype): def test_series_astype_datetime_to_other(as_dtype): data = ["2001-01-01", "2002-02-02", "2001-01-05"] psr = pd.Series(data) - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @@ -3644,7 +3644,7 @@ def test_series_astype_datetime_to_other(as_dtype): def test_series_astype_datetime_to_string(inp): dtype, expect = inp base_date = "2011-01-01" - sr = gd.Series([base_date], dtype=dtype) + sr = cudf.Series([base_date], dtype=dtype) got = sr.astype(str)[0] assert expect == got @@ -3669,19 +3669,19 @@ def test_series_astype_categorical_to_other(as_dtype): else: data = [1, 2, 3, 1] psr = pd.Series(data, dtype="category") - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) @pytest.mark.parametrize("ordered", [True, False]) def test_series_astype_to_categorical_ordered(ordered): psr = pd.Series([1, 2, 3, 1], dtype="category") - gsr = gd.from_pandas(psr) + gsr = cudf.from_pandas(psr) ordered_dtype_pd = pd.CategoricalDtype( categories=[1, 2, 3], ordered=ordered ) - ordered_dtype_gd = gd.CategoricalDtype.from_pandas(ordered_dtype_pd) + ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) assert_eq( psr.astype("int32").astype(ordered_dtype_pd).astype("int32"), gsr.astype("int32").astype(ordered_dtype_gd).astype("int32"), @@ -3694,11 +3694,11 @@ def test_series_astype_cat_ordered_to_unordered(ordered): pd_to_dtype = pd.CategoricalDtype( categories=[1, 2, 3], ordered=not ordered ) - gd_dtype = gd.CategoricalDtype.from_pandas(pd_dtype) - gd_to_dtype = gd.CategoricalDtype.from_pandas(pd_to_dtype) + gd_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) + gd_to_dtype = cudf.CategoricalDtype.from_pandas(pd_to_dtype) psr = pd.Series([1, 2, 3], dtype=pd_dtype) - gsr = gd.Series([1, 2, 3], dtype=gd_dtype) + gsr = cudf.Series([1, 2, 3], dtype=gd_dtype) expect = psr.astype(pd_to_dtype) got = 
gsr.astype(gd_to_dtype) @@ -3710,62 +3710,63 @@ def test_series_astype_null_cases(): data = [1, 2, None, 3] # numerical to other - assert_eq(gd.Series(data, dtype="str"), gd.Series(data).astype("str")) + assert_eq(cudf.Series(data, dtype="str"), cudf.Series(data).astype("str")) assert_eq( - gd.Series(data, dtype="category"), gd.Series(data).astype("category") + cudf.Series(data, dtype="category"), + cudf.Series(data).astype("category"), ) assert_eq( - gd.Series(data, dtype="float32"), - gd.Series(data, dtype="int32").astype("float32"), + cudf.Series(data, dtype="float32"), + cudf.Series(data, dtype="int32").astype("float32"), ) assert_eq( - gd.Series(data, dtype="float32"), - gd.Series(data, dtype="uint32").astype("float32"), + cudf.Series(data, dtype="float32"), + cudf.Series(data, dtype="uint32").astype("float32"), ) assert_eq( - gd.Series(data, dtype="datetime64[ms]"), - gd.Series(data).astype("datetime64[ms]"), + cudf.Series(data, dtype="datetime64[ms]"), + cudf.Series(data).astype("datetime64[ms]"), ) # categorical to other assert_eq( - gd.Series(data, dtype="str"), - gd.Series(data, dtype="category").astype("str"), + cudf.Series(data, dtype="str"), + cudf.Series(data, dtype="category").astype("str"), ) assert_eq( - gd.Series(data, dtype="float32"), - gd.Series(data, dtype="category").astype("float32"), + cudf.Series(data, dtype="float32"), + cudf.Series(data, dtype="category").astype("float32"), ) assert_eq( - gd.Series(data, dtype="datetime64[ms]"), - gd.Series(data, dtype="category").astype("datetime64[ms]"), + cudf.Series(data, dtype="datetime64[ms]"), + cudf.Series(data, dtype="category").astype("datetime64[ms]"), ) # string to other assert_eq( - gd.Series([1, 2, None, 3], dtype="int32"), - gd.Series(["1", "2", None, "3"]).astype("int32"), + cudf.Series([1, 2, None, 3], dtype="int32"), + cudf.Series(["1", "2", None, "3"]).astype("int32"), ) assert_eq( - gd.Series( + cudf.Series( ["2001-01-01", "2001-02-01", None, "2001-03-01"], dtype="datetime64[ms]", ), 
- gd.Series(["2001-01-01", "2001-02-01", None, "2001-03-01"]).astype( + cudf.Series(["2001-01-01", "2001-02-01", None, "2001-03-01"]).astype( "datetime64[ms]" ), ) assert_eq( - gd.Series(["a", "b", "c", None], dtype="category").to_pandas(), - gd.Series(["a", "b", "c", None]).astype("category").to_pandas(), + cudf.Series(["a", "b", "c", None], dtype="category").to_pandas(), + cudf.Series(["a", "b", "c", None]).astype("category").to_pandas(), ) # datetime to other @@ -3776,20 +3777,21 @@ def test_series_astype_null_cases(): "2001-03-01 00:00:00.000000", ] assert_eq( - gd.Series(data), gd.Series(data, dtype="datetime64[us]").astype("str"), + cudf.Series(data), + cudf.Series(data, dtype="datetime64[us]").astype("str"), ) assert_eq( pd.Series(data, dtype="datetime64[ns]").astype("category"), - gd.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( + cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( "category" ), ) def test_series_astype_null_categorical(): - sr = gd.Series([None, None, None], dtype="category") - expect = gd.Series([None, None, None], dtype="int32") + sr = cudf.Series([None, None, None], dtype="category") + expect = cudf.Series([None, None, None], dtype="int32") got = sr.astype("int32") assert_eq(expect, got) @@ -3813,19 +3815,19 @@ def test_series_astype_null_categorical(): ) def test_create_dataframe_from_list_like(data): pdf = pd.DataFrame(data, index=["count", "mean", "std", "min"]) - gdf = gd.DataFrame(data, index=["count", "mean", "std", "min"]) + gdf = cudf.DataFrame(data, index=["count", "mean", "std", "min"]) assert_eq(pdf, gdf) pdf = pd.DataFrame(data) - gdf = gd.DataFrame(data) + gdf = cudf.DataFrame(data) assert_eq(pdf, gdf) def test_create_dataframe_column(): pdf = pd.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) - gdf = gd.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) + gdf = cudf.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) assert_eq(pdf, gdf) @@ -3834,7 +3836,7 @@ def 
test_create_dataframe_column(): columns=["a", "b", "c"], index=["A", "Z", "X"], ) - gdf = gd.DataFrame( + gdf = cudf.DataFrame( {"a": [1, 2, 3], "b": [2, 3, 5]}, columns=["a", "b", "c"], index=["A", "Z", "X"], @@ -3854,8 +3856,8 @@ def test_create_dataframe_column(): ], ) def test_series_values_host_property(data): - pds = pd.Series(data, dtype=None if len(data) else "float64") - gds = gd.Series(data) + pds = cudf.utils.utils.create_pandas_series(data=data) + gds = cudf.Series(data) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -3877,8 +3879,8 @@ def test_series_values_host_property(data): ], ) def test_series_values_property(data): - pds = pd.Series(data, dtype=None if len(data) else "float64") - gds = gd.Series(data) + pds = cudf.utils.utils.create_pandas_series(data=data) + gds = cudf.Series(data) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) np.testing.assert_array_equal(gds_vals.get(), pds.values) @@ -3923,7 +3925,7 @@ def test_series_values_property(data): ) def test_df_values_property(data): pdf = pd.DataFrame.from_dict(data) - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) pmtr = pdf.values gmtr = gdf.values.get() @@ -3939,7 +3941,7 @@ def test_value_counts(): } ) - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "numeric": [1, 2, 3, 4, 5, 6, 1, 2, 4] * 10, "alpha": ["u", "h", "d", "a", "m", "u", "h", "d", "a"] * 10, @@ -3985,8 +3987,8 @@ def test_value_counts(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = pd.Series(data, index=index, dtype=None if len(data) else "float64") - gsr = gd.Series.from_pandas(psr, nan_as_null=False) + psr = cudf.utils.utils.create_pandas_series(data=data, index=index) + gsr = cudf.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) got = gsr.isin(values) @@ -4039,8 +4041,8 @@ def test_isin_numeric(data, values): ], ) def test_isin_datetime(data, values): - psr = pd.Series(data, dtype=None if len(data) 
else "datetime64[ns]") - gsr = gd.Series.from_pandas(psr) + psr = cudf.utils.utils.create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) expected = psr.isin(values) @@ -4068,8 +4070,8 @@ def test_isin_datetime(data, values): ], ) def test_isin_string(data, values): - psr = pd.Series(data, dtype=None if len(data) else "float64") - gsr = gd.Series.from_pandas(psr) + psr = cudf.utils.utils.create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) expected = psr.isin(values) @@ -4097,8 +4099,8 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = pd.Series(data, dtype=None if len(data) else "float64") - gsr = gd.Series.from_pandas(psr) + psr = cudf.utils.utils.create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) expected = psr.isin(values) @@ -4131,8 +4133,8 @@ def test_isin_categorical(data, values): ], ) def test_isin_index(data, values): - psr = pd.Series(data, dtype=None if len(data) else "float64") - gsr = gd.Series.from_pandas(psr) + psr = cudf.utils.utils.create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) got = gsr.index.isin(values) expected = psr.index.isin(values) @@ -4194,12 +4196,12 @@ def test_isin_index(data, values): ) def test_isin_multiindex(data, values, level, err): pmdx = data - gmdx = gd.from_pandas(data) + gmdx = cudf.from_pandas(data) if err is None: expected = pmdx.isin(values, level=level) if isinstance(values, pd.MultiIndex): - values = gd.from_pandas(values) + values = cudf.from_pandas(values) got = gmdx.isin(values, level=level) assert_eq(got, expected) @@ -4273,12 +4275,10 @@ def test_isin_multiindex(data, values, level, err): ], ) def test_isin_dataframe(data, values): - from cudf.utils.dtypes import is_scalar - pdf = data - gdf = gd.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) - if is_scalar(values): + if cudf.utils.dtypes.is_scalar(values): assert_exceptions_equal( 
lfunc=pdf.isin, rfunc=gdf.isin, @@ -4289,14 +4289,14 @@ def test_isin_dataframe(data, values): expected = pdf.isin(values) if isinstance(values, (pd.DataFrame, pd.Series)): - values = gd.from_pandas(values) + values = cudf.from_pandas(values) got = gdf.isin(values) assert_eq(got, expected) def test_constructor_properties(): - df = gd.DataFrame() + df = cudf.DataFrame() key1 = "a" key2 = "b" val1 = np.array([123], dtype=np.float64) @@ -4307,16 +4307,16 @@ def test_constructor_properties(): # Correct use of _constructor (for DataFrame) assert_eq(df, df._constructor({key1: val1, key2: val2})) - # Correct use of _constructor (for gd.Series) + # Correct use of _constructor (for cudf.Series) assert_eq(df[key1], df[key2]._constructor(val1, name=key1)) # Correct use of _constructor_sliced (for DataFrame) assert_eq(df[key1], df._constructor_sliced(val1, name=key1)) - # Correct use of _constructor_expanddim (for gd.Series) + # Correct use of _constructor_expanddim (for cudf.Series) assert_eq(df, df[key2]._constructor_expanddim({key1: val1, key2: val2})) - # Incorrect use of _constructor_sliced (Raises for gd.Series) + # Incorrect use of _constructor_sliced (Raises for cudf.Series) with pytest.raises(NotImplementedError): df[key1]._constructor_sliced @@ -4335,14 +4335,14 @@ def test_df_astype_numeric_to_all(dtype, as_dtype): elif "float" in dtype: data = [1.0, 2.0, None, 4.0, np.nan, -7.0] - gdf = gd.DataFrame() + gdf = cudf.DataFrame() - gdf["foo"] = gd.Series(data, dtype=dtype) - gdf["bar"] = gd.Series(data, dtype=dtype) + gdf["foo"] = cudf.Series(data, dtype=dtype) + gdf["bar"] = cudf.Series(data, dtype=dtype) - insert_data = gd.Series(data, dtype=dtype) + insert_data = cudf.Series(data, dtype=dtype) - expect = gd.DataFrame() + expect = cudf.DataFrame() expect["foo"] = insert_data.astype(as_dtype) expect["bar"] = insert_data.astype(as_dtype) @@ -4375,11 +4375,11 @@ def test_df_astype_string_to_other(as_dtype): elif "float" in as_dtype: data = [1.0, 2.0, 3.0, np.nan] - 
insert_data = gd.Series.from_pandas(pd.Series(data, dtype="str")) - expect_data = gd.Series(data, dtype=as_dtype) + insert_data = cudf.Series.from_pandas(pd.Series(data, dtype="str")) + expect_data = cudf.Series(data, dtype=as_dtype) - gdf = gd.DataFrame() - expect = gd.DataFrame() + gdf = cudf.DataFrame() + expect = cudf.DataFrame() gdf["foo"] = insert_data gdf["bar"] = insert_data @@ -4410,28 +4410,28 @@ def test_df_astype_datetime_to_other(as_dtype): None, ] - gdf = gd.DataFrame() - expect = gd.DataFrame() + gdf = cudf.DataFrame() + expect = cudf.DataFrame() - gdf["foo"] = gd.Series(data, dtype="datetime64[ms]") - gdf["bar"] = gd.Series(data, dtype="datetime64[ms]") + gdf["foo"] = cudf.Series(data, dtype="datetime64[ms]") + gdf["bar"] = cudf.Series(data, dtype="datetime64[ms]") if as_dtype == "int64": - expect["foo"] = gd.Series( + expect["foo"] = cudf.Series( [690595200000, 1102118400000, 1473724800000, None], dtype="int64" ) - expect["bar"] = gd.Series( + expect["bar"] = cudf.Series( [690595200000, 1102118400000, 1473724800000, None], dtype="int64" ) elif as_dtype == "str": - expect["foo"] = gd.Series(data, dtype="str") - expect["bar"] = gd.Series(data, dtype="str") + expect["foo"] = cudf.Series(data, dtype="str") + expect["bar"] = cudf.Series(data, dtype="str") elif as_dtype == "category": - expect["foo"] = gd.Series(gdf["foo"], dtype="category") - expect["bar"] = gd.Series(gdf["bar"], dtype="category") + expect["foo"] = cudf.Series(gdf["foo"], dtype="category") + expect["bar"] = cudf.Series(gdf["bar"], dtype="category") else: - expect["foo"] = gd.Series(data, dtype=as_dtype) - expect["bar"] = gd.Series(data, dtype=as_dtype) + expect["foo"] = cudf.Series(data, dtype=as_dtype) + expect["bar"] = cudf.Series(data, dtype=as_dtype) got = gdf.astype(as_dtype) @@ -4460,7 +4460,7 @@ def test_df_astype_categorical_to_other(as_dtype): pdf = pd.DataFrame() pdf["foo"] = psr pdf["bar"] = psr - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) 
assert_eq(pdf.astype(as_dtype), gdf.astype(as_dtype)) @@ -4470,12 +4470,12 @@ def test_df_astype_to_categorical_ordered(ordered): pdf = pd.DataFrame() pdf["foo"] = psr pdf["bar"] = psr - gdf = gd.DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) ordered_dtype_pd = pd.CategoricalDtype( categories=[1, 2, 3], ordered=ordered ) - ordered_dtype_gd = gd.CategoricalDtype.from_pandas(ordered_dtype_pd) + ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) assert_eq( pdf.astype(ordered_dtype_pd).astype("int32"), @@ -4489,7 +4489,7 @@ def test_df_astype_to_categorical_ordered(ordered): + [("category", {"ordered": True}), ("category", {"ordered": False})], ) def test_empty_df_astype(dtype, args): - df = gd.DataFrame() + df = cudf.DataFrame() kwargs = {} kwargs.update(args) assert_eq(df, df.astype(dtype=dtype, **kwargs)) @@ -4509,7 +4509,7 @@ def test_empty_df_astype(dtype, args): ], ) def test_series_astype_error_handling(errors): - sr = gd.Series(["random", "words"]) + sr = cudf.Series(["random", "words"]) got = sr.astype("datetime64", errors=errors) assert_eq(sr, got) @@ -4527,12 +4527,12 @@ def test_df_constructor_dtype(dtype): else: data = [1, 2, 3, None] - sr = gd.Series(data, dtype=dtype) + sr = cudf.Series(data, dtype=dtype) - expect = gd.DataFrame() + expect = cudf.DataFrame() expect["foo"] = sr expect["bar"] = sr - got = gd.DataFrame({"foo": data, "bar": data}, dtype=dtype) + got = cudf.DataFrame({"foo": data, "bar": data}, dtype=dtype) assert_eq(expect, got) @@ -4540,31 +4540,31 @@ def test_df_constructor_dtype(dtype): @pytest.mark.parametrize( "data", [ - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": int} ), - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": str} ), - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10, dtypes={"a": bool, "b": int, "c": float, "d": str} ), - gd.DataFrame(), 
- gd.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), - gd.DataFrame( + cudf.DataFrame(), + cudf.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), + cudf.DataFrame( { "a": [1, 2, 3, 4], "b": [7, np.NaN, 9, 10], "c": [np.NaN, np.NaN, np.NaN, np.NaN], - "d": gd.Series([None, None, None, None], dtype="int64"), + "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], - "f": gd.Series([10, None, np.NaN, 11], nan_as_null=False), + "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), } ), - gd.DataFrame( + cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], - "b": gd.Series( + "b": cudf.Series( [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False ), } @@ -4593,18 +4593,18 @@ def test_rowwise_ops(data, op, skipna): "op", ["max", "min", "sum", "product", "mean", "var", "std"] ) def test_rowwise_ops_nullable_dtypes_all_null(op): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "a": [1, 2, 3, 4], "b": [7, np.NaN, 9, 10], "c": [np.NaN, np.NaN, np.NaN, np.NaN], - "d": gd.Series([None, None, None, None], dtype="int64"), + "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], - "f": gd.Series([10, None, np.NaN, 11], nan_as_null=False), + "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), } ) - expected = gd.Series([None, None, None, None], dtype="float64") + expected = cudf.Series([None, None, None, None], dtype="float64") if op in ("var", "std"): got = getattr(gdf, op)(axis=1, ddof=0, skipna=False) @@ -4620,7 +4620,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): [ ( "max", - gd.Series( + cudf.Series( [10.0, None, np.NaN, 2234.0, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4628,7 +4628,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "min", - gd.Series( + cudf.Series( [10.0, None, np.NaN, 13.0, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4636,7 +4636,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "sum", - gd.Series( + cudf.Series( [20.0, None, 
np.NaN, 2247.0, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4644,7 +4644,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "product", - gd.Series( + cudf.Series( [100.0, None, np.NaN, 29042.0, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4652,7 +4652,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "mean", - gd.Series( + cudf.Series( [10.0, None, np.NaN, 1123.5, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4660,7 +4660,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "var", - gd.Series( + cudf.Series( [0.0, None, np.NaN, 1233210.25, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4668,7 +4668,7 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ), ( "std", - gd.Series( + cudf.Series( [0.0, None, np.NaN, 1110.5, None, np.NaN], dtype="float64", nan_as_null=False, @@ -4677,10 +4677,10 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): ], ) def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], - "b": gd.Series( + "b": cudf.Series( [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False, ), } @@ -4698,38 +4698,44 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): @pytest.mark.parametrize( "op,expected", [ - ("max", gd.Series([10, None, None, 2234, None, 453], dtype="int64",),), - ("min", gd.Series([10, None, None, 13, None, 15], dtype="int64",),), - ("sum", gd.Series([20, None, None, 2247, None, 468], dtype="int64",),), + ( + "max", + cudf.Series([10, None, None, 2234, None, 453], dtype="int64",), + ), + ("min", cudf.Series([10, None, None, 13, None, 15], dtype="int64",),), + ( + "sum", + cudf.Series([20, None, None, 2247, None, 468], dtype="int64",), + ), ( "product", - gd.Series([100, None, None, 29042, None, 6795], dtype="int64",), + cudf.Series([100, None, None, 29042, None, 6795], dtype="int64",), ), ( "mean", - gd.Series( + cudf.Series( [10.0, None, None, 1123.5, None, 234.0], 
dtype="float32", ), ), ( "var", - gd.Series( + cudf.Series( [0.0, None, None, 1233210.25, None, 47961.0], dtype="float32", ), ), ( "std", - gd.Series( + cudf.Series( [0.0, None, None, 1110.5, None, 219.0], dtype="float32", ), ), ], ) def test_rowwise_ops_nullable_int_dtypes(op, expected): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "a": [10, 11, None, 13, None, 15], - "b": gd.Series( + "b": cudf.Series( [10, None, 323, 2234, None, 453], nan_as_null=False, ), } @@ -4748,62 +4754,62 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): "data", [ { - "t1": gd.Series( + "t1": cudf.Series( ["2020-08-01 09:00:00", "1920-05-01 10:30:00"], dtype=" 0: if nulls == "some": @@ -429,14 +429,14 @@ def test_datetime_unique(data, nulls): @pytest.mark.parametrize( "data", [ - [], + pd.Series([], dtype="datetime64[ns]"), pd.Series(pd.date_range("2010-01-01", "2010-02-01")), pd.Series([None, None], dtype="datetime64[ns]"), ], ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_nunique(data, nulls): - psr = pd.Series(data, dtype=None if len(data) else "datetime64[ns]") + psr = data.copy() if len(data) > 0: if nulls == "some": diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 92e70543cbe..b354f6b2f8a 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -22,7 +22,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = pd.Series(data, dtype=None if len(data) else "float64") + psr = cudf.utils.utils.create_pandas_series(data=data) if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index b4a45ed001b..eb8fb1db46f 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -56,7 +56,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, 
keep): - pds = Series(data, dtype=None if len(data) else "float64") + pds = cudf.utils.utils.create_pandas_series(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index adb6bb33763..000bd87803d 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -961,7 +961,7 @@ def test_index_equal_misc(data, other): assert_eq(expected, actual) expected = pd_data.equals( - pd.Series(pd_other, dtype=None if len(pd_other) else "float64") + cudf.utils.utils.create_pandas_series(data=pd_other) ) actual = gd_data.equals(cudf.Series(gd_other)) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 1dd3a5c1c8c..9cf8b3ac239 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -159,7 +159,7 @@ def test_integer_dataframe(x): @settings(deadline=None) def test_integer_series(x): sr = cudf.Series(x) - ps = pd.Series(x, dtype=None if len(x) else "float64") + ps = cudf.utils.utils.create_pandas_series(data=x) assert sr.__repr__() == ps.__repr__() @@ -176,7 +176,7 @@ def test_float_dataframe(x): @settings(deadline=None) def test_float_series(x): sr = cudf.Series(x, nan_as_null=False) - ps = pd.Series(x, dtype=None if len(x) else "float64") + ps = cudf.utils.utils.create_pandas_series(data=x) assert sr.__repr__() == ps.__repr__() diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index c701e863c35..27236910ebb 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -39,7 +39,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = pd.Series(data, index=index, dtype=None if len(data) else "float64") + psr = 
cudf.utils.utils.create_pandas_series(data=data, index=index) gsr = cudf.Series(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): @@ -214,7 +214,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = pd.Series(data, index=index, dtype=None if len(data) else "float64") + psr = cudf.utils.utils.create_pandas_series(data=data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b6210be62f3..d62942c2364 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -384,7 +384,7 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = pd.Series(data, dtype=None if len(data) else "float64") + psr = cudf.utils.utils.create_pandas_series(data=data) gsr = cudf.Series(data) assert_eq(psr.size, gsr.size) @@ -482,7 +482,7 @@ def test_series_factorize(data, na_sentinel): @pytest.mark.parametrize( "data", [ - [], + pd.Series([], dtype="datetime64[ns]"), pd.Series(pd.date_range("2010-01-01", "2010-02-01")), pd.Series([None, None], dtype="datetime64[ns]"), ], @@ -491,7 +491,7 @@ def test_series_factorize(data, na_sentinel): @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_series_datetime_value_counts(data, nulls, normalize, dropna): - psr = pd.Series(data, dtype=None if len(data) else "datetime64[ns]") + psr = data.copy() if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index e8483e44462..1512c87d160 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -6,6 +6,7 @@ import pandas as pd import pytest +import cudf from cudf.core 
import Series from cudf.datasets import randomdata from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -204,7 +205,7 @@ def test_approx_quantiles_int(): @pytest.mark.parametrize("q", [[], 0.5, 1, 0.234, [0.345], [0.243, 0.5, 1]]) def test_misc_quantiles(data, q): - pdf_series = pd.Series(data, dtype=None if len(data) else "float64") + pdf_series = cudf.utils.utils.create_pandas_series(data=data) gdf_series = Series(data) expected = pdf_series.quantile(q) @@ -434,13 +435,13 @@ def test_df_corr(): ) @pytest.mark.parametrize("skipna", [True, False, None]) def test_nans_stats(data, ops, skipna): - psr = pd.Series(data, dtype=None if len(data) else "float64") + psr = cudf.utils.utils.create_pandas_series(data=data) gsr = Series(data) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - psr = pd.Series(data, dtype=None if len(data) else "float64") + psr = cudf.utils.utils.create_pandas_series(data=data) gsr = Series(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. So only diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 74622a8ceb2..b0a1aff4ada 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ import functools from collections import OrderedDict from collections.abc import Sequence @@ -622,3 +623,18 @@ def _categorical_scalar_broadcast_to(cat_scalar, size): offset=codes.offset, ordered=ordered, ) + + +def create_pandas_series( + data=None, index=None, dtype=None, name=None, copy=False, fastpath=False +): + if (data is None or len(data) == 0) and dtype is None: + dtype = "float64" + return pd.Series( + data=data, + index=index, + dtype=dtype, + name=name, + copy=copy, + fastpath=fastpath, + ) From ea6173301efb8f04bac59b7a2d4893a62cb36a27 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 10:23:25 -0800 Subject: [PATCH 16/35] remove is_scalar check --- python/cudf/cudf/core/column/categorical.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 99da0f9970c..0649f82256e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -946,12 +946,7 @@ def unary_operator(self, unaryop: str): ) def __setitem__(self, key, value): - if cudf.utils.dtypes.is_scalar(value): - new_values = [value] - else: - new_values = value - - to_add_categories = cudf.Index(new_values).difference(self.categories) + to_add_categories = cudf.Index(value).difference(self.categories) if ( len(to_add_categories) From d8ca966f426fc70175068fcd59667b10423edf47 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 12:35:40 -0800 Subject: [PATCH 17/35] version all pytest xfails --- python/cudf/cudf/tests/test_dataframe.py | 29 +++- python/cudf/cudf/tests/test_index.py | 32 ++++ python/cudf/cudf/tests/test_indexing.py | 6 +- python/cudf/cudf/tests/test_joining.py | 6 + python/cudf/cudf/tests/test_json.py | 14 +- python/cudf/cudf/tests/test_numerical.py | 9 ++ python/cudf/cudf/tests/test_reshape.py | 12 +- python/cudf/cudf/tests/test_setitem.py | 6 + python/cudf/cudf/tests/test_string.py | 149 
++++++++++++------ python/cudf/cudf/tests/test_timedelta.py | 47 +++++- .../dask_cudf/dask_cudf/tests/test_groupby.py | 15 +- 11 files changed, 267 insertions(+), 58 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d792c62a247..b72b3338342 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -18,7 +18,7 @@ from numba import cuda import cudf -from cudf.core._compat import PANDAS_GE_110 +from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.core.column import column from cudf.tests import utils from cudf.tests.utils import ( @@ -1852,7 +1852,7 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count): psr = pd.DataFrame(data) gsr = cudf.DataFrame(data) - if psr.shape[0] * psr.shape[1] < min_count: + if PANDAS_GE_120 and psr.shape[0] * psr.shape[1] < min_count: pytest.xfail("https://github.com/pandas-dev/pandas/issues/39738") assert_eq( @@ -4065,7 +4065,14 @@ def test_isin_datetime(data, values): ["this", "is"], [None, None, None], ["12", "14", "19"], - [12, 14, 19], + pytest.param( + [12, 14, 19], + marks=pytest.mark.xfail( + not PANDAS_GE_120, + reason="pandas's failure here seems like a bug(in < 1.2) " + "given the reverse succeeds", + ), + ), ["is", "this", "is", "this", "is"], ], ) @@ -4286,7 +4293,14 @@ def test_isin_dataframe(data, values): rfunc_args_and_kwargs=([values],), ) else: - expected = pdf.isin(values) + try: + expected = pdf.isin(values) + except ValueError as e: + if str(e) == "Lengths must match.": + pytest.xfail( + not PANDAS_GE_110, + "https://github.com/pandas-dev/pandas/issues/34256", + ) if isinstance(values, (pd.DataFrame, pd.Series)): values = cudf.from_pandas(values) @@ -4906,7 +4920,12 @@ def test_rowwise_ops_datetime_dtypes_pdbug(data): expected = pdf.max(axis=1, skipna=False) got = gdf.max(axis=1, skipna=False) - assert_eq(got, expected) + if PANDAS_GE_120: + assert_eq(got, expected) + else: + # 
PANDAS BUG: https://github.com/pandas-dev/pandas/issues/36907 + with pytest.raises(AssertionError, match="numpy array are different"): + assert_eq(got, expected) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 000bd87803d..127d198d61e 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -12,6 +12,7 @@ import cudf from cudf.core import DataFrame +from cudf.core._compat import PANDAS_GE_110 from cudf.core.index import ( CategoricalIndex, DatetimeIndex, @@ -798,6 +799,17 @@ def test_index_difference(data, other, sort): gd_data = cudf.core.index.as_index(data) gd_other = cudf.core.index.as_index(other) + if ( + gd_data.dtype.kind == "f" + and gd_other.dtype.kind != "f" + or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f") + ): + pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="Bug in Pandas: " + "https://github.com/pandas-dev/pandas/issues/35217", + ) + expected = pd_data.difference(pd_other, sort=sort) actual = gd_data.difference(gd_other, sort=sort) assert_eq(expected, actual) @@ -856,6 +868,15 @@ def test_index_equals(data, other): gd_data = cudf.core.index.as_index(data) gd_other = cudf.core.index.as_index(other) + if ( + gd_data.dtype.kind == "f" or gd_other.dtype.kind == "f" + ) and cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): + pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="Bug in Pandas: " + "https://github.com/pandas-dev/pandas/issues/35217", + ) + expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) assert_eq(expected, actual) @@ -902,6 +923,17 @@ def test_index_categories_equal(data, other): gd_data = cudf.core.index.as_index(data).astype("category") gd_other = cudf.core.index.as_index(other) + if ( + gd_data.dtype.kind == "f" + and gd_other.dtype.kind != "f" + or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f") + ): + pytest.mark.xfail( + condition=not 
PANDAS_GE_110, + reason="Bug in Pandas: " + "https://github.com/pandas-dev/pandas/issues/35217", + ) + expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 6e33b1421c8..6921ac3fa35 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -9,7 +9,7 @@ import cudf from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_110 +from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.tests import utils from cudf.tests.utils import INTEGER_TYPES, assert_eq, assert_exceptions_equal @@ -975,6 +975,10 @@ def test_series_setitem_datetime(): assert_eq(psr, gsr) +@pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="Pandas will coerce to object datatype here", +) def test_series_setitem_datetime_coerced(): psr = pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]") gsr = cudf.from_pandas(psr) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index d7735f9029f..f8af320eb84 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -6,6 +6,7 @@ import cudf from cudf.core import DataFrame, Series +from cudf.core._compat import PANDAS_GE_120 from cudf.core.dtypes import CategoricalDtype from cudf.tests.utils import ( INTEGER_TYPES, @@ -540,6 +541,11 @@ def test_empty_joins(how, left_empty, right_empty): assert len(expected) == len(result) +@pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="left_on/right_on produces undefined results with 0" + "index and is disabled", +) def test_merge_left_index_zero(): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) right = pd.DataFrame( diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 791598110df..e0a922f35fe 100644 --- 
a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_110 from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq @@ -133,7 +134,18 @@ def test_json_writer(tmpdir, pdf, gdf): assert os.path.exists(pdf_series_fname) assert os.path.exists(gdf_series_fname) - expect_series = pd.read_json(pdf_series_fname, typ="series") + try: + # xref 'https://github.com/pandas-dev/pandas/pull/33373' + expect_series = pd.read_json(pdf_series_fname, typ="series") + except TypeError as e: + if ( + not PANDAS_GE_110 + and str(e) == " is not convertible to datetime" + ): + continue + else: + raise e + got_series = pd.read_json(gdf_series_fname, typ="series") assert_eq(expect_series, got_series) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index a70dd7f4024..f4cdf619212 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -6,6 +6,7 @@ import cudf from cudf import Series +from cudf.core._compat import PANDAS_GE_100 from cudf.tests.utils import assert_eq @@ -89,6 +90,10 @@ def test_can_cast_safely_mixed_kind(): assert not data.can_cast_safely(to_dtype) +@pytest.mark.xfail( + condition=not PANDAS_GE_100, + reason="cuDF null <-> pd.NA compatibility not yet supported", +) def test_to_pandas_nullable_integer(): gsr_not_null = Series([1, 2, 3]) gsr_has_null = Series([1, 2, None]) @@ -100,6 +105,10 @@ def test_to_pandas_nullable_integer(): assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) +@pytest.mark.xfail( + condition=not PANDAS_GE_100, + reason="cuDF null <-> pd.NA compatibility not yet supported", +) def test_to_pandas_nullable_bool(): gsr_not_null = Series([True, False, True]) gsr_has_null = Series([True, False, None]) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 5e90c2348e4..a8196c596f0 100644 --- 
a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -9,6 +9,7 @@ import cudf from cudf import melt as cudf_melt from cudf.core import DataFrame +from cudf.core._compat import PANDAS_GE_120 from cudf.tests.utils import ( ALL_TYPES, DATETIME_TYPES, @@ -73,7 +74,16 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( - "dtype", list(NUMERIC_TYPES + DATETIME_TYPES) + ["str"], + "dtype", + list(NUMERIC_TYPES + DATETIME_TYPES) + + [ + pytest.param( + "str", + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, reason="pandas bug" + ), + ) + ], ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_df_stack(nulls, num_cols, num_rows, dtype): diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 2d4791f541c..57661511f5b 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_120 from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -19,6 +20,11 @@ def test_dataframe_setitem_bool_mask_scaler(df, arg, value): assert_eq(df, gdf) +@pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="pandas incorrectly adds nulls with dataframes " + "but works fine with scalers", +) def test_dataframe_setitem_scaler_bool(): df = pd.DataFrame({"a": [1, 2, 3]}) df[[True, False, True]] = pd.DataFrame({"a": [-1, -2]}) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index f98f897ef72..13501d97405 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -13,6 +13,7 @@ import cudf from cudf import concat from cudf.core import DataFrame, Series +from cudf.core._compat import PANDAS_GE_110 from cudf.core.column.string import StringColumn from 
cudf.core.index import StringIndex, as_index from cudf.tests.utils import ( @@ -341,8 +342,20 @@ def _cat_convert_seq_to_cudf(others): ("f", "g", "h", "i", "j"), pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + pytest.param( + pd.Index(["f", "g", "h", "i", "j"]), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), + ), + pytest.param( + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), + ), ( np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), @@ -367,26 +380,38 @@ def _cat_convert_seq_to_cudf(others): pd.Series(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), ), - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), + pytest.param( + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), + ), + pytest.param( + [ + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", 
"a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ], + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), ), - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ], [ pd.Series(["hello", "world", "abc", "xyz", "pqr"]), pd.Series(["abc", "xyz", "hello", "pqr", "world"]), @@ -488,8 +513,20 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): ("f", "g", "h", "i", "j"), pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + pytest.param( + pd.Index(["f", "g", "h", "i", "j"]), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), + ), + pytest.param( + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), + ), ( np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), @@ -502,26 +539,38 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["f", "g", "h", "i", "j"]), ], - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), + pytest.param( + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + 
np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), + ), + pytest.param( + [ + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ], + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), ), - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ], [ pd.Series( ["hello", "world", "abc", "xyz", "pqr"], @@ -580,8 +629,20 @@ def test_string_index_str_cat(data, others, sep, na_rep, name): None, ["f", "g", "h", "i", "j"], pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + pytest.param( + pd.Index(["f", "g", "h", "i", "j"]), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), + ), + pytest.param( + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/33436", + ), + ), [ np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index d55bc533ba8..3efc30af01e 100644 --- 
a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -10,6 +10,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_120 from cudf.tests import utils as utils from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -421,7 +422,13 @@ def test_timedelta_dataframe_ops(df, op): np.timedelta64(4, "s"), np.timedelta64(456, "D"), np.timedelta64(46, "h"), - np.timedelta64("nat"), + pytest.param( + np.timedelta64("nat"), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", + ), + ), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), @@ -430,7 +437,20 @@ def test_timedelta_dataframe_ops(df, op): ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) @pytest.mark.parametrize( - "op", ["add", "sub", "truediv", "mod", "floordiv"], + "op", + [ + "add", + "sub", + "truediv", + "mod", + pytest.param( + "floordiv", + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", + ), + ), + ], ) def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): gsr = cudf.Series(data=data, dtype=dtype) @@ -504,7 +524,13 @@ def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): datetime.timedelta(seconds=768), datetime.timedelta(microseconds=7), np.timedelta64(4, "s"), - np.timedelta64("nat", "s"), + pytest.param( + np.timedelta64("nat", "s"), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", + ), + ), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), @@ -514,7 +540,20 @@ def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) @pytest.mark.parametrize( - "op", ["add", "sub", "truediv", "mod", "floordiv"], + "op", + [ + "add", + "sub", + "truediv", + "mod", + 
pytest.param( + "floordiv", + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, + reason="https://github.com/pandas-dev/pandas/issues/35529", + ), + ), + ], ) def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): gpu_scalar = cudf.Scalar(cpu_scalar) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 2bb80b85568..f8ed00beb4f 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -10,6 +10,7 @@ import dask_cudf import cudf +from cudf.core._compat import PANDAS_GE_120 @pytest.mark.parametrize("aggregation", ["sum", "mean", "count", "min", "max"]) @@ -127,8 +128,18 @@ def test_groupby_std(func): @pytest.mark.parametrize( "func", [ - pytest.param(lambda df: df.groupby(["a", "b"]).x.sum()), - pytest.param(lambda df: df.groupby(["a", "b"]).sum()), + pytest.param( + lambda df: df.groupby(["a", "b"]).x.sum(), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, reason="pandas bug" + ), + ), + pytest.param( + lambda df: df.groupby(["a", "b"]).sum(), + marks=pytest.mark.xfail( + condition=not PANDAS_GE_120, reason="pandas bug" + ), + ), pytest.param( lambda df: df.groupby(["a", "b"]).agg({"x", "sum"}), marks=pytest.mark.xfail, From 8d079f0375c37fd410198edb6eea4636cd097560 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 13:05:00 -0800 Subject: [PATCH 18/35] add check_order flag --- python/cudf/cudf/testing/testing.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 2f9a78aab78..c9b519ed1e9 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -231,6 +231,7 @@ def assert_index_equal( check_less_precise: Union[bool, int] = False, check_exact: bool = True, check_categorical: bool = True, + check_order: bool = True, rtol: float = 1e-5, atol: float = 1e-8, obj: 
str = "Index", @@ -260,6 +261,13 @@ def assert_index_equal( Whether to compare number exactly. check_categorical : bool, default True Whether to compare internal Categorical exactly. + check_order : bool, default True + Whether to compare the order of index entries as + well as their values. + If True, both indexes must contain the same elements, + in the same order. + If False, both indexes must contain the same elements, + but in any order. rtol : float, default 1e-5 Relative tolerance. Only used when `check_exact` is False. atol : float, default 1e-8 @@ -310,6 +318,11 @@ def assert_index_equal( obj, "lengths are different", f"{len(left)}", f"{len(right)}" ) + # If order doesn't matter then sort the index entries + if not check_order: + left = left.sort_values() + right = right.sort_values() + if isinstance(left, cudf.MultiIndex): if left.nlevels != right.nlevels: raise AssertionError( @@ -328,6 +341,7 @@ def assert_index_equal( exact=check_exact, check_names=check_names, check_exact=check_exact, + check_order=check_order, rtol=rtol, atol=atol, obj=mul_obj, From d8ff5349d51f6c27479b6651fa3f3dddc7b63c5b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 13:22:22 -0800 Subject: [PATCH 19/35] remove version for cudf apis --- python/cudf/cudf/testing/testing.py | 140 ++++++++++------------------ 1 file changed, 48 insertions(+), 92 deletions(-) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index c9b519ed1e9..ec1af0b7321 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -334,28 +334,18 @@ def assert_index_equal( llevel = cudf.Index(left._columns[level], name=left.names[level]) rlevel = cudf.Index(right._columns[level], name=right.names[level]) mul_obj = f"MultiIndex level [{level}]" - if PANDAS_GE_110: - assert_index_equal( - llevel, - rlevel, - exact=check_exact, - check_names=check_names, - check_exact=check_exact, - check_order=check_order, - rtol=rtol, - 
atol=atol, - obj=mul_obj, - ) - else: - assert_index_equal( - llevel, - rlevel, - exact=check_exact, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - obj=mul_obj, - ) + assert_index_equal( + llevel, + rlevel, + exact=check_exact, + check_names=check_names, + check_exact=check_exact, + check_less_precise=check_less_precise, + check_order=check_order, + rtol=rtol, + atol=atol, + obj=mul_obj, + ) return assert_column_equal( @@ -472,55 +462,32 @@ def assert_series_equal( raise_assert_detail(obj, "Series length are different", msg1, msg2) # index comparison - if PANDAS_GE_110: - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.index", - ) - else: - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.index", - ) + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.index", + ) - if PANDAS_GE_110: - assert_column_equal( - left._column, - right._column, - check_dtype=check_dtype, - check_column_type=check_series_type, - check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - check_category_order=check_category_order, - rtol=rtol, - atol=atol, - ) - else: - assert_column_equal( - left._column, - right._column, - check_dtype=check_dtype, - check_column_type=check_series_type, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - 
check_category_order=check_category_order, - ) + assert_column_equal( + left._column, + right._column, + check_dtype=check_dtype, + check_column_type=check_series_type, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + check_category_order=check_category_order, + rtol=rtol, + atol=atol, + ) # metadata comparison if check_names and (left.name != right.name): @@ -695,25 +662,14 @@ def assert_frame_equal( ) for col in left.columns: - if PANDAS_GE_110: - assert_column_equal( - left._data[col], - right._data[col], - check_dtype=check_dtype, - check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f'Column name="{col}"', - ) - else: - assert_column_equal( - left._data[col], - right._data[col], - check_dtype=check_dtype, - check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - obj=f'Column name="{col}"', - ) + assert_column_equal( + left._data[col], + right._data[col], + check_dtype=check_dtype, + check_exact=check_exact, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f'Column name="{col}"', + ) From a0637b9727f9a6cf9d8e26dbbd55826152b3bcc4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 13:35:35 -0800 Subject: [PATCH 20/35] make importing cudf uniform in pytests --- python/cudf/cudf/tests/test_categorical.py | 102 +++++++++--------- .../dask_cudf/tests/test_reductions.py | 10 +- 2 files changed, 55 insertions(+), 57 deletions(-) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index d4dca164992..4be0475a4a3 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -6,10 +6,8 @@ import pandas as pd 
import pytest -import cudf as gd -from cudf.core import DataFrame, Series +import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.core.index import as_index from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -22,10 +20,10 @@ def pd_str_cat(): def test_categorical_basic(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - cudf_cat = as_index(cat) + cudf_cat = cudf.Index(cat) pdsr = pd.Series(cat, index=["p", "q", "r", "s", "t"]) - sr = Series(cat, index=["p", "q", "r", "s", "t"]) + sr = cudf.Series(cat, index=["p", "q", "r", "s", "t"]) assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False) # Test attributes @@ -53,7 +51,7 @@ def test_categorical_integer(): pytest.xfail(reason="pandas >=1.1 required") cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) np.testing.assert_array_equal( cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_array() ) @@ -81,7 +79,7 @@ def test_categorical_compare_unordered(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) # test equal out = sr == sr @@ -112,12 +110,12 @@ def test_categorical_compare_ordered(): ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True ) pdsr1 = pd.Series(cat1) - sr1 = Series(cat1) + sr1 = cudf.Series(cat1) cat2 = pd.Categorical( ["a", "b", "a", "c", "b"], categories=["a", "b", "c"], ordered=True ) pdsr2 = pd.Series(cat2) - sr2 = Series(cat2) + sr2 = cudf.Series(cat2) # test equal out = sr1 == sr1 @@ -142,7 +140,7 @@ def test_categorical_compare_ordered(): def test_categorical_binary_add(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) assert_exceptions_equal( lfunc=operator.add, @@ -157,7 +155,7 @@ def test_categorical_binary_add(): def 
test_categorical_unary_ceil(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) assert_exceptions_equal( lfunc=getattr, @@ -176,7 +174,7 @@ def test_categorical_element_indexing(): """ cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) assert_eq(pdsr, sr) assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False) @@ -188,7 +186,7 @@ def test_categorical_masking(): """ cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) # check scalar comparison expect_matches = pdsr == "a" @@ -208,7 +206,7 @@ def test_categorical_masking(): def test_df_cat_set_index(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) df["b"] = np.arange(len(df)) got = df.set_index("a") @@ -220,7 +218,7 @@ def test_df_cat_set_index(): def test_df_cat_sort_index(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) df["b"] = np.arange(len(df)) @@ -231,7 +229,7 @@ def test_df_cat_sort_index(): def test_cat_series_binop_error(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) df["b"] = np.arange(len(df)) @@ -273,8 +271,8 @@ def test_categorical_unique(num_elements): ) # gdf - gdf = DataFrame() - gdf["a"] = Series.from_categorical(pd_cat) + gdf = cudf.DataFrame() + gdf["a"] = cudf.Series.from_categorical(pd_cat) gdf_unique_sorted = np.sort(gdf["a"].unique().to_pandas()) # pandas @@ -300,8 +298,8 @@ def test_categorical_unique_count(nelem): ) # gdf - gdf = DataFrame() - gdf["a"] = Series.from_categorical(pd_cat) + gdf = cudf.DataFrame() + gdf["a"] = cudf.Series.from_categorical(pd_cat) gdf_unique_count = gdf["a"].nunique() # pandas @@ -316,7 +314,7 @@ 
def test_categorical_unique_count(nelem): def test_categorical_empty(): cat = pd.Categorical([]) pdsr = pd.Series(cat) - sr = Series(cat) + sr = cudf.Series(cat) np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_array()) # Test attributes @@ -331,7 +329,7 @@ def test_categorical_empty(): def test_categorical_set_categories(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) psr = pd.Series(cat) - sr = Series.from_categorical(cat) + sr = cudf.Series.from_categorical(cat) # adding category expect = psr.cat.set_categories(["a", "b", "c", "d"]) @@ -349,7 +347,7 @@ def test_categorical_set_categories_preserves_order(): # reassigning categories should preserve element ordering assert_eq( series.cat.set_categories([1, 2]), - Series(series).cat.set_categories([1, 2]), + cudf.Series(series).cat.set_categories([1, 2]), ) @@ -357,7 +355,7 @@ def test_categorical_set_categories_preserves_order(): def test_categorical_as_ordered(pd_str_cat, inplace): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(False)) - cd_sr = gd.Series(pd_str_cat.copy().set_ordered(False)) + cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(False)) assert cd_sr.cat.ordered is False assert cd_sr.cat.ordered == pd_sr.cat.ordered @@ -376,7 +374,7 @@ def test_categorical_as_ordered(pd_str_cat, inplace): def test_categorical_as_unordered(pd_str_cat, inplace): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(True)) - cd_sr = gd.Series(pd_str_cat.copy().set_ordered(True)) + cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(True)) assert cd_sr.cat.ordered is True assert cd_sr.cat.ordered == pd_sr.cat.ordered @@ -399,7 +397,7 @@ def test_categorical_reorder_categories( ): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered)) - cd_sr = gd.Series(pd_str_cat.copy().set_ordered(from_ordered)) + cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered)) assert_eq(pd_sr, cd_sr) @@ -421,7 +419,7 @@ def test_categorical_reorder_categories( def 
test_categorical_add_categories(pd_str_cat, inplace): pd_sr = pd.Series(pd_str_cat.copy()) - cd_sr = gd.Series(pd_str_cat.copy()) + cd_sr = cudf.Series(pd_str_cat.copy()) assert_eq(pd_sr, cd_sr) @@ -442,7 +440,7 @@ def test_categorical_add_categories(pd_str_cat, inplace): def test_categorical_remove_categories(pd_str_cat, inplace): pd_sr = pd.Series(pd_str_cat.copy()) - cd_sr = gd.Series(pd_str_cat.copy()) + cd_sr = cudf.Series(pd_str_cat.copy()) assert_eq(pd_sr, cd_sr) @@ -470,7 +468,7 @@ def test_categorical_remove_categories(pd_str_cat, inplace): def test_categorical_dataframe_slice_copy(): pdf = pd.DataFrame({"g": pd.Series(["a", "b", "z"], dtype="category")}) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) exp = pdf[1:].copy() gdf = gdf[1:].copy() @@ -511,7 +509,7 @@ def test_categorical_dataframe_slice_copy(): ) def test_categorical_typecast(data, cat_type): pd_data = data.copy() - gd_data = gd.from_pandas(data) + gd_data = cudf.from_pandas(data) assert_eq(pd_data.astype(cat_type), gd_data.astype(cat_type)) @@ -545,7 +543,7 @@ def test_categorical_typecast(data, cat_type): ) def test_categorical_set_categories_categoricals(data, new_categories): pd_data = data.copy().astype("category") - gd_data = gd.from_pandas(pd_data) + gd_data = cudf.from_pandas(pd_data) assert_eq( pd_data.cat.set_categories(new_categories=new_categories), @@ -557,7 +555,7 @@ def test_categorical_set_categories_categoricals(data, new_categories): new_categories=pd.Series(new_categories, dtype="category") ), gd_data.cat.set_categories( - new_categories=gd.Series(new_categories, dtype="category") + new_categories=cudf.Series(new_categories, dtype="category") ), ) @@ -590,14 +588,14 @@ def test_categorical_set_categories_categoricals(data, new_categories): ) def test_categorical_creation(data, dtype): expected = pd.Series(data, dtype=dtype) - got = gd.Series(data, dtype=dtype) + got = cudf.Series(data, dtype=dtype) assert_eq(expected, got) - got = gd.Series(data, 
dtype=gd.from_pandas(dtype)) + got = cudf.Series(data, dtype=cudf.from_pandas(dtype)) assert_eq(expected, got) expected = pd.Series(data, dtype="category") - got = gd.Series(data, dtype="category") + got = cudf.Series(data, dtype="category") assert_eq(expected, got) @@ -613,33 +611,33 @@ def test_categorical_creation(data, dtype): @pytest.mark.parametrize("ordered", [True, False]) def test_categorical_dtype(categories, ordered): expected = pd.CategoricalDtype(categories=categories, ordered=ordered) - got = gd.CategoricalDtype(categories=categories, ordered=ordered) + got = cudf.CategoricalDtype(categories=categories, ordered=ordered) assert_eq(expected, got) @pytest.mark.parametrize( ("data", "expected"), [ - (gd.Series([1]), np.uint8), - (gd.Series([1, None]), np.uint8), - (gd.Series(np.arange(np.iinfo(np.int8).max)), np.uint8), + (cudf.Series([1]), np.uint8), + (cudf.Series([1, None]), np.uint8), + (cudf.Series(np.arange(np.iinfo(np.int8).max)), np.uint8), ( - gd.Series(np.append(np.arange(np.iinfo(np.int8).max), [None])), + cudf.Series(np.append(np.arange(np.iinfo(np.int8).max), [None])), np.uint8, ), - (gd.Series(np.arange(np.iinfo(np.int16).max)), np.uint16), + (cudf.Series(np.arange(np.iinfo(np.int16).max)), np.uint16), ( - gd.Series(np.append(np.arange(np.iinfo(np.int16).max), [None])), + cudf.Series(np.append(np.arange(np.iinfo(np.int16).max), [None])), np.uint16, ), - (gd.Series(np.arange(np.iinfo(np.uint8).max)), np.uint8), + (cudf.Series(np.arange(np.iinfo(np.uint8).max)), np.uint8), ( - gd.Series(np.append(np.arange(np.iinfo(np.uint8).max), [None])), + cudf.Series(np.append(np.arange(np.iinfo(np.uint8).max), [None])), np.uint8, ), - (gd.Series(np.arange(np.iinfo(np.uint16).max)), np.uint16), + (cudf.Series(np.arange(np.iinfo(np.uint16).max)), np.uint16), ( - gd.Series(np.append(np.arange(np.iinfo(np.uint16).max), [None])), + cudf.Series(np.append(np.arange(np.iinfo(np.uint16).max), [None])), np.uint16, ), ], @@ -664,7 +662,7 @@ def 
test_astype_dtype(data, expected): ) def test_add_categories(data, add): pds = pd.Series(data, dtype="category") - gds = gd.Series(data, dtype="category") + gds = cudf.Series(data, dtype="category") expected = pds.cat.add_categories(add) actual = gds.cat.add_categories(add) @@ -692,7 +690,7 @@ def test_add_categories(data, add): ) def test_add_categories_error(data, add): pds = pd.Series(data, dtype="category") - gds = gd.Series(data, dtype="category") + gds = cudf.Series(data, dtype="category") assert_exceptions_equal( pds.cat.add_categories, @@ -704,12 +702,12 @@ def test_add_categories_error(data, add): def test_add_categories_mixed_error(): - gds = gd.Series(["a", "bd", "ef"], dtype="category") + gds = cudf.Series(["a", "bd", "ef"], dtype="category") with pytest.raises(TypeError): gds.cat.add_categories([1, 2, 3]) - gds = gd.Series([1, 2, 3], dtype="category") + gds = cudf.Series([1, 2, 3], dtype="category") with pytest.raises(TypeError): gds.cat.add_categories(["a", "bd", "ef"]) @@ -743,7 +741,7 @@ def test_add_categories_mixed_error(): def test_categorical_assignment(data, cat_dtype): pd_df = pd.DataFrame() pd_df["a"] = np.ones(len(data)) - cd_df = gd.from_pandas(pd_df) + cd_df = cudf.from_pandas(pd_df) pd_cat_series = pd.Series(data, dtype=cat_dtype) # assign categorical series @@ -757,7 +755,7 @@ def test_categorical_assignment(data, cat_dtype): # see issue: https://github.com/rapidsai/cudf/issues/2269 pd_df = pd.DataFrame() pd_df["a"] = np.ones(len(data)) - cd_df = gd.from_pandas(pd_df) + cd_df = cudf.from_pandas(pd_df) pd_categorical = pd.Categorical(data, dtype=cat_dtype) pd_df.assign(cat_col=pd_categorical) diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 4da81e4f86c..030b7717fbc 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -8,7 +8,7 @@ import dask_cudf as dgd -import cudf as gd +import cudf def 
_make_random_frame(nelem, npartitions=2): @@ -18,7 +18,7 @@ def _make_random_frame(nelem, npartitions=2): "y": np.random.normal(size=nelem) + 1, } ) - gdf = gd.DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) dgf = dgd.from_cudf(gdf, npartitions=npartitions) return df, dgf @@ -49,15 +49,15 @@ def test_series_reduce(reducer): @pytest.mark.parametrize( "data", [ - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10000, dtypes={"a": "category", "b": int, "c": float, "d": int}, ), - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10000, dtypes={"a": "category", "b": int, "c": float, "d": str}, ), - gd.datasets.randomdata( + cudf.datasets.randomdata( nrows=10000, dtypes={"a": bool, "b": int, "c": float, "d": str} ), ], From b63ae03d30a7403ba43113ad3e89016ebe373371 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 14:51:46 -0800 Subject: [PATCH 21/35] refactor imports to be uniform and less confusing --- python/cudf/cudf/tests/test_categorical.py | 13 +- python/cudf/cudf/tests/test_duplicates.py | 6 +- python/cudf/cudf/tests/test_groupby.py | 10 +- python/cudf/cudf/tests/test_index.py | 17 +- python/cudf/cudf/tests/test_indexing.py | 71 +++--- python/cudf/cudf/tests/test_joining.py | 230 +++++++++--------- python/cudf/cudf/tests/test_numerical.py | 49 ++-- python/cudf/cudf/tests/test_reshape.py | 11 +- python/cudf/cudf/tests/test_sorting.py | 13 +- python/cudf/cudf/tests/test_stats.py | 114 +++++---- python/cudf/cudf/tests/test_string.py | 258 ++++++++++----------- 11 files changed, 393 insertions(+), 399 deletions(-) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 4be0475a4a3..9779fb786f6 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import operator +import string import numpy as np import pandas as pd @@ -259,13 +260,13 @@ def test_cat_series_binop_error(): @pytest.mark.parametrize("num_elements", [10, 100, 1000]) def test_categorical_unique(num_elements): - from string import ascii_letters, digits - # create categorical series np.random.seed(12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(ascii_letters + digits), num_elements), + np.random.choice( + list(string.ascii_letters + string.digits), num_elements + ), dtype="category", ) ) @@ -286,13 +287,13 @@ def test_categorical_unique(num_elements): @pytest.mark.parametrize("nelem", [20, 50, 100]) def test_categorical_unique_count(nelem): - from string import ascii_letters, digits - # create categorical series np.random.seed(12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(ascii_letters + digits), nelem), + np.random.choice( + list(string.ascii_letters + string.digits), nelem + ), dtype="category", ) ) diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index eb8fb1db46f..d429f658451 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -1,5 +1,8 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+import itertools as it +import random + import numpy as np import pytest from pandas import DataFrame, MultiIndex, Series, date_range @@ -277,9 +280,6 @@ def test_drop_duplicates_empty(df): @pytest.mark.parametrize("num_columns", [3, 4, 5]) def test_dataframe_drop_duplicates_numeric_method(num_columns): - import itertools as it - import random - comb = list(it.permutations(range(num_columns), num_columns)) shuf = list(comb) random.Random(num_columns).shuffle(shuf) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 3542a5af537..8011510d340 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,10 +1,12 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. +import datetime import itertools import numpy as np import pandas as pd import pytest +from numba import cuda from numpy.testing import assert_array_equal import cudf @@ -284,8 +286,6 @@ def foo(df): def test_groupby_apply_grouped(): - from numba import cuda - np.random.seed(0) df = DataFrame() nelem = 20 @@ -732,12 +732,12 @@ def test_groupby_multi_agg_multi_groupby(): def test_groupby_datetime_multi_agg_multi_groupby(): - from datetime import datetime, timedelta - pdf = pd.DataFrame( { "a": pd.date_range( - datetime.now(), datetime.now() + timedelta(9), freq="D" + datetime.datetime.now(), + datetime.datetime.now() + datetime.timedelta(9), + freq="D", ), "b": np.random.randint(0, 5, 10), "c": np.random.randint(0, 5, 10), diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 127d198d61e..af25b48dd23 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,7 +11,6 @@ import pytest import cudf -from cudf.core import DataFrame from cudf.core._compat import PANDAS_GE_110 from cudf.core.index import ( CategoricalIndex, @@ -35,7 +34,7 @@ def test_df_set_index_from_series(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = list(range(10)) 
df["b"] = list(range(0, 20, 2)) @@ -49,7 +48,7 @@ def test_df_set_index_from_series(): def test_df_set_index_from_name(): - df = DataFrame() + df = cudf.DataFrame() df["a"] = list(range(10)) df["b"] = list(range(0, 20, 2)) @@ -65,7 +64,7 @@ def test_df_set_index_from_name(): def test_df_slice_empty_index(): - df = DataFrame() + df = cudf.DataFrame() assert isinstance(df.index, RangeIndex) assert isinstance(df.index[:1], RangeIndex) with pytest.raises(IndexError): @@ -153,10 +152,10 @@ def test_categorical_index(): pdf = pd.DataFrame() pdf["a"] = [1, 2, 3] pdf["index"] = pd.Categorical(["a", "b", "c"]) - initial_df = DataFrame.from_pandas(pdf) + initial_df = cudf.from_pandas(pdf) pdf = pdf.set_index("index") - gdf1 = DataFrame.from_pandas(pdf) - gdf2 = DataFrame() + gdf1 = cudf.from_pandas(pdf) + gdf2 = cudf.DataFrame() gdf2["a"] = [1, 2, 3] gdf2["index"] = pd.Categorical(["a", "b", "c"]) assert_eq(initial_df.index, gdf2.index) @@ -273,7 +272,7 @@ def test_index_rename_preserves_arg(): def test_set_index_as_property(): - cdf = DataFrame() + cdf = cudf.DataFrame() col1 = np.arange(10) col2 = np.arange(0, 20, 2) cdf["a"] = col1 @@ -1419,7 +1418,7 @@ def test_multiindex_sample_basic(n, frac, replace, axis): "int": [1, 3, 5, 4, 2], }, ) - mul_index = cudf.Index(DataFrame.from_pandas(pdf)) + mul_index = cudf.Index(cudf.from_pandas(pdf)) random_state = 0 try: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 6921ac3fa35..73a074c0376 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -8,7 +8,6 @@ import pytest import cudf -from cudf import DataFrame, Series from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.tests import utils from cudf.tests.utils import INTEGER_TYPES, assert_eq, assert_exceptions_equal @@ -61,7 +60,11 @@ def pdf_gdf_multi(): pd.Series(range(3, 12)), pd.Series(range(0, 9, 2)), ), - (Series(range(12)), Series(range(3, 12)), Series(range(0, 
9, 2))), + ( + cudf.Series(range(12)), + cudf.Series(range(3, 12)), + cudf.Series(range(0, 9, 2)), + ), ( [i in range(12) for i in range(20)], [i in range(3, 12) for i in range(12)], @@ -98,7 +101,7 @@ def pdf_gdf_multi(): ) def test_series_indexing(i1, i2, i3): a1 = np.arange(20) - series = Series(a1) + series = cudf.Series(a1) # Indexing sr1 = series.iloc[i1] assert sr1.null_count == 0 @@ -125,7 +128,7 @@ def test_series_indexing_large_size(): gsr = cudf.Series(cupy.ones(n_elem)) gsr[0] = None got = gsr[gsr.isna()] - expect = Series([None], dtype="float64") + expect = cudf.Series([None], dtype="float64") assert_eq(expect, got) @@ -135,7 +138,7 @@ def test_series_indexing_large_size(): "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] ) def test_series_get_item(psr, arg): - gsr = Series.from_pandas(psr) + gsr = cudf.from_pandas(psr) expect = psr[arg] got = gsr[arg] @@ -144,7 +147,7 @@ def test_series_get_item(psr, arg): def test_dataframe_column_name_indexing(): - df = DataFrame() + df = cudf.DataFrame() data = np.asarray(range(10), dtype=np.int32) df["a"] = data df[1] = data @@ -161,7 +164,7 @@ def test_dataframe_column_name_indexing(): pdf["key2"] = np.random.randint(0, 3, nelem) pdf[1] = np.arange(1, 1 + nelem) pdf[2] = np.random.random(nelem) - df = DataFrame.from_pandas(pdf) + df = cudf.from_pandas(pdf) assert_eq(df[df.columns], df) assert_eq(df[df.columns[:1]], df[["key1"]]) @@ -174,7 +177,7 @@ def test_dataframe_column_name_indexing(): df = pd.DataFrame() for i in range(0, 10): df[i] = range(nelem) - gdf = DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) assert_eq(gdf, df) assert_eq(gdf[gdf.columns], gdf) @@ -182,7 +185,7 @@ def test_dataframe_column_name_indexing(): def test_dataframe_slicing(): - df = DataFrame() + df = cudf.DataFrame() size = 123 df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( np.int32 @@ -239,7 +242,7 @@ def test_dataframe_loc(scalar, step): } ) - df = DataFrame.from_pandas(pdf) + df = 
cudf.DataFrame.from_pandas(pdf) assert_eq(df.loc[:, ["a"]], pdf.loc[:, ["a"]]) @@ -311,7 +314,7 @@ def test_dataframe_loc(scalar, step): def test_dataframe_loc_duplicate_index_scalar(): pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf.loc[2], gdf.loc[2]) @@ -325,13 +328,13 @@ def test_dataframe_loc_mask(mask, arg): pdf = pd.DataFrame( {"a": ["a", "b", "c", "d", "e"], "b": ["f", "g", "h", "i", "j"]} ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf.loc[mask, arg], gdf.loc[mask, arg]) def test_dataframe_loc_outbound(): - df = DataFrame() + df = cudf.DataFrame() size = 10 df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( np.int32 @@ -347,7 +350,7 @@ def test_dataframe_loc_outbound(): def test_series_loc_numerical(): ps = pd.Series([1, 2, 3, 4, 5], index=[5, 6, 7, 8, 9]) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc[5], gs.loc[5]) assert_eq(ps.loc[6], gs.loc[6]) @@ -365,7 +368,7 @@ def test_series_loc_numerical(): def test_series_loc_float_index(): ps = pd.Series([1, 2, 3, 4, 5], index=[5.43, 6.34, 7.34, 8.0, 9.1]) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc[5.43], gs.loc[5.43]) assert_eq(ps.loc[8], gs.loc[8]) @@ -383,7 +386,7 @@ def test_series_loc_string(): ps = pd.Series( [1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"] ) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc["one"], gs.loc["one"]) assert_eq(ps.loc["five"], gs.loc["five"]) @@ -406,7 +409,7 @@ def test_series_loc_datetime(): ps = pd.Series( [1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105") ) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) # a few different ways of specifying a datetime label: assert_eq(ps.loc["20010101"], gs.loc["20010101"]) @@ -467,7 +470,7 @@ def test_series_loc_categorical(): 
ps = pd.Series( [1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"]) ) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc["a"], gs.loc["a"]) assert_eq(ps.loc["e"], gs.loc["e"]) @@ -531,12 +534,12 @@ def test_dataframe_series_loc_multiindex(obj): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_series_iloc(nelem): - # create random series + # create random cudf.Series np.random.seed(12) ps = pd.Series(np.random.sample(nelem)) - # gpu series - gs = Series(ps) + # gpu cudf.Series + gs = cudf.Series(ps) # positive tests for indexing np.testing.assert_allclose(gs.iloc[-1 * nelem], ps.iloc[-1 * nelem]) @@ -567,7 +570,7 @@ def test_series_iloc(nelem): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_dataframe_iloc(nelem): - gdf = DataFrame() + gdf = cudf.DataFrame() gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( np.int32 @@ -619,7 +622,7 @@ def test_dataframe_iloc(nelem): @pytest.mark.xfail(raises=AssertionError, reason="Series.index are different") def test_dataframe_iloc_tuple(): - gdf = DataFrame() + gdf = cudf.DataFrame() nelem = 123 gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( np.int32 @@ -641,7 +644,7 @@ def test_dataframe_iloc_tuple(): raises=IndexError, reason="positional indexers are out-of-bounds" ) def test_dataframe_iloc_index_error(): - gdf = DataFrame() + gdf = cudf.DataFrame() nelem = 123 gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( np.int32 @@ -662,7 +665,7 @@ def assert_col(g, p): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_dataframe_take(ntake): np.random.seed(0) - df = DataFrame() + df = cudf.DataFrame() nelem = 123 df["ii"] = np.random.randint(0, 20, nelem) @@ -681,7 +684,7 @@ def test_dataframe_take(ntake): @pytest.mark.parametrize("ntake", [1, 2, 8, 9]) def test_dataframe_take_with_multiIndex(ntake): np.random.seed(0) - df = DataFrame( + df = cudf.DataFrame( index=cudf.MultiIndex( 
levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], @@ -707,7 +710,7 @@ def test_series_take(ntake, keep_index): nelem = 123 data = np.random.randint(0, 20, nelem) - sr = Series(data) + sr = cudf.Series(data) take_indices = np.random.randint(0, len(sr), ntake) @@ -725,7 +728,7 @@ def test_series_take(ntake, keep_index): def test_series_take_positional(): psr = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) - gsr = Series.from_pandas(psr) + gsr = cudf.Series.from_pandas(psr) take_indices = [1, 2, 0, 3] @@ -739,7 +742,7 @@ def test_series_take_positional(): @pytest.mark.parametrize("slice_start", [None, 0, 1, 3, 10, -10]) @pytest.mark.parametrize("slice_end", [None, 0, 1, 30, 50, -1]) def test_dataframe_masked_slicing(nelem, slice_start, slice_end): - gdf = DataFrame() + gdf = cudf.DataFrame() gdf["a"] = list(range(nelem)) gdf["b"] = list(range(nelem, 2 * nelem)) gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem)) @@ -756,13 +759,13 @@ def do_slice(x): def test_dataframe_boolean_mask_with_None(): pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) pdf_masked = pdf[[True, False, True, False]] gdf_masked = gdf[[True, False, True, False]] assert_eq(pdf_masked, gdf_masked) with pytest.raises(ValueError): - gdf[Series([True, False, None, False])] + gdf[cudf.Series([True, False, None, False])] @pytest.mark.parametrize("dtype", [int, float, str]) @@ -842,12 +845,12 @@ def test_dataframe_apply_boolean_mask(): "c": ["a", None, "b", "c"], } ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf[[True, False, True, False]], gdf[[True, False, True, False]]) """ -This test compares cudf and Pandas dataframe boolean indexing. +This test compares cudf and Pandas DataFrame boolean indexing. 
""" @@ -1161,7 +1164,7 @@ def test_sliced_indexing(): a = list(range(4, 4 + 150)) b = list(range(0, 0 + 150)) pdf = pd.DataFrame({"a": a, "b": b}) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) pdf = pdf.set_index("a") gdf = gdf.set_index("a") pidx = pdf.index[:75] diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index f8af320eb84..8692057aa58 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -5,7 +5,6 @@ import pytest import cudf -from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_120 from cudf.core.dtypes import CategoricalDtype from cudf.tests.utils import ( @@ -72,7 +71,7 @@ def pd_odd_joins(left, right, join_type): @pytest.mark.parametrize("aa,bb,how,method", make_params()) def test_dataframe_join_how(aa, bb, how, method): - df = DataFrame() + df = cudf.DataFrame() df["a"] = aa df["b"] = bb @@ -133,8 +132,7 @@ def work_gdf(df): def _check_series(expect, got): magic = 0xDEADBEAF - # print("expect\n", expect) - # print("got\n", got.to_string(nrows=None)) + direct_equal = np.all(expect.values == got.to_array()) nanfilled_equal = np.all( expect.fillna(magic).values == got.fillna(magic).to_array() @@ -148,7 +146,7 @@ def _check_series(expect, got): def test_dataframe_join_suffix(): np.random.seed(0) - df = DataFrame() + df = cudf.DataFrame() for k in "abc": df[k] = np.random.randint(0, 5, 5) @@ -175,12 +173,12 @@ def test_dataframe_join_suffix(): def test_dataframe_join_cats(): - lhs = DataFrame() + lhs = cudf.DataFrame() lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) lhs["b"] = bb = np.arange(len(lhs)) lhs = lhs.set_index("a") - rhs = DataFrame() + rhs = cudf.DataFrame() rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc")) rhs["c"] = cc = np.arange(len(rhs)) rhs = rhs.set_index("a") @@ -243,8 +241,8 @@ def test_dataframe_join_mismatch_cats(how): pdf1["join_col"] = 
pdf1["join_col"].astype("category") pdf2["join_col"] = pdf2["join_col"].astype("category") - gdf1 = DataFrame.from_pandas(pdf1) - gdf2 = DataFrame.from_pandas(pdf2) + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) gdf1 = gdf1.set_index("join_col") gdf2 = gdf2.set_index("join_col") @@ -274,13 +272,13 @@ def test_dataframe_merge_on(on): np.random.seed(0) # Make cuDF - df_left = DataFrame() + df_left = cudf.DataFrame() nelem = 500 df_left["key1"] = np.random.randint(0, 40, nelem) df_left["key2"] = np.random.randint(0, 50, nelem) df_left["left_val"] = np.arange(nelem) - df_right = DataFrame() + df_right = cudf.DataFrame() nelem = 500 df_right["key1"] = np.random.randint(0, 30, nelem) df_right["key2"] = np.random.randint(0, 50, nelem) @@ -340,13 +338,13 @@ def test_dataframe_merge_on_unknown_column(): np.random.seed(0) # Make cuDF - df_left = DataFrame() + df_left = cudf.DataFrame() nelem = 500 df_left["key1"] = np.random.randint(0, 40, nelem) df_left["key2"] = np.random.randint(0, 50, nelem) df_left["left_val"] = np.arange(nelem) - df_right = DataFrame() + df_right = cudf.DataFrame() nelem = 500 df_right["key1"] = np.random.randint(0, 30, nelem) df_right["key2"] = np.random.randint(0, 50, nelem) @@ -361,13 +359,13 @@ def test_dataframe_merge_no_common_column(): np.random.seed(0) # Make cuDF - df_left = DataFrame() + df_left = cudf.DataFrame() nelem = 500 df_left["key1"] = np.random.randint(0, 40, nelem) df_left["key2"] = np.random.randint(0, 50, nelem) df_left["left_val"] = np.arange(nelem) - df_right = DataFrame() + df_right = cudf.DataFrame() nelem = 500 df_right["key3"] = np.random.randint(0, 30, nelem) df_right["key4"] = np.random.randint(0, 50, nelem) @@ -379,18 +377,18 @@ def test_dataframe_merge_no_common_column(): def test_dataframe_empty_merge(): - gdf1 = DataFrame({"a": [], "b": []}) - gdf2 = DataFrame({"a": [], "c": []}) + gdf1 = cudf.DataFrame({"a": [], "b": []}) + gdf2 = cudf.DataFrame({"a": [], "c": []}) - expect = DataFrame({"a": [], "b": 
[], "c": []}) + expect = cudf.DataFrame({"a": [], "b": [], "c": []}) got = gdf1.merge(gdf2, how="left", on=["a"]) assert_eq(expect, got) def test_dataframe_merge_order(): - gdf1 = DataFrame() - gdf2 = DataFrame() + gdf1 = cudf.DataFrame() + gdf2 = cudf.DataFrame() gdf1["id"] = [10, 11] gdf1["timestamp"] = [1, 2] gdf1["a"] = [3, 4] @@ -458,8 +456,8 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): pdf_left[left_column] = np.random.randint(0, max, rows) for right_column in pairs[1]: pdf_right[right_column] = np.random.randint(0, max, rows) - gdf_left = DataFrame.from_pandas(pdf_left) - gdf_right = DataFrame.from_pandas(pdf_right) + gdf_left = cudf.from_pandas(pdf_left) + gdf_right = cudf.from_pandas(pdf_right) if not set(pdf_left.columns).intersection(pdf_right.columns): with pytest.raises( pd.core.reshape.merge.MergeError, @@ -494,10 +492,6 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): def test_safe_merging_with_left_empty(): - import numpy as np - import pandas as pd - - from cudf import DataFrame np.random.seed(0) @@ -508,8 +502,8 @@ def test_safe_merging_with_left_empty(): pdf_left[left_column] = np.random.randint(0, 10, 0) for right_column in pairs[1]: pdf_right[right_column] = np.random.randint(0, 10, 5) - gdf_left = DataFrame.from_pandas(pdf_left) - gdf_right = DataFrame.from_pandas(pdf_right) + gdf_left = cudf.from_pandas(pdf_left) + gdf_right = cudf.from_pandas(pdf_right) pdf_result = pdf_left.merge(pdf_right) gdf_result = gdf_left.merge(gdf_right) @@ -551,8 +545,8 @@ def test_merge_left_index_zero(): right = pd.DataFrame( {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] ) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, left_on="x", right_on="y") gd_merge = gleft.merge(gright, left_on="x", right_on="y") @@ -573,8 +567,8 @@ def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): right = 
pd.DataFrame( {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] ) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) gd_merge = gleft.merge(gright, **kwargs) assert_eq(pd_merge, gd_merge) @@ -594,8 +588,8 @@ def test_merge_left_right_index_left_right_on_kwargs(kwargs): right = pd.DataFrame( {"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7] ) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) gd_merge = gleft.merge(gright, **kwargs) assert_eq(pd_merge, gd_merge) @@ -672,8 +666,8 @@ def test_merge_on_index_retained(): def test_merge_left_right_index_left_right_on_kwargs2(kwargs): left = pd.DataFrame({"x": [1, 2, 3]}, index=[10, 20, 30]) right = pd.DataFrame({"y": [10, 20, 30]}, index=[1, 2, 30]) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) gd_merge = gleft.merge(gright, **kwargs) pd_merge = left.merge(right, **kwargs) if pd_merge.empty: @@ -707,8 +701,8 @@ def test_merge_sort(ons, hows): left.index = [6, 5, 4, 7, 5, 5, 5, 4, 4] right.index = [5, 4, 1, 9, 4, 3, 5, 4, 4] - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) gd_merge = gleft.merge(gright, **kwargs) pd_merge = left.merge(right, **kwargs) @@ -753,8 +747,8 @@ def test_merge_sort_on_indexes(kwargs): left.index = [6, 5, 4, 7, 5, 5, 5, 4, 4] right.index = [5, 4, 1, 9, 4, 3, 5, 4, 4] - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) gd_merge = gleft.merge(gright, **kwargs) if left_index and right_index: @@ -777,8 +771,8 @@ def 
test_join_datetimes_index(dtype): datetimes = pd.Series(pd.date_range("20010101", "20010102", freq="12h")) pdf_lhs = pd.DataFrame(index=[1, 0, 1, 2, 0, 0, 1]) pdf_rhs = pd.DataFrame({"d": datetimes}) - gdf_lhs = DataFrame.from_pandas(pdf_lhs) - gdf_rhs = DataFrame.from_pandas(pdf_rhs) + gdf_lhs = cudf.from_pandas(pdf_lhs) + gdf_rhs = cudf.from_pandas(pdf_rhs) gdf_rhs["d"] = gdf_rhs["d"].astype(dtype) @@ -793,8 +787,8 @@ def test_join_datetimes_index(dtype): def test_join_with_different_names(): left = pd.DataFrame({"a": [0, 1, 2.0, 3, 4, 5, 9]}) right = pd.DataFrame({"b": [12, 5, 3, 9.0, 5], "c": [1, 2, 3, 4, 5.0]}) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, how="outer", left_on=["a"], right_on=["b"]) gd_merge = gleft.merge(gright, how="outer", left_on=["a"], right_on=["b"]) assert_eq(pd_merge, gd_merge.sort_values(by=["a"]).reset_index(drop=True)) @@ -803,8 +797,8 @@ def test_join_with_different_names(): def test_join_same_name_different_order(): left = pd.DataFrame({"a": [0, 0], "b": [1, 2]}) right = pd.DataFrame({"a": [1, 2], "b": [0, 0]}) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, left_on=["a", "b"], right_on=["b", "a"]) gd_merge = gleft.merge(gright, left_on=["a", "b"], right_on=["b", "a"]) assert_eq( @@ -815,8 +809,8 @@ def test_join_same_name_different_order(): def test_join_empty_table_dtype(): left = pd.DataFrame({"a": []}) right = pd.DataFrame({"b": [12, 5, 3, 9.0, 5], "c": [1, 2, 3, 4, 5.0]}) - gleft = DataFrame.from_pandas(left) - gright = DataFrame.from_pandas(right) + gleft = cudf.from_pandas(left) + gright = cudf.from_pandas(right) pd_merge = left.merge(right, how="left", left_on=["a"], right_on=["b"]) gd_merge = gleft.merge(gright, how="left", left_on=["a"], right_on=["b"]) 
assert_eq(pd_merge["a"].dtype, gd_merge["a"].dtype) @@ -917,7 +911,7 @@ def test_join_multi(how, column_a, column_b, column_c): ) def test_merge_multi(kwargs): - left = DataFrame( + left = cudf.DataFrame( { "a": [1, 2, 3, 4, 3, 5, 6], "b": [1, 3, 5, 7, 5, 9, 0], @@ -925,7 +919,7 @@ def test_merge_multi(kwargs): "d": ["v", "w", "x", "y", "z", "1", "2"], } ) - right = DataFrame( + right = cudf.DataFrame( { "a": [0, 9, 3, 4, 3, 7, 8], "b": [2, 4, 5, 7, 5, 6, 8], @@ -981,19 +975,19 @@ def test_merge_multi(kwargs): def test_typecast_on_join_int_to_int(dtype_l, dtype_r): other_data = ["a", "b", "c"] - join_data_l = Series([1, 2, 3], dtype=dtype_l) - join_data_r = Series([1, 2, 4], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3], dtype=dtype_l) + join_data_r = cudf.Series([1, 2, 4], dtype=dtype_r) - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) exp_join_data = [1, 2] exp_other_data = ["a", "b"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1011,11 +1005,11 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r): def test_typecast_on_join_float_to_float(dtype_l, dtype_r): other_data = ["a", "b", "c", "d", "e", "f"] - join_data_l = Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) - join_data_r = Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) + join_data_r = cudf.Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = 
cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) @@ -1026,9 +1020,9 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r): exp_join_data = [1, 2, 3, 0.9, 4.5] exp_other_data = ["a", "b", "c", "d", "e"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1052,19 +1046,19 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): other_data = ["a", "b", "c", "d", "e", "f"] - join_data_l = Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) - join_data_r = Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) + join_data_r = cudf.Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) exp_join_data = [1, 2, 3] exp_other_data = ["a", "b", "c"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1081,18 +1075,18 @@ def test_typecast_on_join_no_float_round(): other_data = ["a", "b", "c", "d", "e"] - join_data_l = Series([1, 2, 3, 4, 5], dtype="int8") - join_data_r = Series([1, 2, 3, 4.01, 4.99], dtype="float32") + join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") + join_data_r = cudf.Series([1, 2, 3, 4.01, 4.99], dtype="float32") - gdf_l = DataFrame({"join_col": join_data_l, "B": 
other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_join_data = [1, 2, 3, 4, 5] exp_Bx = ["a", "b", "c", "d", "e"] exp_By = ["a", "b", "c", None, None] - exp_join_col = Series(exp_join_data, dtype="float32") + exp_join_col = cudf.Series(exp_join_data, dtype="float32") - expect = DataFrame( + expect = cudf.DataFrame( {"join_col": exp_join_col, "B_x": exp_Bx, "B_y": exp_By} ) @@ -1145,23 +1139,23 @@ def test_typecast_on_join_overflow_unsafe(dtypes): ) def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r): other_data = ["a", "b", "c", "d", "e"] - join_data_l = Series( + join_data_l = cudf.Series( ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01", "2019-08-15"] ).astype(dtype_l) - join_data_r = Series( + join_data_r = cudf.Series( ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01", "2019-08-16"] ).astype(dtype_r) - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_dtype = max(np.dtype(dtype_l), np.dtype(dtype_r)) exp_join_data = ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01"] exp_other_data = ["a", "b", "c", "d"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1183,21 +1177,21 @@ def test_typecast_on_join_categorical(dtype_l, dtype_r): pytest.skip("Can't determine which categorical to use") other_data = ["a", "b", "c", "d", "e"] - join_data_l = Series([1, 2, 3, 4, 5], dtype=dtype_l) - join_data_r = Series([1, 2, 3, 4, 6], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype=dtype_l) + 
join_data_r = cudf.Series([1, 2, 3, 4, 6], dtype=dtype_r) if dtype_l == "category": exp_dtype = join_data_l.dtype.categories.dtype elif dtype_r == "category": exp_dtype = join_data_r.dtype.categories.dtype - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_join_data = [1, 2, 3, 4] exp_other_data = ["a", "b", "c", "d"] - exp_join_col = Series(exp_join_data, dtype=exp_dtype) + exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_col, "B_x": exp_other_data, @@ -1426,8 +1420,8 @@ def test_categorical_typecast_outer_one_cat(dtype): def test_index_join(lhs, rhs, how, level): l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) r_pdf = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4]}) - l_df = DataFrame.from_pandas(l_pdf) - r_df = DataFrame.from_pandas(r_pdf) + l_df = cudf.from_pandas(l_pdf) + r_df = cudf.from_pandas(r_pdf) p_lhs = l_pdf.set_index(lhs).index p_rhs = r_pdf.set_index(rhs).index g_lhs = l_df.set_index(lhs).index @@ -1454,8 +1448,8 @@ def test_index_join_corner_cases(): r_pdf = pd.DataFrame( {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} ) - l_df = DataFrame.from_pandas(l_pdf) - r_df = DataFrame.from_pandas(r_pdf) + l_df = cudf.from_pandas(l_pdf) + r_df = cudf.from_pandas(r_pdf) # Join when column name doesn't match with level lhs = ["a", "b"] @@ -1529,8 +1523,10 @@ def test_index_join_corner_cases(): def test_index_join_exception_cases(): - l_df = DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_df = DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}) + l_df = cudf.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) + r_df = cudf.DataFrame( + {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} + ) # Join between two 
MultiIndex lhs = ["a", "b"] @@ -1553,12 +1549,12 @@ def test_index_join_exception_cases(): def test_typecast_on_join_indexes(): - join_data_l = Series([1, 2, 3, 4, 5], dtype="int8") - join_data_r = Series([1, 2, 3, 4, 6], dtype="int32") + join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") + join_data_r = cudf.Series([1, 2, 3, 4, 6], dtype="int32") other_data = ["a", "b", "c", "d", "e"] - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) gdf_l = gdf_l.set_index("join_col") gdf_r = gdf_r.set_index("join_col") @@ -1566,7 +1562,7 @@ def test_typecast_on_join_indexes(): exp_join_data = [1, 2, 3, 4] exp_other_data = ["a", "b", "c", "d"] - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_data, "B_x": exp_other_data, @@ -1581,17 +1577,17 @@ def test_typecast_on_join_indexes(): def test_typecast_on_join_multiindices(): - join_data_l_0 = Series([1, 2, 3, 4, 5], dtype="int8") - join_data_l_1 = Series([2, 3, 4.1, 5.9, 6], dtype="float32") - join_data_l_2 = Series([7, 8, 9, 0, 1], dtype="float32") + join_data_l_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int8") + join_data_l_1 = cudf.Series([2, 3, 4.1, 5.9, 6], dtype="float32") + join_data_l_2 = cudf.Series([7, 8, 9, 0, 1], dtype="float32") - join_data_r_0 = Series([1, 2, 3, 4, 5], dtype="int32") - join_data_r_1 = Series([2, 3, 4, 5, 6], dtype="int32") - join_data_r_2 = Series([7, 8, 9, 0, 0], dtype="float64") + join_data_r_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int32") + join_data_r_1 = cudf.Series([2, 3, 4, 5, 6], dtype="int32") + join_data_r_2 = cudf.Series([7, 8, 9, 0, 0], dtype="float64") other_data = ["a", "b", "c", "d", "e"] - gdf_l = DataFrame( + gdf_l = cudf.DataFrame( { "join_col_0": join_data_l_0, "join_col_1": join_data_l_1, @@ -1599,7 +1595,7 @@ def 
test_typecast_on_join_multiindices(): "B": other_data, } ) - gdf_r = DataFrame( + gdf_r = cudf.DataFrame( { "join_col_0": join_data_r_0, "join_col_1": join_data_r_1, @@ -1611,12 +1607,12 @@ def test_typecast_on_join_multiindices(): gdf_l = gdf_l.set_index(["join_col_0", "join_col_1", "join_col_2"]) gdf_r = gdf_r.set_index(["join_col_0", "join_col_1", "join_col_2"]) - exp_join_data_0 = Series([1, 2], dtype="int32") - exp_join_data_1 = Series([2, 3], dtype="float64") - exp_join_data_2 = Series([7, 8], dtype="float64") - exp_other_data = Series(["a", "b"]) + exp_join_data_0 = cudf.Series([1, 2], dtype="int32") + exp_join_data_1 = cudf.Series([2, 3], dtype="float64") + exp_join_data_2 = cudf.Series([7, 8], dtype="float64") + exp_other_data = cudf.Series(["a", "b"]) - expect = DataFrame( + expect = cudf.DataFrame( { "join_col_0": exp_join_data_0, "join_col_1": exp_join_data_1, @@ -1632,12 +1628,12 @@ def test_typecast_on_join_multiindices(): def test_typecast_on_join_indexes_matching_categorical(): - join_data_l = Series(["a", "b", "c", "d", "e"], dtype="category") - join_data_r = Series(["a", "b", "c", "d", "e"], dtype="str") + join_data_l = cudf.Series(["a", "b", "c", "d", "e"], dtype="category") + join_data_r = cudf.Series(["a", "b", "c", "d", "e"], dtype="str") other_data = [1, 2, 3, 4, 5] - gdf_l = DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = DataFrame({"join_col": join_data_r, "B": other_data}) + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) gdf_l = gdf_l.set_index("join_col") gdf_r = gdf_r.set_index("join_col") @@ -1645,7 +1641,7 @@ def test_typecast_on_join_indexes_matching_categorical(): exp_join_data = ["a", "b", "c", "d", "e"] exp_other_data = [1, 2, 3, 4, 5] - expect = DataFrame( + expect = cudf.DataFrame( { "join_col": exp_join_data, "B_x": exp_other_data, @@ -1699,9 +1695,9 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): check_lhs 
= lhs.copy() check_rhs = rhs.copy() - if isinstance(lhs, Series): + if isinstance(lhs, cudf.Series): check_lhs = lhs.to_frame() - if isinstance(rhs, Series): + if isinstance(rhs, cudf.Series): check_rhs = rhs.to_frame() expect = check_lhs.merge(check_rhs, how=how, **kwargs) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index f4cdf619212..6d9bcda2c0b 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -5,87 +5,88 @@ import pytest import cudf -from cudf import Series from cudf.core._compat import PANDAS_GE_100 from cudf.tests.utils import assert_eq def test_can_cast_safely_same_kind(): # 'i' -> 'i' - data = Series([1, 2, 3], dtype="int32")._column + data = cudf.Series([1, 2, 3], dtype="int32")._column to_dtype = np.dtype("int64") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 3], dtype="int64")._column + data = cudf.Series([1, 2, 3], dtype="int64")._column to_dtype = np.dtype("int32") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 2 ** 31], dtype="int64")._column + data = cudf.Series([1, 2, 2 ** 31], dtype="int64")._column assert not data.can_cast_safely(to_dtype) # 'u' -> 'u' - data = Series([1, 2, 3], dtype="uint32")._column + data = cudf.Series([1, 2, 3], dtype="uint32")._column to_dtype = np.dtype("uint64") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 3], dtype="uint64")._column + data = cudf.Series([1, 2, 3], dtype="uint64")._column to_dtype = np.dtype("uint32") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 2 ** 33], dtype="uint64")._column + data = cudf.Series([1, 2, 2 ** 33], dtype="uint64")._column assert not data.can_cast_safely(to_dtype) # 'f' -> 'f' - data = Series([np.inf, 1.0], dtype="float64")._column + data = cudf.Series([np.inf, 1.0], dtype="float64")._column to_dtype = np.dtype("float32") assert data.can_cast_safely(to_dtype) - data = Series([np.finfo("float32").max * 2, 1.0], 
dtype="float64")._column + data = cudf.Series( + [np.finfo("float32").max * 2, 1.0], dtype="float64" + )._column to_dtype = np.dtype("float32") assert not data.can_cast_safely(to_dtype) def test_can_cast_safely_mixed_kind(): - data = Series([1, 2, 3], dtype="int32")._column + data = cudf.Series([1, 2, 3], dtype="int32")._column to_dtype = np.dtype("float32") assert data.can_cast_safely(to_dtype) # too big to fit into f32 exactly - data = Series([1, 2, 2 ** 24 + 1], dtype="int32")._column + data = cudf.Series([1, 2, 2 ** 24 + 1], dtype="int32")._column assert not data.can_cast_safely(to_dtype) - data = Series([1, 2, 3], dtype="uint32")._column + data = cudf.Series([1, 2, 3], dtype="uint32")._column to_dtype = np.dtype("float32") assert data.can_cast_safely(to_dtype) # too big to fit into f32 exactly - data = Series([1, 2, 2 ** 24 + 1], dtype="uint32")._column + data = cudf.Series([1, 2, 2 ** 24 + 1], dtype="uint32")._column assert not data.can_cast_safely(to_dtype) to_dtype = np.dtype("float64") assert data.can_cast_safely(to_dtype) - data = Series([1.0, 2.0, 3.0], dtype="float32")._column + data = cudf.Series([1.0, 2.0, 3.0], dtype="float32")._column to_dtype = np.dtype("int32") assert data.can_cast_safely(to_dtype) # not integer float - data = Series([1.0, 2.0, 3.5], dtype="float32")._column + data = cudf.Series([1.0, 2.0, 3.5], dtype="float32")._column assert not data.can_cast_safely(to_dtype) - data = Series([10.0, 11.0, 2000.0], dtype="float64")._column + data = cudf.Series([10.0, 11.0, 2000.0], dtype="float64")._column assert data.can_cast_safely(to_dtype) # float out of int range - data = Series([1.0, 2.0, 1.0 * (2 ** 31)], dtype="float32")._column + data = cudf.Series([1.0, 2.0, 1.0 * (2 ** 31)], dtype="float32")._column assert not data.can_cast_safely(to_dtype) # negative signed integers casting to unsigned integers - data = Series([-1, 0, 1], dtype="int32")._column + data = cudf.Series([-1, 0, 1], dtype="int32")._column to_dtype = np.dtype("uint32") assert 
not data.can_cast_safely(to_dtype) @@ -95,8 +96,8 @@ def test_can_cast_safely_mixed_kind(): reason="cuDF null <-> pd.NA compatibility not yet supported", ) def test_to_pandas_nullable_integer(): - gsr_not_null = Series([1, 2, 3]) - gsr_has_null = Series([1, 2, None]) + gsr_not_null = cudf.Series([1, 2, 3]) + gsr_has_null = cudf.Series([1, 2, None]) psr_not_null = pd.Series([1, 2, 3], dtype="int64") psr_has_null = pd.Series([1, 2, None], dtype="Int64") @@ -110,8 +111,8 @@ def test_to_pandas_nullable_integer(): reason="cuDF null <-> pd.NA compatibility not yet supported", ) def test_to_pandas_nullable_bool(): - gsr_not_null = Series([True, False, True]) - gsr_has_null = Series([True, False, None]) + gsr_not_null = cudf.Series([True, False, True]) + gsr_has_null = cudf.Series([True, False, None]) psr_not_null = pd.Series([True, False, True], dtype="bool") psr_has_null = pd.Series([True, False, None], dtype="boolean") @@ -121,12 +122,12 @@ def test_to_pandas_nullable_bool(): def test_can_cast_safely_has_nulls(): - data = Series([1, 2, 3, None], dtype="float32")._column + data = cudf.Series([1, 2, 3, None], dtype="float32")._column to_dtype = np.dtype("int64") assert data.can_cast_safely(to_dtype) - data = Series([1, 2, 3.1, None], dtype="float32")._column + data = cudf.Series([1, 2, 3.1, None], dtype="float32")._column assert not data.can_cast_safely(to_dtype) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index a8196c596f0..b030924779d 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -8,7 +8,6 @@ import cudf from cudf import melt as cudf_melt -from cudf.core import DataFrame from cudf.core._compat import PANDAS_GE_120 from cudf.tests.utils import ( ALL_TYPES, @@ -56,7 +55,7 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): pdf[colname] = data value_vars.append(colname) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) got = 
cudf_melt(frame=gdf, id_vars=id_vars, value_vars=value_vars) got_from_melt_method = gdf.melt(id_vars=id_vars, value_vars=value_vars) @@ -101,7 +100,7 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): data[idx] = np.nan pdf[colname] = data - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) got = gdf.stack() @@ -137,7 +136,7 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): data[idx] = np.nan pdf[colname] = data - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) if dtype == "category": with pytest.raises(ValueError): @@ -176,7 +175,7 @@ def test_tile(nulls, num_cols, num_rows, dtype, count): data[idx] = np.nan pdf[colname] = data - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) got = gdf.tile(count) expect = pd.DataFrame(pd.concat([pdf] * count)) @@ -356,7 +355,7 @@ def test_series_merge_sorted(nparts, key, na_position, ascending): ) def test_pivot_simple(index, column, data): pdf = pd.DataFrame({"index": index, "column": column, "data": data}) - gdf = cudf.DataFrame.from_pandas(pdf) + gdf = cudf.from_pandas(pdf) expect = pdf.pivot("index", "column") got = gdf.pivot("index", "column") diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index e30194e9eda..b90aebc33dc 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -1,5 +1,6 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
+import string from itertools import product import numpy as np @@ -225,14 +226,12 @@ def test_dataframe_multi_column( num_cols, num_rows, dtype, ascending, na_position ): - from string import ascii_lowercase - np.random.seed(0) - by = list(ascii_lowercase[:num_cols]) + by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(5): - colname = ascii_lowercase[i] + colname = string.ascii_lowercase[i] data = np.random.randint(0, 26, num_rows).astype(dtype) pdf[colname] = data @@ -256,14 +255,12 @@ def test_dataframe_multi_column_nulls( num_cols, num_rows, dtype, nulls, ascending, na_position ): - from string import ascii_lowercase - np.random.seed(0) - by = list(ascii_lowercase[:num_cols]) + by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(3): - colname = ascii_lowercase[i] + colname = string.ascii_lowercase[i] data = np.random.randint(0, 26, num_rows).astype(dtype) if nulls == "some": idx = np.array([], dtype="int64") diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 1512c87d160..1eae8ddbf1e 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,13 +1,13 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import re +from concurrent.futures import ThreadPoolExecutor import numpy as np import pandas as pd import pytest import cudf -from cudf.core import Series from cudf.datasets import randomdata from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -32,7 +32,7 @@ def test_series_reductions(method, dtype, skipna): arr = arr.astype(dtype) if dtype in (np.float32, np.float64): arr[[2, 5, 14, 19, 50, 70]] = np.nan - sr = Series.from_masked_array(arr, Series(mask).as_mask()) + sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) psr = sr.to_pandas() psr[~mask] = np.nan @@ -44,18 +44,16 @@ def call_test(sr, skipna): return fn(skipna=skipna) expect, got = call_test(psr, skipna=skipna), call_test(sr, skipna=skipna) - print(expect, got) + np.testing.assert_approx_equal(expect, got) @pytest.mark.parametrize("method", methods) def test_series_reductions_concurrency(method): - from concurrent.futures import ThreadPoolExecutor - e = ThreadPoolExecutor(10) np.random.seed(0) - srs = [Series(np.random.random(10000)) for _ in range(1)] + srs = [cudf.Series(np.random.random(10000)) for _ in range(1)] def call_test(sr): fn = getattr(sr, method) @@ -74,7 +72,7 @@ def f(sr): def test_series_std(ddof): np.random.seed(0) arr = np.random.random(100) - 0.5 - sr = Series(arr) + sr = cudf.Series(arr) pd = sr.to_pandas() got = sr.std(ddof=ddof) expect = pd.std(ddof=ddof) @@ -85,7 +83,7 @@ def test_series_unique(): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = Series.from_masked_array(arr, Series(mask).as_mask()) + sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) assert set(arr[mask]) == set(sr.unique().to_array()) assert len(set(arr[mask])) == sr.nunique() @@ -97,13 +95,13 @@ def test_series_unique(): def test_series_nunique(nan_as_null, dropna): # We remove nulls as opposed to NaNs using the dropna parameter, # so to test against pandas we replace NaN with another discrete 
value - cudf_series = Series([1, 2, 2, 3, 3], nan_as_null=nan_as_null) + cudf_series = cudf.Series([1, 2, 2, 3, 3], nan_as_null=nan_as_null) pd_series = pd.Series([1, 2, 2, 3, 3]) expect = pd_series.nunique(dropna=dropna) got = cudf_series.nunique(dropna=dropna) assert expect == got - cudf_series = Series( + cudf_series = cudf.Series( [1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null ) if nan_as_null is True: @@ -115,7 +113,7 @@ def test_series_nunique(nan_as_null, dropna): got = cudf_series.nunique(dropna=dropna) assert expect == got - cudf_series = Series([1.0, np.nan, np.nan], nan_as_null=nan_as_null) + cudf_series = cudf.Series([1.0, np.nan, np.nan], nan_as_null=nan_as_null) if nan_as_null is True: pd_series = pd.Series([1.0, np.nan, np.nan]) else: @@ -127,7 +125,7 @@ def test_series_nunique(nan_as_null, dropna): def test_series_scale(): arr = pd.Series(np.random.randint(low=-10, high=10, size=100)) - sr = Series(arr) + sr = cudf.Series(arr) vmin = arr.min() vmax = arr.max() @@ -143,7 +141,7 @@ def test_exact_quantiles(int_method): quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] df = pd.DataFrame(arr) - gdf_series = Series(arr) + gdf_series = cudf.Series(arr) q1 = gdf_series.quantile( quant_values, interpolation=int_method, exact=True @@ -162,7 +160,7 @@ def test_exact_quantiles_int(int_method): quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] df = pd.DataFrame(arr) - gdf_series = Series(arr) + gdf_series = cudf.Series(arr) q1 = gdf_series.quantile( quant_values, interpolation=int_method, exact=True @@ -180,7 +178,7 @@ def test_approx_quantiles(): arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] - gdf_series = Series(arr) + gdf_series = cudf.Series(arr) pdf_series = pd.Series(arr) q1 = gdf_series.quantile(quant_values, exact=False) @@ -194,7 +192,7 @@ def test_approx_quantiles_int(): quant_values = [0.5] approx_results = [2] - gdf_series = Series(arr) + gdf_series = cudf.Series(arr) q1 = 
gdf_series.quantile(quant_values, exact=False) @@ -206,7 +204,7 @@ def test_approx_quantiles_int(): def test_misc_quantiles(data, q): pdf_series = cudf.utils.utils.create_pandas_series(data=data) - gdf_series = Series(data) + gdf_series = cudf.Series(data) expected = pdf_series.quantile(q) actual = gdf_series.quantile(q) @@ -216,17 +214,17 @@ def test_misc_quantiles(data, q): @pytest.mark.parametrize( "data", [ - Series(np.random.normal(-100, 100, 1000)), - Series(np.random.randint(-50, 50, 1000)), - Series(np.zeros(100)), - Series(np.repeat(np.nan, 100)), - Series(np.array([1.123, 2.343, np.nan, 0.0])), - Series( + cudf.Series(np.random.normal(-100, 100, 1000)), + cudf.Series(np.random.randint(-50, 50, 1000)), + cudf.Series(np.zeros(100)), + cudf.Series(np.repeat(np.nan, 100)), + cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), + cudf.Series( [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), - Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - Series([]), - Series([-3]), + cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), + cudf.Series([]), + cudf.Series([-3]), randomdata( nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} ), @@ -257,17 +255,17 @@ def test_kurtosis(data, null_flag): @pytest.mark.parametrize( "data", [ - Series(np.random.normal(-100, 100, 1000)), - Series(np.random.randint(-50, 50, 1000)), - Series(np.zeros(100)), - Series(np.repeat(np.nan, 100)), - Series(np.array([1.123, 2.343, np.nan, 0.0])), - Series( + cudf.Series(np.random.normal(-100, 100, 1000)), + cudf.Series(np.random.randint(-50, 50, 1000)), + cudf.Series(np.zeros(100)), + cudf.Series(np.repeat(np.nan, 100)), + cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), + cudf.Series( [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), - Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - Series([]), - Series([-3]), + cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), + 
cudf.Series([]), + cudf.Series([-3]), randomdata( nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} ), @@ -300,13 +298,13 @@ def test_series_median(dtype, num_na): mask = np.arange(100) >= num_na arr = arr.astype(dtype) - sr = Series.from_masked_array(arr, Series(mask).as_mask()) + sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) arr2 = arr[mask] ps = pd.Series(arr2, dtype=dtype) actual = sr.median(skipna=True) desired = ps.median(skipna=True) - print(actual, desired) + np.testing.assert_approx_equal(actual, desired) # only for float until integer null supported convert to pandas in cudf @@ -326,10 +324,10 @@ def test_series_median(dtype, num_na): np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), - Series([5, 10, 53, None, np.nan, None], nan_as_null=False), - Series([1.1, 2.32, 43.4], index=[0, 4, 3]), - Series([]), - Series([-3]), + cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), + cudf.Series([1.1, 2.32, 43.4], index=[0, 4, 3]), + cudf.Series([]), + cudf.Series([-3]), ], ) @pytest.mark.parametrize( @@ -340,13 +338,13 @@ def test_series_median(dtype, num_na): np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), - Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), - Series([5]), + cudf.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), + cudf.Series([5]), ], ) def test_cov1d(data1, data2): - gs1 = Series(data1) - gs2 = Series(data2) + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) ps1 = gs1.to_pandas() ps2 = gs2.to_pandas() @@ -364,10 +362,10 @@ def test_cov1d(data1, data2): np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), - Series([5, 10, 53, None, np.nan, None], nan_as_null=False), - Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), - Series([]), - Series([-3]), + cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), + cudf.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), + cudf.Series([]), + cudf.Series([-3]), ], ) 
@pytest.mark.parametrize( @@ -378,13 +376,13 @@ def test_cov1d(data1, data2): np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), - Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), - Series([5]), + cudf.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), + cudf.Series([5]), ], ) def test_corr1d(data1, data2): - gs1 = Series(data1) - gs2 = Series(data2) + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) ps1 = gs1.to_pandas() ps2 = gs2.to_pandas() @@ -436,13 +434,13 @@ def test_df_corr(): @pytest.mark.parametrize("skipna", [True, False, None]) def test_nans_stats(data, ops, skipna): psr = cudf.utils.utils.create_pandas_series(data=data) - gsr = Series(data) + gsr = cudf.Series(data) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) psr = cudf.utils.utils.create_pandas_series(data=data) - gsr = Series(data, nan_as_null=False) + gsr = cudf.Series(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. 
So only # testing for `skipna=True` when `nan_as_null=False` @@ -462,7 +460,7 @@ def test_nans_stats(data, ops, skipna): @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 5, 10]) def test_min_count_ops(data, ops, skipna, min_count): psr = pd.Series(data) - gsr = Series(data) + gsr = cudf.Series(data) assert_eq( getattr(psr, ops)(skipna=skipna, min_count=min_count), @@ -473,8 +471,8 @@ def test_min_count_ops(data, ops, skipna, min_count): @pytest.mark.parametrize( "gsr", [ - Series([1, 2, 3, 4], dtype="datetime64[ns]"), - Series([1, 2, 3, 4], dtype="timedelta64[ns]"), + cudf.Series([1, 2, 3, 4], dtype="datetime64[ns]"), + cudf.Series([1, 2, 3, 4], dtype="timedelta64[ns]"), ], ) def test_cov_corr_invalid_dtypes(gsr): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 13501d97405..a015f3387b4 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import re +import urllib.parse from contextlib import ExitStack as does_not_raise from sys import getsizeof @@ -12,7 +13,6 @@ import cudf from cudf import concat -from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index @@ -57,7 +57,7 @@ def index(request): @pytest.fixture def ps_gs(data, index): ps = pd.Series(data, index=index, dtype="str", name="nice name") - gs = Series(data, index=index, dtype="str", name="nice name") + gs = cudf.Series(data, index=index, dtype="str", name="nice name") return (ps, gs) @@ -65,7 +65,7 @@ def ps_gs(data, index): def test_string_ingest(construct): expect = ["a", "a", "b", "c", "a"] data = construct(expect) - got = Series(data) + got = cudf.Series(data) assert got.dtype == np.dtype("object") assert len(got) == 5 for idx, val in enumerate(expect): @@ -106,7 +106,7 @@ def test_string_get_item(ps_gs, item): ps, gs = ps_gs got = gs.iloc[item] - if isinstance(got, Series): + if isinstance(got, cudf.Series): got = got.to_arrow() if isinstance(item, cupy.ndarray): @@ -140,7 +140,7 @@ def test_string_bool_mask(ps_gs, item): ps, gs = ps_gs got = gs.iloc[item] - if isinstance(got, Series): + if isinstance(got, cudf.Series): got = got.to_arrow() if isinstance(item, cupy.ndarray): @@ -196,7 +196,7 @@ def test_string_astype(dtype): elif dtype == "str" or dtype == "object": data = ["ab", "cd", "ef", "gh", "ij"] ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) # Pandas str --> bool typecasting always returns True if there's a string if dtype.startswith("bool"): @@ -214,7 +214,7 @@ def test_string_astype(dtype): def test_string_empty_astype(dtype): data = [] ps = pd.Series(data, dtype="str") - gs = Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") expect = ps.astype(dtype) got = gs.astype(dtype) @@ -245,7 +245,7 @@ def test_string_numeric_astype(dtype): if not dtype.startswith("datetime64"): 
ps = pd.Series(data, dtype=dtype) - gs = Series(data, dtype=dtype) + gs = cudf.Series(data, dtype=dtype) expect = pd.Series(ps.astype("str")) got = gs.astype("str") @@ -261,7 +261,7 @@ def test_string_empty_numeric_astype(dtype): ps = pd.Series(data, dtype="datetime64[ns]") else: ps = pd.Series(data, dtype=dtype) - gs = Series(data, dtype=dtype) + gs = cudf.Series(data, dtype=dtype) expect = ps.astype("str") got = gs.astype("str") @@ -276,8 +276,8 @@ def test_string_concat(): ps1 = pd.Series(data1, index=index) ps2 = pd.Series(data2, index=index) - gs1 = Series(data1, index=index) - gs2 = Series(data2, index=index) + gs1 = cudf.Series(data1, index=index) + gs2 = cudf.Series(data2, index=index) expect = pd.concat([ps1, ps2]) got = concat([gs1, gs2]) @@ -855,7 +855,7 @@ def test_string_upper(ps_gs): @pytest.mark.parametrize("expand", [True, False, None]) def test_string_split(data, pat, n, expand): ps = pd.Series(data, dtype="str") - gs = Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") expect = ps.str.split(pat=pat, n=n, expand=expand) got = gs.str.split(pat=pat, n=n, expand=expand) @@ -877,10 +877,10 @@ def test_string_join_key(str_data, str_data_raise, num_keys, how): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() for i in range(num_keys): pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data @@ -920,18 +920,18 @@ def test_string_join_key_nulls(str_data_nulls): other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() pdf["key"] = pd.Series(str_data, dtype="str") - gdf["key"] = Series(str_data, dtype="str") + gdf["key"] = cudf.Series(str_data, dtype="str") pdf["vals"] = other_data gdf["vals"] = other_data pdf2 = pd.DataFrame() - gdf2 = DataFrame() + gdf2 = cudf.DataFrame() pdf2["key"] = 
pd.Series(str_data_nulls, dtype="str") - gdf2["key"] = Series(str_data_nulls, dtype="str") + gdf2["key"] = cudf.Series(str_data_nulls, dtype="str") pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["vals"] = Series(other_data_nulls, dtype="int64") + gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64") expect = pdf.merge(pdf2, on="key", how="left") got = gdf.merge(gdf2, on="key", how="left") @@ -955,10 +955,10 @@ def test_string_join_non_key(str_data, num_cols, how): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() for i in range(num_cols): pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data @@ -993,18 +993,18 @@ def test_string_join_non_key_nulls(str_data_nulls): other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() pdf["vals"] = pd.Series(str_data, dtype="str") - gdf["vals"] = Series(str_data, dtype="str") + gdf["vals"] = cudf.Series(str_data, dtype="str") pdf["key"] = other_data gdf["key"] = other_data pdf2 = pd.DataFrame() - gdf2 = DataFrame() + gdf2 = cudf.DataFrame() pdf2["vals"] = pd.Series(str_data_nulls, dtype="str") - gdf2["vals"] = Series(str_data_nulls, dtype="str") + gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str") pdf2["key"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["key"] = Series(other_data_nulls, dtype="int64") + gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64") expect = pdf.merge(pdf2, on="key", how="left") got = gdf.merge(gdf2, on="key", how="left") @@ -1044,8 +1044,8 @@ def test_string_join_values_nulls(): left_pdf = pd.DataFrame(left_dict) right_pdf = pd.DataFrame(right_dict) - left_gdf = DataFrame.from_pandas(left_pdf) - right_gdf = DataFrame.from_pandas(right_pdf) + left_gdf = cudf.DataFrame.from_pandas(left_pdf) + right_gdf = 
cudf.DataFrame.from_pandas(right_pdf) expect = left_pdf.merge(right_pdf, how="left", on="b") got = left_gdf.merge(right_gdf, how="left", on="b") @@ -1064,10 +1064,10 @@ def test_string_groupby_key(str_data, num_keys): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() for i in range(num_keys): pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data @@ -1089,10 +1089,10 @@ def test_string_groupby_non_key(str_data, num_cols, agg): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() for i in range(num_cols): pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data @@ -1114,9 +1114,9 @@ def test_string_groupby_key_index(): other_data = [1, 2, 3, 4, 5] pdf = pd.DataFrame() - gdf = DataFrame() + gdf = cudf.DataFrame() pdf["a"] = pd.Series(str_data, dtype="str") - gdf["a"] = Series(str_data, dtype="str") + gdf["a"] = cudf.Series(str_data, dtype="str") pdf["b"] = other_data gdf["b"] = other_data @@ -1130,7 +1130,7 @@ def test_string_groupby_key_index(): def test_string_set_scalar(scalar): pdf = pd.DataFrame() pdf["a"] = [1, 2, 3, 4, 5] - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) pdf["b"] = "a" gdf["b"] = "a" @@ -1140,10 +1140,8 @@ def test_string_set_scalar(scalar): def test_string_index(): - from cudf.core.column import as_column - pdf = pd.DataFrame(np.random.rand(5, 5)) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex gdf.index = stringIndex @@ -1156,7 +1154,9 @@ def test_string_index(): pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = 
as_index(as_column(["a", "b", "c", "d", "e"]), name="name") + stringIndex = cudf.Index( + cudf.core.column.as_column(["a", "b", "c", "d", "e"]), name="name" + ) pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) @@ -1174,7 +1174,7 @@ def test_string_index(): ) def test_string_unique(item): ps = pd.Series(item) - gs = Series(item) + gs = cudf.Series(item) # Pandas `unique` returns a numpy array pres = pd.Series(ps.unique()) # cudf returns sorted unique with `None` placed before other strings @@ -1184,12 +1184,12 @@ def test_string_unique(item): def test_string_slice(): - df = DataFrame({"a": ["hello", "world"]}) + df = cudf.DataFrame({"a": ["hello", "world"]}) pdf = pd.DataFrame({"a": ["hello", "world"]}) a_slice_got = df.a.str.slice(0, 2) a_slice_expected = pdf.a.str.slice(0, 2) - assert isinstance(a_slice_got, Series) + assert isinstance(a_slice_got, cudf.Series) assert_eq(a_slice_expected, a_slice_got) @@ -1199,8 +1199,8 @@ def test_string_equality(): ps1 = pd.Series(data1) ps2 = pd.Series(data2) - gs1 = Series(data1) - gs2 = Series(data2) + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) expect = ps1 == ps2 got = gs1 == gs2 @@ -1213,7 +1213,7 @@ def test_string_equality(): assert_eq(expect, got.fillna(False)) ps1 = pd.Series(["a"]) - gs1 = Series(["a"]) + gs1 = cudf.Series(["a"]) expect = ps1 == "m" got = gs1 == "m" @@ -1237,7 +1237,7 @@ def test_string_equality(): ) def test_string_binary_op_add(lhs, rhs): pds = pd.Series(lhs) + pd.Series(rhs) - gds = Series(lhs) + Series(rhs) + gds = cudf.Series(lhs) + cudf.Series(rhs) assert_eq(pds, gds) @@ -1282,7 +1282,7 @@ def test_string_no_children_properties(): ) def test_string_get(string, index): pds = pd.Series(string) - gds = Series(string) + gds = cudf.Series(string) assert_eq( pds.str.get(index).fillna(""), gds.str.get(index).fillna(""), @@ -1305,7 +1305,7 @@ def test_string_get(string, index): ) def test_string_slice_str(string, number, diff): pds = pd.Series(string) - gds = 
Series(string) + gds = cudf.Series(string) assert_eq(pds.str.slice(start=number), gds.str.slice(start=number)) assert_eq(pds.str.slice(stop=number), gds.str.slice(stop=number)) @@ -1323,11 +1323,11 @@ def test_string_slice_str(string, number, diff): def test_string_slice_from(): - gs = Series(["hello world", "holy accéntéd", "batman", None, ""]) - d_starts = Series([2, 3, 0, -1, -1], dtype=np.int32) - d_stops = Series([-1, -1, 0, -1, -1], dtype=np.int32) + gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) + d_starts = cudf.Series([2, 3, 0, -1, -1], dtype=np.int32) + d_stops = cudf.Series([-1, -1, 0, -1, -1], dtype=np.int32) got = gs.str.slice_from(starts=d_starts._column, stops=d_stops._column) - expected = Series(["llo world", "y accéntéd", "", None, ""]) + expected = cudf.Series(["llo world", "y accéntéd", "", None, ""]) assert_eq(got, expected) @@ -1344,7 +1344,7 @@ def test_string_slice_from(): @pytest.mark.parametrize("repr", ["2", "!!"]) def test_string_slice_replace(string, number, diff, repr): pds = pd.Series(string) - gds = Series(string) + gds = cudf.Series(string) assert_eq( pds.str.slice_replace(start=number, repl=repr), @@ -1368,7 +1368,7 @@ def test_string_slice_replace(string, number, diff, repr): def test_string_insert(): - gs = Series(["hello world", "holy accéntéd", "batman", None, ""]) + gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) ps = pd.Series(["hello world", "holy accéntéd", "batman", None, ""]) @@ -1422,7 +1422,7 @@ def test_string_insert(): ) @pytest.mark.parametrize("data", _string_char_types_data) def test_string_char_types(type_op, data): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(getattr(gs.str, type_op)(), getattr(ps.str, type_op)()) @@ -1438,8 +1438,8 @@ def test_string_filter_alphanum(): rs = rs + c expected.append(rs) - gs = Series(data) - assert_eq(gs.str.filter_alphanum(), Series(expected)) + gs = cudf.Series(data) + assert_eq(gs.str.filter_alphanum(), 
cudf.Series(expected)) expected = [] for st in data: @@ -1448,7 +1448,7 @@ def test_string_filter_alphanum(): if not str.isalnum(c): rs = rs + c expected.append(rs) - assert_eq(gs.str.filter_alphanum(keep=False), Series(expected)) + assert_eq(gs.str.filter_alphanum(keep=False), cudf.Series(expected)) expected = [] for st in data: @@ -1459,7 +1459,7 @@ def test_string_filter_alphanum(): else: rs = rs + "*" expected.append(rs) - assert_eq(gs.str.filter_alphanum("*"), Series(expected)) + assert_eq(gs.str.filter_alphanum("*"), cudf.Series(expected)) expected = [] for st in data: @@ -1470,7 +1470,7 @@ def test_string_filter_alphanum(): else: rs = rs + "*" expected.append(rs) - assert_eq(gs.str.filter_alphanum("*", keep=False), Series(expected)) + assert_eq(gs.str.filter_alphanum("*", keep=False), cudf.Series(expected)) @pytest.mark.parametrize( @@ -1489,7 +1489,7 @@ def test_string_filter_alphanum(): ], ) def test_string_char_case(case_op, data): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) s = gs.str @@ -1519,7 +1519,7 @@ def test_string_char_case(case_op, data): ], ) def test_strings_rpartition(data): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(ps.str.rpartition(), gs.str.rpartition()) @@ -1538,7 +1538,7 @@ def test_strings_rpartition(data): ], ) def test_strings_partition(data): - gs = Series(data, name="str_name") + gs = cudf.Series(data, name="str_name") ps = pd.Series(data, name="str_name") assert_eq(ps.str.partition(), gs.str.partition()) @@ -1570,7 +1570,7 @@ def test_strings_partition(data): @pytest.mark.parametrize("n", [-1, 2, 1, 9]) @pytest.mark.parametrize("expand", [True, False, None]) def test_strings_rsplit(data, n, expand): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1606,7 +1606,7 @@ def test_strings_rsplit(data, n, expand): @pytest.mark.parametrize("n", [-1, 2, 1, 9]) @pytest.mark.parametrize("expand", [True, False, None]) def test_strings_split(data, n, expand): 
- gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1645,7 +1645,7 @@ def test_strings_split(data, n, expand): "to_strip", ["⅕", None, "123.", ".!? \n\t", "123.!? \n\t", " ", ".", ","] ) def test_strings_strip_tests(data, to_strip): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(ps.str.strip(to_strip=to_strip), gs.str.strip(to_strip=to_strip)) @@ -1687,7 +1687,7 @@ def test_strings_strip_tests(data, to_strip): @pytest.mark.parametrize("width", [0, 1, 4, 9, 100]) @pytest.mark.parametrize("fillchar", ["⅕", "1", ".", "t", " ", ","]) def test_strings_filling_tests(data, width, fillchar): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1733,7 +1733,7 @@ def test_strings_filling_tests(data, width, fillchar): ) @pytest.mark.parametrize("width", [0, 1, 4, 6, 9, 100]) def test_strings_zfill_tests(data, width): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(ps.str.zfill(width=width), gs.str.zfill(width=width)) @@ -1761,7 +1761,7 @@ def test_strings_zfill_tests(data, width): ) @pytest.mark.parametrize("fillchar", [" ", ".", "\n", "+", "\t"]) def test_strings_pad_tests(data, width, side, fillchar): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1791,7 +1791,7 @@ def test_strings_pad_tests(data, width, side, fillchar): ) @pytest.mark.parametrize("width", [1, 4, 8, 12, 100]) def test_string_wrap(data, width): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq( @@ -1835,7 +1835,7 @@ def test_string_wrap(data, width): ) @pytest.mark.parametrize("pat", ["a", " ", "\t", "another", "0", r"\$"]) def test_string_count(data, pat): - gs = Series(data) + gs = cudf.Series(data) ps = pd.Series(data) assert_eq(gs.str.count(pat=pat), ps.str.count(pat=pat), check_dtype=False) @@ -1844,7 +1844,7 @@ def test_string_count(data, pat): def test_string_findall(): ps = pd.Series(["Lion", "Monkey", "Rabbit"]) - gs = 
Series(["Lion", "Monkey", "Rabbit"]) + gs = cudf.Series(["Lion", "Monkey", "Rabbit"]) assert_eq(ps.str.findall("Monkey")[1][0], gs.str.findall("Monkey")[0][1]) assert_eq(ps.str.findall("on")[0][0], gs.str.findall("on")[0][0]) @@ -1855,21 +1855,21 @@ def test_string_findall(): def test_string_replace_multi(): ps = pd.Series(["hello", "goodbye"]) - gs = Series(["hello", "goodbye"]) + gs = cudf.Series(["hello", "goodbye"]) expect = ps.str.replace("e", "E").str.replace("o", "O") got = gs.str.replace(["e", "o"], ["E", "O"]) assert_eq(expect, got) ps = pd.Series(["foo", "fuz", np.nan]) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) expect = ps.str.replace("f.", "ba", regex=True) got = gs.str.replace(["f."], ["ba"], regex=True) assert_eq(expect, got) ps = pd.Series(["f.o", "fuz", np.nan]) - gs = Series.from_pandas(ps) + gs = cudf.Series.from_pandas(ps) expect = ps.str.replace("f.", "ba", regex=False) got = gs.str.replace(["f."], ["ba"], regex=False) @@ -1905,7 +1905,7 @@ def test_string_replace_with_backrefs(find, replace): "tést-string-again", ] ps = pd.Series(s) - gs = Series(s) + gs = cudf.Series(s) got = gs.str.replace_with_backrefs(find, replace) expected = ps.str.replace(find, replace, regex=True) assert_eq(got, expected) @@ -1918,7 +1918,7 @@ def test_string_replace_with_backrefs(find, replace): def test_string_table_view_creation(): data = ["hi"] * 25 + [None] * 2027 psr = pd.Series(data) - gsr = Series.from_pandas(psr) + gsr = cudf.Series.from_pandas(psr) expect = psr[:1] got = gsr[:1] @@ -1944,7 +1944,7 @@ def test_string_table_view_creation(): ) def test_string_starts_ends(data, pat): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) assert_eq( ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False @@ -1981,7 +1981,7 @@ def test_string_starts_ends(data, pat): ], ) def test_string_starts_ends_list_like_pat(data, pat): - gs = Series(data) + gs = cudf.Series(data) starts_expected = [] ends_expected = [] @@ -2020,7 
+2020,7 @@ def test_string_starts_ends_list_like_pat(data, pat): ) def test_string_find(data, sub): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) got = gs.str.find(sub) expect = ps.str.find(sub) @@ -2090,7 +2090,7 @@ def test_string_find(data, sub): ) def test_string_str_index(data, sub, er): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) if er is None: assert_eq(ps.str.index(sub), gs.str.index(sub), check_dtype=False) @@ -2129,7 +2129,7 @@ def test_string_str_index(data, sub, er): ) def test_string_str_rindex(data, sub, er): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) if er is None: assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) @@ -2186,10 +2186,10 @@ def test_string_str_rindex(data, sub, er): ], ) def test_string_contains_multi(data, sub, expect): - gs = Series(data) - sub = Series(sub) + gs = cudf.Series(data) + sub = cudf.Series(sub) got = gs.str.contains(sub) - expect = Series(expect) + expect = cudf.Series(expect) assert_eq(expect, got, check_dtype=False) @@ -2209,7 +2209,7 @@ def test_string_contains_multi(data, sub, expect): @pytest.mark.parametrize("pat", ["", " ", "a", "abc", "cat", "$", "\n"]) def test_string_str_match(data, pat): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) assert_eq(ps.str.match(pat), gs.str.match(pat)) assert_eq( @@ -2232,7 +2232,7 @@ def test_string_str_match(data, pat): ) def test_string_str_translate(data): ps = pd.Series(data) - gs = Series(data) + gs = cudf.Series(data) assert_eq( ps.str.translate(str.maketrans({"a": "z"})), @@ -2287,15 +2287,17 @@ def test_string_str_filter_characters(): "$1.50", "", ] - gs = Series(data) - expected = Series(["helloworld", "ABCD", "", "accnt", None, "150", ""]) + gs = cudf.Series(data) + expected = cudf.Series( + ["helloworld", "ABCD", "", "accnt", None, "150", ""] + ) filter = {"a": "z", "A": "Z", "0": "9"} assert_eq(expected, gs.str.filter_characters(filter)) - expected = Series([" ", 
"+++", "?!@#$%^&*()", "é", None, "$.", ""]) + expected = cudf.Series([" ", "+++", "?!@#$%^&*()", "é", None, "$.", ""]) assert_eq(expected, gs.str.filter_characters(filter, False)) - expected = Series( + expected = cudf.Series( ["hello world", "A B C D", " ", "acc nt", None, " 1 50", ""] ) assert_eq(expected, gs.str.filter_characters(filter, True, " ")) @@ -2314,7 +2316,7 @@ def test_string_str_code_points(): " 1234 ", "XYZ", ] - gs = Series(data) + gs = cudf.Series(data) expected = [ 97, 98, @@ -2354,7 +2356,7 @@ def test_string_str_code_points(): 89, 90, ] - expected = Series(expected) + expected = cudf.Series(expected) assert_eq(expected, gs.str.code_points(), check_dtype=False) @@ -2369,9 +2371,7 @@ def test_string_str_code_points(): ], ) def test_string_str_url_encode(data): - import urllib.parse - - gs = Series(data) + gs = cudf.Series(data) got = gs.str.url_encode() expected = pd.Series([urllib.parse.quote(url, safe="~") for url in data]) @@ -2389,9 +2389,7 @@ def test_string_str_url_encode(data): ], ) def test_string_str_decode_url(data): - import urllib.parse - - gs = Series(data) + gs = cudf.Series(data) got = gs.str.url_decode() expected = pd.Series([urllib.parse.unquote(url) for url in data]) @@ -2413,7 +2411,7 @@ def test_string_str_decode_url(data): @pytest.mark.parametrize("obj_type", [None, "str", "category"]) def test_string_typecast(data, obj_type, dtype): psr = pd.Series(data, dtype=obj_type) - gsr = Series(data, dtype=obj_type) + gsr = cudf.Series(data, dtype=obj_type) expect = psr.astype(dtype=dtype) actual = gsr.astype(dtype=dtype) @@ -2452,7 +2450,7 @@ def test_string_typecast(data, obj_type, dtype): @pytest.mark.parametrize("obj_type", [None, "str", "category"]) def test_string_typecast_error(data, obj_type, dtype): psr = pd.Series(data, dtype=obj_type) - gsr = Series(data, dtype=obj_type) + gsr = cudf.Series(data, dtype=obj_type) assert_exceptions_equal( lfunc=psr.astype, @@ -2474,23 +2472,23 @@ def test_string_typecast_error(data, obj_type, 
dtype): ) def test_string_hex_to_int(data): - gsr = Series(data) + gsr = cudf.Series(data) got = gsr.str.htoi() - expected = Series([263988422296292, 0, 281474976710655]) + expected = cudf.Series([263988422296292, 0, 281474976710655]) assert_eq(expected, got) def test_string_ishex(): - gsr = Series(["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]) + gsr = cudf.Series(["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]) got = gsr.str.ishex() - expected = Series([False, None, True, True, True]) + expected = cudf.Series([False, None, True, True, True]) assert_eq(expected, got) def test_string_istimestamp(): - gsr = Series( + gsr = cudf.Series( [ "", None, @@ -2508,7 +2506,7 @@ def test_string_istimestamp(): ] ) got = gsr.str.istimestamp(r"%Y%m%d %H%M%S.%f%p%z") - expected = Series( + expected = cudf.Series( [ False, None, @@ -2529,8 +2527,10 @@ def test_string_istimestamp(): def test_string_ip4_to_int(): - gsr = Series(["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"]) - expected = Series([0, None, 0, 698875905, 2130706433, 700776449]) + gsr = cudf.Series( + ["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"] + ) + expected = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) got = gsr.str.ip2int() @@ -2538,18 +2538,18 @@ def test_string_ip4_to_int(): def test_string_int_to_ipv4(): - gsr = Series([0, None, 0, 698875905, 2130706433, 700776449]) - expected = Series( + gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) + expected = cudf.Series( ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] ) - got = Series(gsr._column.int2ip()) + got = cudf.Series(gsr._column.int2ip()) assert_eq(expected, got) def test_string_isipv4(): - gsr = Series( + gsr = cudf.Series( [ "", None, @@ -2565,7 +2565,7 @@ def test_string_isipv4(): ] ) got = gsr.str.isipv4() - expected = Series( + expected = cudf.Series( [ False, None, @@ -2587,7 +2587,7 @@ def test_string_isipv4(): "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int64", 
"uint64"})) ) def test_string_int_to_ipv4_dtype_fail(dtype): - gsr = Series([1, 2, 3, 4, 5]).astype(dtype) + gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype) with pytest.raises(TypeError): gsr._column.int2ip() @@ -2618,7 +2618,7 @@ def test_string_int_to_ipv4_dtype_fail(dtype): ) def test_string_str_subscriptable(data, index): psr = pd.Series(data) - gsr = Series(data) + gsr = cudf.Series(data) assert_eq(psr.str[index], gsr.str[index]) @@ -2640,8 +2640,8 @@ def test_string_str_subscriptable(data, index): ], ) def test_string_str_byte_count(data, expected): - sr = Series(data) - expected = Series(expected, dtype="int32") + sr = cudf.Series(data) + expected = cudf.Series(expected, dtype="int32") actual = sr.str.byte_count() assert_eq(expected, actual) @@ -2689,8 +2689,8 @@ def test_string_str_byte_count(data, expected): ], ) def test_str_isinteger(data, expected): - sr = Series(data, dtype="str") - expected = Series(expected) + sr = cudf.Series(data, dtype="str") + expected = cudf.Series(expected) actual = sr.str.isinteger() assert_eq(expected, actual) @@ -2745,8 +2745,8 @@ def test_str_isinteger(data, expected): ], ) def test_str_isfloat(data, expected): - sr = Series(data, dtype="str") - expected = Series(expected) + sr = cudf.Series(data, dtype="str") + expected = cudf.Series(expected) actual = sr.str.isfloat() assert_eq(expected, actual) @@ -2776,7 +2776,7 @@ def test_str_isfloat(data, expected): ) def test_str_min(data): psr = pd.Series(data) - sr = Series(data) + sr = cudf.Series(data) assert_eq(psr.min(), sr.min()) @@ -2801,7 +2801,7 @@ def test_str_min(data): ) def test_str_max(data): psr = pd.Series(data) - sr = Series(data) + sr = cudf.Series(data) assert_eq(psr.max(), sr.max()) @@ -2826,13 +2826,13 @@ def test_str_max(data): ) def test_str_sum(data): psr = pd.Series(data) - sr = Series(data) + sr = cudf.Series(data) assert_eq(psr.sum(), sr.sum()) def test_str_mean(): - sr = Series(["a", "b", "c", "d", "e"]) + sr = cudf.Series(["a", "b", "c", "d", "e"]) 
with pytest.raises(TypeError): sr.mean() @@ -2840,7 +2840,7 @@ def test_str_mean(): def test_string_product(): psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) assert_exceptions_equal( lfunc=psr.product, @@ -2853,7 +2853,7 @@ def test_string_product(): def test_string_var(): psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) assert_exceptions_equal( lfunc=psr.var, rfunc=sr.var, compare_error_message=False @@ -2862,7 +2862,7 @@ def test_string_var(): def test_string_std(): psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) assert_exceptions_equal( lfunc=psr.std, rfunc=sr.std, compare_error_message=False From c3c3e6826be007668c68bb47a687e283d80ffa24 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 15:03:44 -0800 Subject: [PATCH 22/35] remove versioning of cudf api call --- python/cudf/cudf/testing/testing.py | 33 ++++++++++------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index ec1af0b7321..bacab24a6f3 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -615,28 +615,17 @@ def assert_frame_equal( right = right[list(left._data.names)] # index comparison - if PANDAS_GE_110: - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.index", - ) - else: - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.index", - ) + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + 
check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.index", + ) if PANDAS_GE_110: pd.testing.assert_index_equal( From 992b483ce33bd22cb18aec60a54253decc587707 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 24 Feb 2021 17:04:32 -0600 Subject: [PATCH 23/35] Update python/cudf/cudf/tests/test_setitem.py Co-authored-by: Keith Kraus --- python/cudf/cudf/tests/test_setitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 57661511f5b..fc885a13808 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -23,7 +23,7 @@ def test_dataframe_setitem_bool_mask_scaler(df, arg, value): @pytest.mark.xfail( condition=not PANDAS_GE_120, reason="pandas incorrectly adds nulls with dataframes " - "but works fine with scalers", + "but works fine with scalars", ) def test_dataframe_setitem_scaler_bool(): df = pd.DataFrame({"a": [1, 2, 3]}) From 355e1923280692544814a1da1e4f7427ed4143ff Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 15:26:18 -0800 Subject: [PATCH 24/35] remove double validation --- python/cudf/cudf/core/column/column.py | 6 ------ python/cudf/cudf/core/column/datetime.py | 6 ------ python/cudf/cudf/core/column/timedelta.py | 6 ------ 3 files changed, 18 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 28dd521b37c..e69459010be 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -854,12 +854,6 @@ def isin(self, values: Sequence) -> ColumnBase: TypeError If values is a string """ - if is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a [{type(values).__name__}]" - ) - lhs = self rhs = None diff --git a/python/cudf/cudf/core/column/datetime.py 
b/python/cudf/cudf/core/column/datetime.py index d32b3c2f8e2..638f339e757 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -337,12 +337,6 @@ def is_unique(self) -> bool: return self.as_numerical.is_unique def isin(self, values: Sequence) -> ColumnBase: - if cudf.utils.dtypes.is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a [{type(values).__name__}]" - ) - lhs = self rhs = None diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 82ce1f5f7a0..bd3e655bc79 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -369,12 +369,6 @@ def median(self, skipna: bool = None) -> pd.Timedelta: ) def isin(self, values: Sequence) -> ColumnBase: - if cudf.utils.dtypes.is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a [{type(values).__name__}]" - ) - lhs = self rhs = None From 8d06667fd70da9fe92742fdcc7e6c4bd1c576aec Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 24 Feb 2021 15:48:30 -0800 Subject: [PATCH 25/35] move datetime / duration isin logic to a common utility --- python/cudf/cudf/core/column/column.py | 4 --- python/cudf/cudf/core/column/datetime.py | 20 +---------- python/cudf/cudf/core/column/timedelta.py | 22 +----------- python/cudf/cudf/core/tools/datetimes.py | 43 ++++++++++++++++++++++- 4 files changed, 44 insertions(+), 45 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e69459010be..02cd7407802 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -849,10 +849,6 @@ def isin(self, values: Sequence) -> ColumnBase: ------- result: Column Column of booleans indicating if each element is in values. 
- Raises - ------- - TypeError - If values is a string """ lhs = self rhs = None diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 638f339e757..7c5385b9bbf 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -337,25 +337,7 @@ def is_unique(self) -> bool: return self.as_numerical.is_unique def isin(self, values: Sequence) -> ColumnBase: - lhs = self - rhs = None - - try: - rhs = cudf.core.column.as_column(values) - - if rhs.dtype.kind in {"f", "i", "u"}: - return cudf.core.column.full(len(self), False, dtype="bool") - rhs = rhs.astype(self.dtype) - res = lhs._isin_earlystop(rhs) - if res is not None: - return res - except ValueError: - # pandas functionally returns all False when cleansing via - # typecasting fails - return cudf.core.column.full(len(self), False, dtype="bool") - - res = lhs._obtain_isin_result(rhs) - return res + return cudf.core.tools.datetimes._isin_datetimelike(self, values) def can_cast_safely(self, to_dtype: Dtype) -> bool: if np.issubdtype(to_dtype, np.datetime64): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index bd3e655bc79..ac63192b692 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -369,27 +369,7 @@ def median(self, skipna: bool = None) -> pd.Timedelta: ) def isin(self, values: Sequence) -> ColumnBase: - lhs = self - rhs = None - - try: - rhs = cudf.core.column.as_column(values) - - if rhs.dtype.kind in {"f", "i", "u"}: - return cudf.core.column.full(len(self), False, dtype="bool") - - rhs = rhs.astype(self.dtype) - res = lhs._isin_earlystop(rhs) - if res is not None: - return res - except ValueError: - # pandas functionally returns all False when cleansing via - # typecasting fails - return cudf.core.column.full(len(self), False, dtype="bool") - - res = lhs._obtain_isin_result(rhs) - - return res + return 
cudf.core.tools.datetimes._isin_datetimelike(self, values) def quantile( self, q: Union[float, Sequence[float]], interpolation: str, exact: bool diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 206786fad42..4e5e4ce1987 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,6 +1,7 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. import warnings +from typing import Sequence, Union import numpy as np import pandas as pd @@ -497,3 +498,43 @@ def __setattr__(self, name, value): raise AttributeError("DateOffset objects are immutable.") else: object.__setattr__(self, name, value) + + +def _isin_datetimelike( + lhs: Union[column.TimeDeltaColumn, column.DatetimeColumn], values: Sequence +) -> column.ColumnBase: + """ + Check whether values are contained in the + DateTimeColumn or TimeDeltaColumn. + + Parameters + ---------- + lhs : TimeDeltaColumn or DatetimeColumn + Column to check whether the `values` exist in. + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a TypeError. Instead, turn a single string into a list + of one element. + + Returns + ------- + result: Column + Column of booleans indicating if each element is in values. 
+ """ + rhs = None + try: + rhs = cudf.core.column.as_column(values) + + if rhs.dtype.kind in {"f", "i", "u"}: + return cudf.core.column.full(len(lhs), False, dtype="bool") + rhs = rhs.astype(lhs.dtype) + res = lhs._isin_earlystop(rhs) + if res is not None: + return res + except ValueError: + # pandas functionally returns all False when cleansing via + # typecasting fails + return cudf.core.column.full(len(lhs), False, dtype="bool") + + res = lhs._obtain_isin_result(rhs) + return res From dd842f311b5f1fa030bfb7af70a381ea6c913165 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Feb 2021 09:18:42 -0800 Subject: [PATCH 26/35] add atol --- python/dask_cudf/dask_cudf/tests/test_core.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index aebdb9fe5b9..e2b77ba192e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -12,10 +12,10 @@ from dask.dataframe.core import make_meta, meta_nonempty from dask.utils import M -import dask_cudf as dgd - import cudf +import dask_cudf as dgd + def test_from_cudf(): np.random.seed(0) @@ -719,7 +719,9 @@ def test_dataframe_describe(): ddf = dgd.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.describe(), pddf.describe(), check_exact=False) + dd.assert_eq( + ddf.describe(), pddf.describe(), check_exact=False, atol=0.0001 + ) def test_index_map_partitions(): From 9fe44cd7ba9272e74c6880c2a39a2f416ab6fca1 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Feb 2021 09:20:33 -0800 Subject: [PATCH 27/35] rename internal api --- python/cudf/cudf/core/dataframe.py | 4 +-- python/cudf/cudf/tests/test_dataframe.py | 18 +++++++------- python/cudf/cudf/tests/test_dropna.py | 2 +- python/cudf/cudf/tests/test_duplicates.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_repr.py | 4 +-- 
python/cudf/cudf/tests/test_rolling.py | 4 +-- python/cudf/cudf/tests/test_series.py | 2 +- python/cudf/cudf/tests/test_stats.py | 6 ++--- python/cudf/cudf/utils/utils.py | 30 ++++++++++++++++++++++- 10 files changed, 51 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a22fdf65f9f..2afbad2d0e7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -584,7 +584,7 @@ def deserialize(cls, header, frames): @property def dtypes(self): """Return the dtypes in this object.""" - return cudf.utils.utils.create_pandas_series( + return cudf.utils.utils._create_pandas_series( data=[x.dtype for x in self._data.columns], index=self._data.names, ) @@ -690,7 +690,7 @@ def __getitem__(self, arg): elif can_convert_to_column(arg): mask = arg if is_list_like(mask): - mask = cudf.utils.utils.create_pandas_series(data=mask) + mask = cudf.utils.utils._create_pandas_series(data=mask) if mask.dtype == "bool": return self._apply_boolean_mask(mask) else: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b72b3338342..a3bad0ab5a6 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3331,7 +3331,7 @@ def test_all(data): # Pandas treats `None` in object type columns as True for some reason, so # replacing with `False` if np.array(data).ndim <= 1: - pdata = cudf.utils.utils.create_pandas_series(data=data).replace( + pdata = cudf.utils.utils._create_pandas_series(data=data).replace( [None], False ) gdata = cudf.Series.from_pandas(pdata) @@ -3386,7 +3386,7 @@ def test_all(data): @pytest.mark.parametrize("axis", [0, 1]) def test_any(data, axis): if np.array(data).ndim <= 1: - pdata = cudf.utils.utils.create_pandas_series(data=data) + pdata = cudf.utils.utils._create_pandas_series(data=data) gdata = cudf.Series.from_pandas(pdata) if axis == 1: @@ -3856,7 +3856,7 @@ def 
test_create_dataframe_column(): ], ) def test_series_values_host_property(data): - pds = cudf.utils.utils.create_pandas_series(data=data) + pds = cudf.utils.utils._create_pandas_series(data=data) gds = cudf.Series(data) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -3879,7 +3879,7 @@ def test_series_values_host_property(data): ], ) def test_series_values_property(data): - pds = cudf.utils.utils.create_pandas_series(data=data) + pds = cudf.utils.utils._create_pandas_series(data=data) gds = cudf.Series(data) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) @@ -3987,7 +3987,7 @@ def test_value_counts(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = cudf.utils.utils.create_pandas_series(data=data, index=index) + psr = cudf.utils.utils._create_pandas_series(data=data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) @@ -4041,7 +4041,7 @@ def test_isin_numeric(data, values): ], ) def test_isin_datetime(data, values): - psr = cudf.utils.utils.create_pandas_series(data=data) + psr = cudf.utils.utils._create_pandas_series(data=data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -4077,7 +4077,7 @@ def test_isin_datetime(data, values): ], ) def test_isin_string(data, values): - psr = cudf.utils.utils.create_pandas_series(data=data) + psr = cudf.utils.utils._create_pandas_series(data=data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -4106,7 +4106,7 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = cudf.utils.utils.create_pandas_series(data=data) + psr = cudf.utils.utils._create_pandas_series(data=data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -4140,7 +4140,7 @@ def test_isin_categorical(data, values): ], ) def test_isin_index(data, values): - psr = cudf.utils.utils.create_pandas_series(data=data) + psr = cudf.utils.utils._create_pandas_series(data=data) gsr = 
cudf.Series.from_pandas(psr) got = gsr.index.isin(values) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index b354f6b2f8a..d01627309d6 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -22,7 +22,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = cudf.utils.utils.create_pandas_series(data=data) + psr = cudf.utils.utils._create_pandas_series(data=data) if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index d429f658451..f721b7a28e5 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -59,7 +59,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = cudf.utils.utils.create_pandas_series(data) + pds = cudf.utils.utils._create_pandas_series(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index af25b48dd23..688efef555b 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -992,7 +992,7 @@ def test_index_equal_misc(data, other): assert_eq(expected, actual) expected = pd_data.equals( - cudf.utils.utils.create_pandas_series(data=pd_other) + cudf.utils.utils._create_pandas_series(data=pd_other) ) actual = gd_data.equals(cudf.Series(gd_other)) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 9cf8b3ac239..96cd3d23b57 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -159,7 +159,7 @@ def test_integer_dataframe(x): @settings(deadline=None) def test_integer_series(x): sr = cudf.Series(x) - ps = 
cudf.utils.utils.create_pandas_series(data=x) + ps = cudf.utils.utils._create_pandas_series(data=x) assert sr.__repr__() == ps.__repr__() @@ -176,7 +176,7 @@ def test_float_dataframe(x): @settings(deadline=None) def test_float_series(x): sr = cudf.Series(x, nan_as_null=False) - ps = cudf.utils.utils.create_pandas_series(data=x) + ps = cudf.utils.utils._create_pandas_series(data=x) assert sr.__repr__() == ps.__repr__() diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 27236910ebb..fcc5591adda 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -39,7 +39,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = cudf.utils.utils.create_pandas_series(data=data, index=index) + psr = cudf.utils.utils._create_pandas_series(data=data, index=index) gsr = cudf.Series(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): @@ -214,7 +214,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = cudf.utils.utils.create_pandas_series(data=data, index=index) + psr = cudf.utils.utils._create_pandas_series(data=data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d62942c2364..ab9d3d91f73 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -384,7 +384,7 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = cudf.utils.utils.create_pandas_series(data=data) + psr = cudf.utils.utils._create_pandas_series(data=data) gsr = cudf.Series(data) assert_eq(psr.size, gsr.size) diff --git a/python/cudf/cudf/tests/test_stats.py 
b/python/cudf/cudf/tests/test_stats.py index 1eae8ddbf1e..4e07c974280 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -203,7 +203,7 @@ def test_approx_quantiles_int(): @pytest.mark.parametrize("q", [[], 0.5, 1, 0.234, [0.345], [0.243, 0.5, 1]]) def test_misc_quantiles(data, q): - pdf_series = cudf.utils.utils.create_pandas_series(data=data) + pdf_series = cudf.utils.utils._create_pandas_series(data=data) gdf_series = cudf.Series(data) expected = pdf_series.quantile(q) @@ -433,13 +433,13 @@ def test_df_corr(): ) @pytest.mark.parametrize("skipna", [True, False, None]) def test_nans_stats(data, ops, skipna): - psr = cudf.utils.utils.create_pandas_series(data=data) + psr = cudf.utils.utils._create_pandas_series(data=data) gsr = cudf.Series(data) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - psr = cudf.utils.utils.create_pandas_series(data=data) + psr = cudf.utils.utils._create_pandas_series(data=data) gsr = cudf.Series(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. So only diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index b0a1aff4ada..e8b8c53312a 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -625,9 +625,37 @@ def _categorical_scalar_broadcast_to(cat_scalar, size): ) -def create_pandas_series( +def _create_pandas_series( data=None, index=None, dtype=None, name=None, copy=False, fastpath=False ): + """ + Wrapper to create a Pandas Series. If the length of data is 0 and + dtype is not passed, this wrapper defaults the dtype to `float64`. + + Parameters + ---------- + data : array-like, Iterable, dict, or scalar value + Contains data stored in Series. If data is a dict, argument + order is maintained. + index : array-like or Index (1d) + Values must be hashable and have the same length as data. + Non-unique index values are allowed. 
Will default to + RangeIndex (0, 1, 2, …, n) if not provided. + If data is dict-like and index is None, then the keys + in the data are used as the index. If the index is not None, + the resulting Series is reindexed with the index values. + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Series. If not specified, this + will be inferred from data. See the user guide for more usages. + name : str, optional + The name to give to the Series. + copy : bool, default False + Copy input data. + + Returns + ------- + pd.Series + """ if (data is None or len(data) == 0) and dtype is None: dtype = "float64" return pd.Series( From da1a3a3dfc8fab3c942dcad015cb70175cb1b853 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Feb 2021 17:01:11 -0800 Subject: [PATCH 28/35] fix categorical setitem and allow np.nan into categories --- python/cudf/cudf/core/column/categorical.py | 41 +++++++++++---- python/cudf/cudf/core/column/column.py | 12 +++-- python/cudf/cudf/core/index.py | 15 +++++- python/cudf/cudf/core/indexing.py | 6 ++- python/cudf/cudf/core/series.py | 10 +++- python/cudf/cudf/tests/test_categorical.py | 16 ++++++ python/cudf/cudf/tests/test_repr.py | 56 +++++++++++++++++++++ 7 files changed, 138 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 0649f82256e..7a8a7d371f7 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -946,12 +946,14 @@ def unary_operator(self, unaryop: str): ) def __setitem__(self, key, value): - to_add_categories = cudf.Index(value).difference(self.categories) + if cudf.utils.dtypes.is_scalar( + value + ) and cudf._lib.scalar._is_null_host_scalar(value): + to_add_categories = [] + else: + to_add_categories = cudf.Index(value).difference(self.categories) - if ( - len(to_add_categories) - and not to_add_categories.isna()._values.all() - ): + if 
len(to_add_categories): raise ValueError( "Cannot setitem on a Categorical with a new " "category, set the categories first" @@ -1067,11 +1069,18 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: def to_pandas( self, index: ColumnLike = None, nullable: bool = False, **kwargs ) -> pd.Series: - signed_dtype = min_signed_type(len(self.categories)) - codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array() - categories = self.categories.to_pandas() + + if self.categories.isnull().any(): + col = self.copy(deep=True) + col[col.isnull()] = None + else: + col = self + + signed_dtype = min_signed_type(len(col.categories)) + codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array() + categories = col.categories.dropna(drop_nan=True).to_pandas() data = pd.Categorical.from_codes( - codes, categories=categories, ordered=self.ordered + codes, categories=categories, ordered=col.ordered ) return pd.Series(data, index=index) @@ -1201,6 +1210,20 @@ def find_and_replace( ordered=self.dtype.ordered, ) + def isnull(self) -> ColumnBase: + """Identify missing values in a Column. 
+ """ + result = libcudf.unary.is_null(self) + + if self.categories.dtype.kind == "f": + # Need to consider `np.nan` values incase + # of a float column + result = result | libcudf.unary.is_nan( + self.astype(self.categories.dtype) + ) + + return result + def fillna( self, fill_value: Any = None, method: Any = None, dtype: Dtype = None ) -> CategoricalColumn: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 02cd7407802..1bad2c3a451 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1063,14 +1063,14 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: # columns include null index in factorization; remove: if self.has_nulls: - cats = cats.dropna() + cats = cats._column.dropna(drop_nan=False) min_type = min_unsigned_type(len(cats), 8) labels = labels - 1 if np.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) return build_categorical_column( - categories=cats._column, + categories=cats, codes=labels._column, mask=self.mask, ordered=ordered, @@ -2077,9 +2077,11 @@ def _construct_array( arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype - if dtype is None and pd.api.types.infer_dtype(arbitrary) in ( - "mixed", - "mixed-integer", + if ( + dtype is None + and not cudf._lib.scalar._is_null_host_scalar(arbitrary) + and pd.api.types.infer_dtype(arbitrary) + in ("mixed", "mixed-integer",) ): native_dtype = "object" arbitrary = np.asarray( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e3899a403f1..88f3f8c4c89 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1993,7 +1993,20 @@ def __repr__(self): # utilize `Index.to_string` once it is implemented # related issue : https://github.com/pandas-dev/pandas/issues/35389 if isinstance(preprocess, CategoricalIndex): - output = preprocess.to_pandas().__repr__() + if 
preprocess.categories.dtype.kind == "f": + output = ( + preprocess.astype("str") + .to_pandas() + .astype("category") + .__repr__() + ) + break_idx = output.find("ordered=") + output = ( + output[:break_idx].replace("'", "") + output[break_idx:] + ) + else: + output = preprocess.to_pandas().__repr__() + output = output.replace("nan", cudf._NA_REP) elif preprocess._values.nullable: output = self._clean_nulls_from_index().to_pandas().__repr__() diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 4d685408df3..cf372286b7e 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -95,8 +95,10 @@ def __setitem__(self, key, value): else: value = column.as_column(value) - if hasattr(value, "dtype") and pd.api.types.is_numeric_dtype( - value.dtype + if ( + not is_categorical_dtype(self._sr._column.dtype) + and hasattr(value, "dtype") + and pd.api.types.is_numeric_dtype(value.dtype) ): # normalize types if necessary: if not pd.api.types.is_integer(key): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 72e468002db..86045397d46 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1070,7 +1070,13 @@ def __repr__(self): else get_option("display.min_rows") ) show_dimensions = get_option("display.show_dimensions") - output = preprocess.to_pandas().to_string( + if preprocess._column.categories.dtype.kind == "f": + pd_series = ( + preprocess.astype("str").to_pandas().astype("category") + ) + else: + pd_series = preprocess.to_pandas() + output = pd_series.to_string( name=self.name, dtype=self.dtype, min_rows=min_rows, @@ -1085,6 +1091,8 @@ def __repr__(self): if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): category_memory = lines[-1] + if preprocess._column.categories.dtype.kind == "f": + category_memory = category_memory.replace("'", "") lines = lines[:-1] if len(lines) > 1: if lines[-1].startswith("Name: "): diff --git 
a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 9779fb786f6..164e72048a7 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -762,3 +762,19 @@ def test_categorical_assignment(data, cat_dtype): pd_df.assign(cat_col=pd_categorical) cd_df.assign(cat_col=pd_categorical) assert_eq(pd_df, cd_df) + + +def test_categorical_allow_nan(): + gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False) + gs = gs.astype("category") + expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8") + assert_eq(expected_codes, gs.cat.codes) + + expected_categories = cudf.Index([1.0, 2.0, 10.0, np.nan], dtype="float64") + assert_eq(expected_categories, gs.cat.categories) + + actual_ps = gs.to_pandas() + expected_ps = pd.Series( + [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category" + ) + assert_eq(actual_ps, expected_ps) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 96cd3d23b57..729ee60a82a 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1417,3 +1417,59 @@ def test_mulitIndex_null_repr(gdi, expected_repr): actual_repr = gdi.__repr__() assert actual_repr.split() == expected_repr.split() + + +def test_categorical_series_with_nan_repr(): + series = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + + expected_repr = textwrap.dedent( + """ + 0 1.0 + 1 2.0 + 2 NaN + 3 10.0 + 4 NaN + 5 + dtype: category + Categories (4, object): [1.0, 10.0, 2.0, NaN] + """ + ) + + assert series.__repr__().split() == expected_repr.split() + + +def test_categorical_dataframe_with_nan_repr(): + series = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + df = cudf.DataFrame({"a": series}) + expected_repr = textwrap.dedent( + """ + a + 0 1.0 + 1 2.0 + 2 NaN + 3 10.0 + 4 NaN + 5 + """ + ) + + assert 
df.__repr__().split() == expected_repr.split() + + +def test_categorical_index_with_nan_repr(): + cat_index = cudf.Index( + cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + ) + + expected_repr = ( + "CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, ], " + "categories=[1.0, 10.0, 2.0, NaN], ordered=False, dtype='category')" + ) + + assert cat_index.__repr__() == expected_repr From e70686f549507f7bfab7315c58ca58e37ccefa88 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Feb 2021 17:10:15 -0800 Subject: [PATCH 29/35] add nan setitem test --- python/cudf/cudf/tests/test_categorical.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 164e72048a7..a117c15f14d 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -778,3 +778,15 @@ def test_categorical_allow_nan(): [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category" ) assert_eq(actual_ps, expected_ps) + + +def test_categorical_setitem_with_nan(): + gs = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + gs[[1, 3]] = np.nan + + expected_series = cudf.Series( + [1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False + ).astype(gs.dtype) + assert_eq(gs, expected_series) From 39ba07a3996ebc48c816d7f62f331dd5a4025874 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 25 Feb 2021 20:52:40 -0800 Subject: [PATCH 30/35] make null checks and to_pandas code flow more efficient --- python/cudf/cudf/core/column/categorical.py | 51 ++++++++++++++++----- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7a8a7d371f7..dc59727187c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -949,11 +949,13 @@ def
__setitem__(self, key, value): if cudf.utils.dtypes.is_scalar( value ) and cudf._lib.scalar._is_null_host_scalar(value): - to_add_categories = [] + to_add_categories = 0 else: - to_add_categories = cudf.Index(value).difference(self.categories) + to_add_categories = len( + cudf.Index(value).difference(self.categories) + ) - if len(to_add_categories): + if to_add_categories > 0: raise ValueError( "Cannot setitem on a Categorical with a new " "category, set the categories first" @@ -1070,9 +1072,18 @@ def to_pandas( self, index: ColumnLike = None, nullable: bool = False, **kwargs ) -> pd.Series: - if self.categories.isnull().any(): - col = self.copy(deep=True) - col[col.isnull()] = None + if self.categories.dtype.kind == "f": + new_mask = bools_to_mask(self.notnull()) + col = column.build_categorical_column( + categories=self.dtype.categories._values, + codes=column.as_column( + self.codes.base_data, dtype=self.codes.dtype + ), + mask=new_mask, + ordered=self.dtype.ordered, + offset=self.offset, + size=self.size, + ) else: col = self @@ -1211,16 +1222,34 @@ def find_and_replace( ) def isnull(self) -> ColumnBase: - """Identify missing values in a Column. + """ + Identify missing values in a CategoricalColumn. """ result = libcudf.unary.is_null(self) if self.categories.dtype.kind == "f": # Need to consider `np.nan` values incase - # of a float column - result = result | libcudf.unary.is_nan( - self.astype(self.categories.dtype) - ) + # of an underlying float column + categories = libcudf.unary.is_nan(self.categories) + if categories.any(): + code = self._encode(np.nan) + result = result | (self.codes == cudf.Scalar(code)) + + return result + + def notnull(self) -> ColumnBase: + """ + Identify non-missing values in a CategoricalColumn. 
+ """ + result = libcudf.unary.is_valid(self) + + if self.categories.dtype.kind == "f": + # Need to consider `np.nan` values incase + # of an underlying float column + categories = libcudf.unary.is_nan(self.categories) + if categories.any(): + code = self._encode(np.nan) + result = result & (self.codes != cudf.Scalar(code)) return result From 2cc496dc1a0b43e20bb7a8943ad5939db7cc576c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 26 Feb 2021 01:09:53 -0800 Subject: [PATCH 31/35] fix repr --- python/cudf/cudf/core/column/categorical.py | 9 +++------ python/cudf/cudf/core/dtypes.py | 5 ++++- python/cudf/cudf/core/series.py | 9 ++++++++- python/cudf/cudf/tests/test_repr.py | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index dc59727187c..c41a458f02b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1075,14 +1075,11 @@ def to_pandas( if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) col = column.build_categorical_column( - categories=self.dtype.categories._values, - codes=column.as_column( - self.codes.base_data, dtype=self.codes.dtype - ), + categories=self.categories, + codes=column.as_column(self.codes, dtype=self.codes.dtype), mask=new_mask, ordered=self.dtype.ordered, - offset=self.offset, - size=self.size, + size=self.codes.size, ) else: col = self diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index f11f3692faf..2205c1821cb 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -56,7 +56,10 @@ def to_pandas(self) -> pd.CategoricalDtype: if self.categories is None: categories = None else: - categories = self.categories.to_pandas() + if self._categories.dtype.kind == "f": + categories = self.categories.dropna().to_pandas() + else: + categories = self.categories.to_pandas() return 
pd.CategoricalDtype(categories=categories, ordered=self.ordered) def _init_categories(self, categories: Any): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 86045397d46..be03fb147ff 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1092,7 +1092,14 @@ def __repr__(self): if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): category_memory = lines[-1] if preprocess._column.categories.dtype.kind == "f": - category_memory = category_memory.replace("'", "") + category_memory = category_memory.replace("'", "").split(": ") + category_memory = ( + category_memory[0].replace( + "object", preprocess._column.categories.dtype.name + ) + + ": " + + category_memory[1] + ) lines = lines[:-1] if len(lines) > 1: if lines[-1].startswith("Name: "): diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 729ee60a82a..66e09f61869 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1433,7 +1433,7 @@ def test_categorical_series_with_nan_repr(): 4 NaN 5 dtype: category - Categories (4, object): [1.0, 10.0, 2.0, NaN] + Categories (4, float64): [1.0, 10.0, 2.0, NaN] """ ) From 0bd3bba26303f8eb94ddcb467f8b8c47bdfeac29 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 26 Feb 2021 01:18:33 -0800 Subject: [PATCH 32/35] fix typo --- python/cudf/cudf/core/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2205c1821cb..78437f73b1a 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -53,7 +53,7 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": ) def to_pandas(self) -> pd.CategoricalDtype: - if self.categories is None: + if self._categories is None: categories = None else: if self._categories.dtype.kind == "f": From 3d44f5f0f3e76ada04508953cc6a37916a733a49 Mon Sep 17 
00:00:00 2001 From: galipremsagar Date: Fri, 26 Feb 2021 01:21:25 -0800 Subject: [PATCH 33/35] fix typo --- python/cudf/cudf/core/dtypes.py | 4 ++-- python/cudf/cudf/core/index.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 78437f73b1a..218cc457d7d 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -53,10 +53,10 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": ) def to_pandas(self) -> pd.CategoricalDtype: - if self._categories is None: + if self.categories is None: categories = None else: - if self._categories.dtype.kind == "f": + if self.categories.dtype.kind == "f": categories = self.categories.dropna().to_pandas() else: categories = self.categories.to_pandas() diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 88f3f8c4c89..b9bdb70de1b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -137,6 +137,10 @@ def __init__( def _values(self) -> ColumnBase: raise NotImplementedError + @property + def dtype(self): + raise NotImplementedError + def __getitem__(self, key): raise NotImplementedError() From c1c2d96f927af0f1081ac2925ad2d53203bb2161 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 26 Feb 2021 02:26:13 -0800 Subject: [PATCH 34/35] update index code --- python/cudf/cudf/core/dtypes.py | 4 +++- python/cudf/cudf/core/index.py | 4 ---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 218cc457d7d..8b7d54b6715 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -56,7 +56,9 @@ def to_pandas(self) -> pd.CategoricalDtype: if self.categories is None: categories = None else: - if self.categories.dtype.kind == "f": + if isinstance( + self.categories, (cudf.Float32Index, cudf.Float64Index) + ): categories = 
self.categories.dropna().to_pandas() else: categories = self.categories.to_pandas() diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b9bdb70de1b..88f3f8c4c89 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -137,10 +137,6 @@ def __init__( def _values(self) -> ColumnBase: raise NotImplementedError - @property - def dtype(self): - raise NotImplementedError - def __getitem__(self, key): raise NotImplementedError() From ae1b8c688cd471c40eff60d366588795bfd101f3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 26 Feb 2021 11:53:07 -0800 Subject: [PATCH 35/35] add packaging conda install --- conda/recipes/cudf/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9afc7094f27..21eb017eb23 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -45,6 +45,7 @@ requirements: - fsspec>=0.6.0 - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} - nvtx >=0.2.1 + - packaging test: requires: