diff --git a/CHANGELOG.md b/CHANGELOG.md
index e22277c4a29..c8563b0d393 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@
 - PR #6929 Add `Index.set_names` api
 - PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support
 
+- PR #6775 Implement cudf.DateOffset for months
+
 ## Improvements
 
 - PR #6938 Pass numeric scalars of the same dtype through numeric binops
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 91242b9ca06..3ebe3da09e3 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -63,7 +63,7 @@
 )
 from cudf.core.reshape import concat, get_dummies, melt, merge_sorted
 from cudf.core.series import isclose
-from cudf.core.tools.datetimes import to_datetime
+from cudf.core.tools.datetimes import DateOffset, to_datetime
 from cudf.core.tools.numeric import to_numeric
 from cudf.io import (
     from_dlpack,
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 47dda88dfde..c2f047fd0d5 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -567,7 +567,10 @@ cdef class Column:
         return result
 
 
-def make_column_from_scalar(DeviceScalar val, size_type size):
+def make_column_from_scalar(object py_val, size_type size):
+
+    cdef DeviceScalar val = py_val.device_value
+
     cdef const scalar* c_val = val.get_raw_ptr()
     cdef unique_ptr[column] c_result
     with nogil:
diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd
index 228db7f4013..20fdd2e842a 100644
--- a/python/cudf/cudf/_lib/cpp/datetime.pxd
+++ b/python/cudf/cudf/_lib/cpp/datetime.pxd
@@ -12,3 +12,7 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
     cdef unique_ptr[column] extract_hour(const column_view& column) except +
     cdef unique_ptr[column] extract_minute(const column_view& column) except +
     cdef unique_ptr[column] extract_second(const column_view& column) except +
+    cdef unique_ptr[column] add_calendrical_months(
+        const column_view& timestamps,
+        const column_view& months
+    ) except +
diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx
index 6f5a9cdc161..3e40cb62f9c 100644
--- a/python/cudf/cudf/_lib/datetime.pyx
+++ b/python/cudf/cudf/_lib/datetime.pyx
@@ -9,6 +9,23 @@ from cudf._lib.column cimport Column
 cimport cudf._lib.cpp.datetime as libcudf_datetime
 
 
+def add_months(Column col, Column months):
+    # months must be int16 dtype
+    cdef unique_ptr[column] c_result
+    cdef column_view col_view = col.view()
+    cdef column_view months_view = months.view()
+
+    with nogil:
+        c_result = move(
+            libcudf_datetime.add_calendrical_months(
+                col_view,
+                months_view
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
 def extract_datetime_component(Column col, object field):
 
     cdef unique_ptr[column] c_result
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index d388ebb9985..e2157230993 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2108,5 +2108,5 @@ def full(size, fill_value, dtype=None):
     """
 
     return libcudf.column.make_column_from_scalar(
-        as_device_scalar(fill_value, dtype), size
+        cudf.Scalar(fill_value, dtype), size
     )
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 040b1b42c52..f94c29f84b0 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -140,6 +140,8 @@ def normalize_binop_value(self, other):
             other = other.to_datetime64()
         elif isinstance(other, pd.Timedelta):
             other = other.to_timedelta64()
+        elif isinstance(other, cudf.DateOffset):
+            return other
         if isinstance(other, np.datetime64):
             if np.isnat(other):
                 return cudf.Scalar(None, dtype=self.dtype)
@@ -215,6 +217,8 @@ def quantile(self, q, interpolation, exact):
         return result.astype(self.dtype)
 
     def binary_operator(self, op, rhs, reflect=False):
+        if isinstance(rhs, cudf.DateOffset):
+            return binop_offset(self, rhs, op)
         lhs, rhs = self, rhs
         if op in ("eq", "ne", "lt", "gt", "le", "ge"):
             out_dtype = np.bool
@@ -316,6 +320,27 @@ def binop(lhs, rhs, op, out_dtype):
     return out
 
 
+def binop_offset(lhs, rhs, op):
+    """Apply a ``cudf.DateOffset`` to a datetime column.
+
+    ``lhs`` is the datetime column, ``rhs`` the offset and ``op`` the
+    binary operator name.  Only ``"add"`` and ``"sub"`` are meaningful;
+    any other operator is rejected rather than silently treated as an
+    addition.
+    """
+    if op not in ("add", "sub"):
+        raise TypeError(
+            f"Operator '{op}' is not supported between a datetime "
+            "column and a DateOffset"
+        )
+    if rhs._is_no_op:
+        return lhs
+    # Expand the offset into an int16 months column (negated for "sub")
+    # because libcudf's add_calendrical_months takes a column of months.
+    months = rhs._generate_column(len(lhs), op)
+    return libcudf.datetime.add_months(lhs, months)
+
+
 def infer_format(element, **kwargs):
     """
     Infers datetime format from a string, also takes cares for `ms` and `ns`
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 732aafee3c0..e2da0849670 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -3,6 +3,7 @@
 import warnings
 
 import numpy as np
+import pandas as pd
 from pandas.core.tools.datetimes import _unit_map
 
 import cudf
@@ -331,3 +332,169 @@ def get_units(value):
         return _unit_map[value.lower()]
 
     return value
+
+
+class _DateOffsetScalars(object):
+    # Marker wrapper: DateOffset.__setattr__ only permits assigning
+    # instances of this class, which keeps DateOffset immutable.
+    def __init__(self, scalars):
+        self._gpu_scalars = scalars
+
+
+class DateOffset(pd.DateOffset):
+    def __init__(self, n=1, normalize=False, **kwds):
+        """
+        An object used for binary ops where calendrical arithmetic
+        is desired rather than absolute time arithmetic. Used to
+        add or subtract a whole number of periods, such as several
+        months or years, to a series or index of datetime dtype.
+        Works similarly to pd.DateOffset, and currently supports a
+        subset of its functionality. The arguments that aren't yet
+        supported are:
+        - years
+        - weeks
+        - days
+        - hours
+        - minutes
+        - seconds
+        - microseconds
+        - milliseconds
+        - nanoseconds
+        In addition, cuDF does not yet support DateOffset arguments
+        that 'replace' units in the datetime data being operated on
+        such as
+        - year
+        - month
+        - week
+        - day
+        - hour
+        - minute
+        - second
+        - microsecond
+        - millisecond
+        - nanosecond
+        Finally, cuDF does not yet support rounding via a `normalize`
+        keyword argument.
+
+        Parameters
+        ----------
+        n : int, default 1
+            The number of time periods the offset represents.
+        **kwds
+            Temporal parameters that add to or replace the offset value.
+            Parameters that **add** to the offset (like Timedelta):
+            - months
+
+        Raises
+        ------
+        NotImplementedError
+            If ``normalize`` is truthy, or if more than one keyword
+            argument is passed.
+        ValueError
+            If a keyword argument other than ``months`` is passed.
+
+        See Also
+        --------
+        pandas.DateOffset : The equivalent Pandas object that this
+            object replicates
+
+        Examples
+        --------
+        >>> import cudf
+        >>> from cudf import DateOffset
+        >>> ts = cudf.Series([
+        ...     "2000-01-01 00:00:00.012345678",
+        ...     "2000-01-31 00:00:00.012345678",
+        ...     "2000-02-29 00:00:00.012345678",
+        ... ], dtype='datetime64[ns]')
+        >>> ts + DateOffset(months=3)
+        0   2000-04-01 00:00:00.012345678
+        1   2000-04-30 00:00:00.012345678
+        2   2000-05-29 00:00:00.012345678
+        dtype: datetime64[ns]
+        >>> ts - DateOffset(months=12)
+        0   1999-01-01 00:00:00.012345678
+        1   1999-01-31 00:00:00.012345678
+        2   1999-02-28 00:00:00.012345678
+        dtype: datetime64[ns]
+        """
+        if normalize:
+            raise NotImplementedError(
+                "normalize not yet supported for DateOffset"
+            )
+
+        # TODO: Pandas supports combinations
+        if len(kwds) > 1:
+            raise NotImplementedError("Multiple time units not yet supported")
+
+        all_possible_kwargs = {
+            "years",
+            "months",
+            "weeks",
+            "days",
+            "hours",
+            "minutes",
+            "seconds",
+            "microseconds",
+            "nanoseconds",
+            "year",
+            "month",
+            "week",
+            "day",
+            "hour",
+            "minute",
+            "second",
+            "microsecond",
+            "millisecond",
+            "nanosecond",
+        }
+
+        supported_kwargs = {"months"}
+
+        scalars = {}
+        for k, v in kwds.items():
+            if k in all_possible_kwargs:
+                # Months must be int16
+                dtype = "int16" if k == "months" else None
+                scalars[k] = cudf.Scalar(v, dtype=dtype)
+
+        super().__init__(n=n, normalize=normalize, **kwds)
+
+        wrong_kwargs = set(kwds.keys()).difference(supported_kwargs)
+        if len(wrong_kwargs) > 0:
+            raise ValueError(
+                f"Keyword arguments '{','.join(list(wrong_kwargs))}'"
+                " are not yet supported in cuDF DateOffsets"
+            )
+        self._scalars = _DateOffsetScalars(scalars)
+
+    def _generate_column(self, size, op):
+        """Return a column of ``size`` rows holding the month count.
+
+        The count is negated when ``op`` is ``"sub"`` so that the
+        caller can always perform an addition.
+        """
+        # NOTE(review): ``n`` is passed to pandas in __init__ but is not
+        # applied here, so only the ``months`` value is used -- confirm.
+        months = self._scalars._gpu_scalars["months"]
+        months = -months if op == "sub" else months
+        # TODO: pass a scalar instead of constructing a column
+        # https://github.com/rapidsai/cudf/issues/6990
+        col = cudf.core.column.as_column(months, length=size)
+        return col
+
+    @property
+    def _is_no_op(self):
+        # some logic could be implemented here for more complex cases
+        # such as +1 year, -12 months
+        # An offset built without keyword arguments also counts as a
+        # no-op here, because all() of an empty iterable is True.
+        return all(i == 0 for i in self.kwds.values())
+
+    def __setattr__(self, name, value):
+        # Only the internal scalar holder may be assigned; everything
+        # else is rejected so instances behave as immutable.
+        if not isinstance(value, _DateOffsetScalars):
+            raise AttributeError("DateOffset objects are immutable.")
+        else:
+            object.__setattr__(self, name, value)
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 86e35d9bd21..be1cef50ec3 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -1461,6 +1461,42 @@ def test_scalar_power_invalid(dtype_l, dtype_r):
         lval_gpu ** rval_gpu
 
 
+@pytest.mark.parametrize(
+    "date_col",
+    [
+        [
+            "2000-01-01 00:00:00.012345678",
+            "2000-01-31 00:00:00.012345678",
+            "2000-02-29 00:00:00.012345678",
+        ]
+    ],
+)
+@pytest.mark.parametrize("n_periods", [0, 1, -1, 12, -12])
+@pytest.mark.parametrize("frequency", ["months"])
+@pytest.mark.parametrize(
+    "dtype",
+    ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"],
+)
+def test_datetime_dateoffset_binaryop(date_col, n_periods, frequency, dtype):
+    gsr = cudf.Series(date_col, dtype=dtype)
+    psr = gsr.to_pandas()  # converts to nanos
+
+    kwargs = {frequency: n_periods}
+
+    goffset = cudf.DateOffset(**kwargs)
+    poffset = pd.DateOffset(**kwargs)
+
+    expect = psr + poffset
+    got = gsr + goffset
+
+    utils.assert_eq(expect, got)
+
+    expect = psr - poffset
+    got = gsr - goffset
+
+    utils.assert_eq(expect, got)
+
+
 @pytest.mark.parametrize("frame", [cudf.Series, cudf.Index, cudf.DataFrame])
 @pytest.mark.parametrize(
     "dtype", ["int", "str", "datetime64[s]", "timedelta64[s]", "category"]