Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Implement cudf.DateOffset for months #6775

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
ad09abf
baseline implementation
brandon-b-miller Nov 16, 2020
ac47fdb
cleanup cython
brandon-b-miller Nov 16, 2020
3bce31e
basic __repr__
brandon-b-miller Nov 16, 2020
89b87bd
basic testing
brandon-b-miller Nov 16, 2020
48d67b4
handle negative values
brandon-b-miller Nov 16, 2020
56bae18
style
brandon-b-miller Nov 16, 2020
5f45f34
changelog
brandon-b-miller Nov 16, 2020
5005a57
Merge branch 'branch-0.17' into fea-month-sub-and-add
brandon-b-miller Nov 30, 2020
cef0fc1
fix negatives
brandon-b-miller Dec 1, 2020
69ebcf3
add a comment
brandon-b-miller Dec 1, 2020
438866f
style
brandon-b-miller Dec 1, 2020
e7145b9
cleanup logic
brandon-b-miller Dec 1, 2020
e91530c
Merge branch 'branch-0.17' into fea-month-sub-and-add
brandon-b-miller Dec 3, 2020
b8d824f
Merge branch 'branch-0.18' into fea-month-sub-and-add
Dec 4, 2020
93f5a48
progress
brandon-b-miller Dec 7, 2020
16c0eab
tweak datetimes.py
brandon-b-miller Dec 8, 2020
f079c54
inherit from pd.DateOffset
brandon-b-miller Dec 9, 2020
23a553d
intercept kwargs and convert to cudf scalars
brandon-b-miller Dec 10, 2020
98b81ca
pacify cython and libcudf
brandon-b-miller Dec 10, 2020
5f569e6
Update python/cudf/cudf/core/tools/datetimes.py
brandon-b-miller Dec 10, 2020
8d5a0cc
Update python/cudf/cudf/core/tools/datetimes.py
brandon-b-miller Dec 11, 2020
a0c927b
switch to NotImplementedError
brandon-b-miller Dec 11, 2020
24bab96
address reviews
brandon-b-miller Dec 14, 2020
68af465
updates
brandon-b-miller Dec 14, 2020
a41f515
Merge branch 'branch-0.18' into fea-month-sub-and-add
brandon-b-miller Dec 15, 2020
21957d1
use a cudf python scalar instead of devicescalar in rolling
brandon-b-miller Dec 15, 2020
3dfbd68
comments and cleanup
brandon-b-miller Dec 15, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
- PR #6929 Add `Index.set_names` api
- PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support

- PR #6775 Implement cudf.DateOffset for months

## Improvements

- PR #6938 Pass numeric scalars of the same dtype through numeric binops
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
)
from cudf.core.reshape import concat, get_dummies, melt, merge_sorted
from cudf.core.series import isclose
from cudf.core.tools.datetimes import to_datetime
from cudf.core.tools.datetimes import to_datetime, DateOffset
from cudf.core.tools.numeric import to_numeric
from cudf.io import (
from_dlpack,
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,10 @@ cdef class Column:
return result


def make_column_from_scalar(DeviceScalar val, size_type size):
def make_column_from_scalar(object py_val, size_type size):

cdef DeviceScalar val = py_val.device_value

cdef const scalar* c_val = val.get_raw_ptr()
cdef unique_ptr[column] c_result
with nogil:
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/cpp/datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
cdef unique_ptr[column] extract_hour(const column_view& column) except +
cdef unique_ptr[column] extract_minute(const column_view& column) except +
cdef unique_ptr[column] extract_second(const column_view& column) except +
cdef unique_ptr[column] add_calendrical_months(
const column_view& timestamps,
const column_view& months
) except +
17 changes: 17 additions & 0 deletions python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,23 @@ from cudf._lib.column cimport Column
cimport cudf._lib.cpp.datetime as libcudf_datetime


def add_months(Column col, Column months):
    """
    Add calendrical months to a column of timestamps.

    Thin wrapper around libcudf's ``add_calendrical_months``, which is
    handed a view of ``col`` as its ``timestamps`` argument and a view of
    ``months`` as its ``months`` argument.

    Parameters
    ----------
    col : Column
        The timestamps to shift.
    months : Column
        The number of months to add; must be of int16 dtype.
        NOTE(review): presumably applied row by row, so it should have
        the same length as ``col`` -- confirm against libcudf.

    Returns
    -------
    Column
        A new Column that takes ownership of the libcudf result.
    """
    # months must be int16 dtype
    cdef unique_ptr[column] c_result
    # The views are created up front, before the nogil section below.
    cdef column_view col_view = col.view()
    cdef column_view months_view = months.view()

    # The libcudf call itself runs with the GIL released.
    with nogil:
        c_result = move(
            libcudf_datetime.add_calendrical_months(
                col_view,
                months_view
            )
        )

    return Column.from_unique_ptr(move(c_result))


def extract_datetime_component(Column col, object field):

cdef unique_ptr[column] c_result
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2108,5 +2108,5 @@ def full(size, fill_value, dtype=None):
"""

return libcudf.column.make_column_from_scalar(
as_device_scalar(fill_value, dtype), size
cudf.Scalar(fill_value, dtype), size
)
13 changes: 13 additions & 0 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ def normalize_binop_value(self, other):
other = other.to_datetime64()
elif isinstance(other, pd.Timedelta):
other = other.to_timedelta64()
elif isinstance(other, cudf.DateOffset):
return other
if isinstance(other, np.datetime64):
if np.isnat(other):
return cudf.Scalar(None, dtype=self.dtype)
Expand Down Expand Up @@ -215,6 +217,8 @@ def quantile(self, q, interpolation, exact):
return result.astype(self.dtype)

def binary_operator(self, op, rhs, reflect=False):
if isinstance(rhs, cudf.DateOffset):
return binop_offset(self, rhs, op)
lhs, rhs = self, rhs
if op in ("eq", "ne", "lt", "gt", "le", "ge"):
out_dtype = np.bool
Expand Down Expand Up @@ -316,6 +320,15 @@ def binop(lhs, rhs, op, out_dtype):
return out


def binop_offset(lhs, rhs, op):
    """
    Apply a ``cudf.DateOffset`` to a datetime column.

    Parameters
    ----------
    lhs : column
        The datetime column being shifted.
    rhs : cudf.DateOffset
        The offset to apply. Only ``_is_no_op`` and ``_generate_column``
        are used here.
    op : str
        Name of the binary operation; only ``"add"`` and ``"sub"`` are
        meaningful for an offset.

    Returns
    -------
    column
        ``lhs`` itself when the offset is a no-op, otherwise the result
        of ``libcudf.datetime.add_months``.

    Raises
    ------
    TypeError
        If ``op`` is anything other than ``"add"`` or ``"sub"``.
    """
    # Without this guard every other operator (comparisons, mul, ...)
    # would be silently treated as an addition and return shifted
    # datetimes instead of failing.
    if op not in ("add", "sub"):
        raise TypeError(
            f"{op} not supported between datetime columns and DateOffset"
        )

    if rhs._is_no_op:
        return lhs

    # The offset negates its own value for "sub", so the months are
    # always *added* on the libcudf side.
    months = rhs._generate_column(len(lhs), op)
    return libcudf.datetime.add_months(lhs, months)


def infer_format(element, **kwargs):
"""
Infers datetime format from a string, also takes cares for `ms` and `ns`
Expand Down
144 changes: 144 additions & 0 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import warnings

import numpy as np
import pandas as pd
from pandas.core.tools.datetimes import _unit_map

import cudf
Expand Down Expand Up @@ -331,3 +332,146 @@ def get_units(value):
return _unit_map[value.lower()]

return value


class _DateOffsetScalars(object):
def __init__(self, scalars):
self._gpu_scalars = scalars


class DateOffset(pd.DateOffset):
    # Temporal keyword names this class recognises (the ones listed in
    # the ``__init__`` docstring). A recognised keyword gets a cudf
    # scalar built for it, whether or not it is supported yet.
    _ALL_POSSIBLE_KWARGS = frozenset(
        {
            "years",
            "months",
            "weeks",
            "days",
            "hours",
            "minutes",
            "seconds",
            "microseconds",
            "milliseconds",
            "nanoseconds",
            "year",
            "month",
            "week",
            "day",
            "hour",
            "minute",
            "second",
            "microsecond",
            "millisecond",
            "nanosecond",
        }
    )

    # Keywords that are actually implemented.
    _SUPPORTED_KWARGS = frozenset({"months"})

    def __init__(self, n=1, normalize=False, **kwds):
        """
        An object used for binary ops where calendrical arithmetic
        is desired rather than absolute time arithmetic. Used to
        add or subtract a whole number of periods, such as several
        months or years, to a series or index of datetime dtype.
        Works similarly to pd.DateOffset, and currently supports a
        subset of its functionality. The arguments that aren't yet
        supported are:

        - years
        - weeks
        - days
        - hours
        - minutes
        - seconds
        - microseconds
        - milliseconds
        - nanoseconds

        In addition, cuDF does not yet support DateOffset arguments
        that 'replace' units in the datetime data being operated on
        such as

        - year
        - month
        - week
        - day
        - hour
        - minute
        - second
        - microsecond
        - millisecond
        - nanosecond

        Finally, cuDF does not yet support rounding via a `normalize`
        keyword argument.

        Parameters
        ----------
        n : int, default 1
            The number of time periods the offset represents.
        normalize : bool, default False
            Not yet supported; must be falsy.
        **kwds
            Temporal parameter that add to or replace the offset value.
            Parameters that **add** to the offset (like Timedelta):

            - months

        Raises
        ------
        NotImplementedError
            If ``normalize`` is truthy or more than one temporal
            keyword is given.
        ValueError
            If a keyword other than ``months`` is given.

        See Also
        --------
        pandas.DateOffset : The equivalent Pandas object that this
            object replicates

        Examples
        --------
        >>> import cudf
        >>> from cudf import DateOffset
        >>> ts = cudf.Series([
        ...     "2000-01-01 00:00:00.012345678",
        ...     "2000-01-31 00:00:00.012345678",
        ...     "2000-02-29 00:00:00.012345678",
        ... ], dtype='datetime64[ns]')
        >>> ts + DateOffset(months=3)
        0   2000-04-01 00:00:00.012345678
        1   2000-04-30 00:00:00.012345678
        2   2000-05-29 00:00:00.012345678
        dtype: datetime64[ns]
        >>> ts - DateOffset(months=12)
        0   1999-01-01 00:00:00.012345678
        1   1999-01-31 00:00:00.012345678
        2   1999-02-28 00:00:00.012345678
        dtype: datetime64[ns]
        """
        if normalize:
            raise NotImplementedError(
                "normalize not yet supported for DateOffset"
            )

        # TODO: Pandas supports combinations
        if len(kwds) > 1:
            raise NotImplementedError("Multiple time units not yet supported")

        # NOTE(review): ``n`` is forwarded to pandas but is not used when
        # the months column is generated, so ``n != 1`` looks like it is
        # ignored by the GPU arithmetic -- confirm and handle.
        scalars = {}
        for k, v in kwds.items():
            if k in DateOffset._ALL_POSSIBLE_KWARGS:
                # Months must be int16
                dtype = "int16" if k == "months" else None
                scalars[k] = cudf.Scalar(v, dtype=dtype)

        # Initialise the pandas side before the "unsupported" check below,
        # so that check only runs once pandas has accepted the arguments.
        super().__init__(n=n, normalize=normalize, **kwds)

        wrong_kwargs = set(kwds).difference(DateOffset._SUPPORTED_KWARGS)
        if wrong_kwargs:
            raise ValueError(
                f"Keyword arguments '{','.join(sorted(wrong_kwargs))}'"
                " are not yet supported in cuDF DateOffsets"
            )
        # Wrapped so that the assignment gets past ``__setattr__``.
        self._scalars = _DateOffsetScalars(scalars)

    def _generate_column(self, size, op):
        """
        Build a column of ``size`` rows holding the month offset.

        The value is negated when ``op`` is ``"sub"`` so the caller can
        always add the resulting column.
        """
        months = self._scalars._gpu_scalars["months"]
        months = -months if op == "sub" else months
        # TODO: pass a scalar instead of constructing a column
        # https://github.com/rapidsai/cudf/issues/6990
        col = cudf.core.column.as_column(months, length=size)
        return col

    @property
    def _is_no_op(self):
        """Whether every temporal keyword value equals zero."""
        # some logic could be implemented here for more complex cases
        # such as +1 year, -12 months
        # NOTE(review): with no keywords at all this is True, so a bare
        # ``DateOffset()`` leaves the data untouched -- confirm that this
        # is the intended behaviour.
        return all(i == 0 for i in self.kwds.values())

    def __setattr__(self, name, value):
        """
        Reject attribute assignment, except for the scalar holder.

        Only ``_DateOffsetScalars`` instances may be stored; any other
        value raises ``AttributeError``.
        """
        if not isinstance(value, _DateOffsetScalars):
            raise AttributeError("DateOffset objects are immutable.")
        else:
            object.__setattr__(self, name, value)
36 changes: 36 additions & 0 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1461,6 +1461,42 @@ def test_scalar_power_invalid(dtype_l, dtype_r):
lval_gpu ** rval_gpu


@pytest.mark.parametrize(
    "date_col",
    [
        [
            "2000-01-01 00:00:00.012345678",
            "2000-01-31 00:00:00.012345678",
            "2000-02-29 00:00:00.012345678",
        ]
    ],
)
@pytest.mark.parametrize("n_periods", [0, 1, -1, 12, -12])
@pytest.mark.parametrize("frequency", ["months"])
@pytest.mark.parametrize(
    "dtype",
    ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"],
)
def test_datetime_dateoffset_binaryop(date_col, n_periods, frequency, dtype):
    """Adding/subtracting a cudf.DateOffset must match pandas."""
    gpu_series = cudf.Series(date_col, dtype=dtype)
    pandas_series = gpu_series.to_pandas()  # converts to nanos

    # The same keyword/value pair drives both the cudf and pandas offsets.
    offset_kwargs = {frequency: n_periods}
    gpu_offset = cudf.DateOffset(**offset_kwargs)
    pandas_offset = pd.DateOffset(**offset_kwargs)

    # series + offset
    utils.assert_eq(pandas_series + pandas_offset, gpu_series + gpu_offset)

    # series - offset
    utils.assert_eq(pandas_series - pandas_offset, gpu_series - gpu_offset)


@pytest.mark.parametrize("frame", [cudf.Series, cudf.Index, cudf.DataFrame])
@pytest.mark.parametrize(
"dtype", ["int", "str", "datetime64[s]", "timedelta64[s]", "category"]
Expand Down