From 284c904481451bfe2be46b2c764cdb111d394613 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 15 Aug 2024 16:32:28 -0400 Subject: [PATCH 1/4] Some missing linkages --- src/akimbo/datetimes.py | 3 +-- src/akimbo/mixin.py | 13 +++++++++++++ src/akimbo/strings.py | 11 ++++++++++- tests/test_dt.py | 24 +++++++++++++++++------- tests/test_str.py | 6 +++--- 5 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/akimbo/datetimes.py b/src/akimbo/datetimes.py index d3e2a6b..9f78239 100644 --- a/src/akimbo/datetimes.py +++ b/src/akimbo/datetimes.py @@ -20,7 +20,6 @@ def __init__(self, accessor) -> None: # listed below https://arrow.apache.org/docs/python/generated/ # pyarrow.compute.ceil_temporal.html - cast = dec(pc.cast) # TODO: move to .ak ceil_temporal = dec_t(pc.ceil_temporal) floor_temporal = dec_t(pc.floor_temporal) reound_temporal = dec_t(pc.round_temporal) @@ -62,7 +61,7 @@ def __init__(self, accessor) -> None: weeks_between = dec_t(pc.weeks_between) years_between = dec_t(pc.years_between) - # TODO: strftime, strptime + strftime = dec_t(pc.strftime) # TODO: timezone conversion diff --git a/src/akimbo/mixin.py b/src/akimbo/mixin.py index 74923cb..a6d1296 100644 --- a/src/akimbo/mixin.py +++ b/src/akimbo/mixin.py @@ -5,6 +5,9 @@ from typing import Callable, Iterable import awkward as ak +import pyarrow.compute as pc + +from akimbo.apply_tree import dec methods = [ _ for _ in (dir(ak)) if not _.startswith(("_", "ak_")) and not _[0].isupper() @@ -142,6 +145,16 @@ def __init__(self, obj, behavior=None): self._obj = obj self._behavior = behavior + def __call__(self, *args, behavior=None, **kwargs): + return Accessor(self._obj, behavior=behavior) + + cast = dec(pc.cast) + + @property + def accessor(self): + # if we use `dec`, which expects to work on + return self + @classmethod def is_series(cls, data): return isinstance(data, cls.series_type) diff --git a/src/akimbo/strings.py b/src/akimbo/strings.py index 70f49a7..b9eb73b 100644 --- a/src/akimbo/strings.py +++ b/src/akimbo/strings.py @@ -4,6 +4,7 @@ from collections.abc import Callable import awkward as ak +import pyarrow.compute as pc from akimbo.apply_tree import dec from akimbo.mixin import Accessor @@ -50,6 +51,12 @@ def _decode(layout): if not aname.startswith(("_", "akstr_")) and not aname[0].isupper() ] +# make sensible defaults for strptime +strptime = functools.wraps(pc.strptime)( + lambda *args, format="%FT%T", unit="s", error_is_null=True, **kw: + pc.strptime(*args, format=format, unit=unit, error_is_null=error_is_null) +) + class StringAccessor: def __init__(self, accessor): @@ -92,8 +99,10 @@ def f(*args, **kwargs): return f + strptime = dec(strptime, match=match_string) + def __dir__(self) -> list[str]: - return sorted(methods) + return sorted(methods + ["strptime"]) Accessor.register_accessor("str", StringAccessor) diff --git a/tests/test_dt.py b/tests/test_dt.py index de44a57..0aa767c 100644 --- a/tests/test_dt.py +++ b/tests/test_dt.py @@ -2,6 +2,8 @@ import pytest +import pyarrow as pa + import akimbo.pandas # noqa pd = pytest.importorskip("pandas") @@ -9,7 +11,7 @@ def test_cast(): s = pd.Series([[0, 1], [1, 0], [2]]) - out = s.ak.dt.cast("timestamp[s]") + out = s.ak.cast("timestamp[s]") assert str(out.dtype) == "list[pyarrow]" assert out.to_list() == [ [datetime.datetime(1970, 1, 1, 0, 0), datetime.datetime(1970, 1, 1, 0, 0, 1)], @@ -20,7 +22,7 @@ def test_cast(): def test_unary_unit(): s = pd.Series([[0, 1], [1, 0], [2]]) - ts = s.ak.dt.cast("timestamp[s]") + ts = s.ak.cast("timestamp[s]") s2 = ts.ak.dt.second() assert s.to_list() == s2.to_list() @@ -35,8 +37,8 @@ def test_bad_type(): def test_binary(): s = pd.Series([[0, 1], [1, 0], [2]]) s2 = s.ak + 1 - ts1 = s.ak.dt.cast("timestamp[s]") - ts2 = s2.ak.dt.cast("timestamp[s]") + ts1 = s.ak.cast("timestamp[s]") + ts2 = s2.ak.cast("timestamp[s]") out = ts1.ak.dt.nanoseconds_between(ts2) assert out.tolist() == [ @@ -50,8 +52,8 @@ def test_binary(): def test_binary_with_kwargs(): s = pd.Series([[0, 1], [1, 0], [2]]) s2 = s.ak + int(24 * 3600 * 7 * 2.5) - ts1 = s.ak.dt.cast("timestamp[s]") - ts2 = s2.ak.dt.cast("timestamp[s]") + ts1 = s.ak.cast("timestamp[s]") + ts2 = s2.ak.cast("timestamp[s]") out = ts1.ak.dt.weeks_between(ts2, count_from_zero=False, week_start=2) assert out.tolist() == [[2, 2], [2, 2], [2]] @@ -64,8 +66,16 @@ def test_mixed_record(): s = pd.Series(data) # explicit select of where to apply transform - ts = s.ak.dt.cast("timestamp[s]", where="a") + ts = s.ak.cast("timestamp[s]", where="a") # implicit selection of timestamps s2 = ts.ak.dt.second() assert s2.to_list() == data + + +def test_text_conversion(): + s = pd.Series([["2024-08-01T01:00:00", None, "2024-08-01T01:01:00"]]) + s2 = s.ak.str.strptime() + breakpoint() + s3 = s2.ak.dt.strftime("%FT%T") + assert s3.tolist() == [["2024-08-01T01:00:00", None, "2024-08-01T01:01:00"]] diff --git a/tests/test_str.py b/tests/test_str.py index 9d50f28..e550532 100644 --- a/tests/test_str.py +++ b/tests/test_str.py @@ -31,11 +31,11 @@ def test_encode_decode(): def test_split(): - s = pd.Series(["hello world", "oio", ""]) + s = pd.Series(["hello world", "oio", pd.NA, ""]) s2 = s.ak.str.split_whitespace() - assert s2.tolist() == [["hello", "world"], ["oio"], [""]] + assert s2.tolist() == [["hello", "world"], ["oio"], pd.NA, [""]] s2 = s.ak.str.split_pattern("i") - assert s2.tolist() == [["hello world"], ["o", "o"], [""]] + assert s2.tolist() == [["hello world"], ["o", "o"], pd.NA, [""]] s = pd.Series([b"hello world", b"oio", b""]) s2 = s.ak.str.split_whitespace() From fb2f9dd539a5c13c4e9bdc2d840237a5f6ba8875 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 15 Aug 2024 16:37:21 -0400 Subject: [PATCH 2/4] remove breakpoint --- tests/test_dt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_dt.py b/tests/test_dt.py index 0aa767c..bc4b886 100644 --- a/tests/test_dt.py +++ b/tests/test_dt.py @@ -76,6 +76,5 @@ def test_mixed_record(): def test_text_conversion(): s = pd.Series([["2024-08-01T01:00:00", None, "2024-08-01T01:01:00"]]) s2 = s.ak.str.strptime() - breakpoint() s3 = s2.ak.dt.strftime("%FT%T") assert s3.tolist() == [["2024-08-01T01:00:00", None, "2024-08-01T01:01:00"]] From 34bbd59ffdd968760f2d0af9e6b277ca455389c2 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 15 Aug 2024 16:53:23 -0400 Subject: [PATCH 3/4] akip win --- tests/test_dt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_dt.py b/tests/test_dt.py index bc4b886..19582e6 100644 --- a/tests/test_dt.py +++ b/tests/test_dt.py @@ -1,12 +1,12 @@ import datetime +import sys import pytest -import pyarrow as pa - import akimbo.pandas # noqa pd = pytest.importorskip("pandas") +WIN = "nt" in sys.platform def test_cast(): @@ -73,6 +73,7 @@ def test_mixed_record(): assert s2.to_list() == data +@pytest.mark.skipif(WIN, reason="arrow on windows needs a timezone database") def test_text_conversion(): s = pd.Series([["2024-08-01T01:00:00", None, "2024-08-01T01:01:00"]]) s2 = s.ak.str.strptime() From 08b1d767f072c8124a3f7439f8e47dd5d5c7849f Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 15 Aug 2024 16:58:02 -0400 Subject: [PATCH 4/4] try that again --- tests/test_dt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dt.py b/tests/test_dt.py index 19582e6..9ae16ea 100644 --- a/tests/test_dt.py +++ b/tests/test_dt.py @@ -1,12 +1,12 @@ import datetime -import sys +import os import pytest import akimbo.pandas # noqa pd = pytest.importorskip("pandas") -WIN = "nt" in sys.platform +WIN = os.name == "nt" def test_cast():