Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes #39652

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/python/api/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ may expose data type-specific methods or properties.
FixedSizeBinaryArray
LargeBinaryArray
LargeStringArray
BinaryViewArray
StringViewArray
Time32Array
Time64Array
Date32Array
Expand Down Expand Up @@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties.
FixedSizeBinaryScalar
LargeBinaryScalar
LargeStringScalar
BinaryViewScalar
StringViewScalar
Time32Scalar
Time64Scalar
Date32Scalar
Expand Down
4 changes: 4 additions & 0 deletions docs/source/python/api/datatypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas.
large_binary
large_string
large_utf8
binary_view
string_view
decimal128
list_
large_list
Expand Down Expand Up @@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category
is_large_binary
is_large_unicode
is_large_string
is_binary_view
is_string_view
is_fixed_size_binary
is_map
is_dictionary
Expand Down
7 changes: 4 additions & 3 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def print_entry(label, value):
time32, time64, timestamp, date32, date64, duration,
month_day_nano_interval,
float16, float32, float64,
binary, string, utf8,
binary, string, utf8, binary_view, string_view,
large_binary, large_string, large_utf8,
decimal128, decimal256,
list_, large_list, map_, struct,
Expand Down Expand Up @@ -205,6 +205,7 @@ def print_entry(label, value):
FixedSizeListArray, UnionArray,
BinaryArray, StringArray,
LargeBinaryArray, LargeStringArray,
BinaryViewArray, StringViewArray,
FixedSizeBinaryArray,
DictionaryArray,
Date32Array, Date64Array, TimestampArray,
Expand All @@ -223,8 +224,8 @@ def print_entry(label, value):
Time32Scalar, Time64Scalar,
TimestampScalar, DurationScalar,
MonthDayNanoIntervalScalar,
BinaryScalar, LargeBinaryScalar,
StringScalar, LargeStringScalar,
BinaryScalar, LargeBinaryScalar, BinaryViewScalar,
StringScalar, LargeStringScalar, StringViewScalar,
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, ExtensionScalar)
Expand Down
14 changes: 14 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -2943,6 +2943,12 @@ cdef class LargeStringArray(Array):
null_count, offset)


cdef class StringViewArray(Array):
    """
    Concrete class for Arrow arrays of string (or utf8) view data type.

    The class body defines no methods or properties of its own.
    """
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth adding TODOs or linking future GH issues?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's unclear if we need to add any specific method here (at least it's not needed for the current TODO items in the parent issue), so going to leave this as is for now.



cdef class BinaryArray(Array):
"""
Concrete class for Arrow arrays of variable-sized binary data type.
Expand All @@ -2969,6 +2975,12 @@ cdef class LargeBinaryArray(Array):
return (<CLargeBinaryArray*> self.ap).total_values_length()


cdef class BinaryViewArray(Array):
    """
    Concrete class for Arrow arrays of variable-sized binary view data type.

    The class body defines no methods or properties of its own.
    """


cdef class DictionaryArray(Array):
"""
Concrete class for dictionary-encoded Arrow arrays.
Expand Down Expand Up @@ -3670,6 +3682,8 @@ cdef dict _array_classes = {
_Type_STRING: StringArray,
_Type_LARGE_BINARY: LargeBinaryArray,
_Type_LARGE_STRING: LargeStringArray,
_Type_BINARY_VIEW: BinaryViewArray,
_Type_STRING_VIEW: StringViewArray,
_Type_DICTIONARY: DictionaryArray,
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
_Type_DECIMAL128: Decimal128Array,
Expand Down
66 changes: 66 additions & 0 deletions python/pyarrow/builder.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable):

def __len__(self):
return self.builder.get().length()


cdef class StringViewBuilder(_Weakrefable):
    """
    Incremental builder for UTF8 string view arrays.

    Values are added one at a time (or from an iterable), nulls included,
    and ``finish`` hands the accumulated data back as a pyarrow.Array
    (type='string_view').
    """
    cdef:
        unique_ptr[CStringViewBuilder] builder

    def __cinit__(self, MemoryPool memory_pool=None):
        # Resolve the (optional) Python-level pool to its C counterpart and
        # allocate the underlying C++ builder on it.
        cdef CMemoryPool* c_pool = maybe_unbox_memory_pool(memory_pool)
        self.builder.reset(new CStringViewBuilder(c_pool))

    def append(self, value):
        """
        Add one value to the builder.

        ``None`` and ``np.nan`` are recorded as nulls; ``str`` and ``bytes``
        objects are appended as values.

        Parameters
        ----------
        value : string/bytes or np.nan/None
            The value to append to the string array builder.

        Raises
        ------
        TypeError
            If ``value`` is neither a null marker nor a str/bytes object.
        """
        # Null markers are matched by identity, so only the ``np.nan``
        # singleton itself (not an arbitrary NaN float) counts as null.
        if value is None or value is np.nan:
            self.builder.get().AppendNull()
            return
        if not isinstance(value, (bytes, str)):
            raise TypeError('StringViewBuilder only accepts string objects')
        self.builder.get().Append(tobytes(value))

    def append_values(self, values):
        """
        Add every element of an iterable, in order.

        Parameters
        ----------
        values : iterable of string/bytes or np.nan/None values
            The values to append to the string array builder.
        """
        for item in values:
            self.append(item)

    def finish(self):
        """
        Return result of builder as an Array object; also resets the builder.

        Returns
        -------
        array : pyarrow.Array
        """
        cdef shared_ptr[CArray] result
        # NOTE(review): whatever Finish() returns is discarded here; if it
        # reports a status, a failure would go unnoticed -- confirm.
        with nogil:
            self.builder.get().Finish(&result)
        return pyarrow_wrap_array(result)

    @property
    def null_count(self):
        """Null count as reported by the underlying builder."""
        return self.builder.get().null_count()

    def __len__(self):
        """Length as reported by the underlying builder."""
        return self.builder.get().length()
9 changes: 9 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_LARGE_BINARY" arrow::Type::LARGE_BINARY"
_Type_LARGE_STRING" arrow::Type::LARGE_STRING"
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
_Type_BINARY_VIEW" arrow::Type::BINARY_VIEW"
_Type_STRING_VIEW" arrow::Type::STRING_VIEW"

_Type_LIST" arrow::Type::LIST"
_Type_LARGE_LIST" arrow::Type::LARGE_LIST"
Expand Down Expand Up @@ -1295,7 +1297,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil:

cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder):
CStringBuilder(CMemoryPool* pool)
CStatus Append(const c_string& value)

cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder):
CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool)
CStatus Append(const char* value, int32_t length)

cdef cppclass CStringViewBuilder" arrow::StringViewBuilder"(CBinaryViewBuilder):
CStringViewBuilder(CMemoryPool* pool)
CStatus Append(const c_string& value)

cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder):
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,14 @@ cdef class BinaryArray(Array):
pass


# Declaration of the StringViewArray extension type; it declares no
# attributes or methods of its own in addition to those of Array.
cdef class StringViewArray(Array):
    pass


# Declaration of the BinaryViewArray extension type; it declares no
# attributes or methods of its own in addition to those of Array.
cdef class BinaryViewArray(Array):
    pass


cdef class DictionaryArray(Array):
cdef:
object _indices, _dictionary
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ Type_STRING = _Type_STRING
Type_LARGE_BINARY = _Type_LARGE_BINARY
Type_LARGE_STRING = _Type_LARGE_STRING
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
Type_BINARY_VIEW = _Type_BINARY_VIEW
Type_STRING_VIEW = _Type_STRING_VIEW
Type_LIST = _Type_LIST
Type_LARGE_LIST = _Type_LARGE_LIST
Type_MAP = _Type_MAP
Expand Down
10 changes: 10 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar):
pass


cdef class BinaryViewScalar(BinaryScalar):
    """
    Scalar class for the binary view type.

    Subclass of BinaryScalar that adds no members of its own.
    """
    pass


cdef class StringViewScalar(StringScalar):
    """
    Scalar class for the string view type.

    Subclass of StringScalar that adds no members of its own.
    """
    pass


cdef class ListScalar(Scalar):
"""
Concrete class for list-like scalars.
Expand Down Expand Up @@ -1051,8 +1059,10 @@ cdef dict _scalar_classes = {
_Type_BINARY: BinaryScalar,
_Type_LARGE_BINARY: LargeBinaryScalar,
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar,
_Type_BINARY_VIEW: BinaryViewScalar,
_Type_STRING: StringScalar,
_Type_LARGE_STRING: LargeStringScalar,
_Type_STRING_VIEW: StringViewScalar,
_Type_LIST: ListScalar,
_Type_LARGE_LIST: LargeListScalar,
_Type_FIXED_SIZE_LIST: FixedSizeListScalar,
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/src/arrow/python/helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
GET_PRIMITIVE_TYPE(STRING, utf8);
GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view);
GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view);
GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
default:
return nullptr;
Expand Down
21 changes: 20 additions & 1 deletion python/pyarrow/tests/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import numpy as np

import pyarrow as pa
from pyarrow.lib import StringBuilder
from pyarrow.lib import StringBuilder, StringViewBuilder


def test_weakref():
Expand Down Expand Up @@ -65,3 +65,22 @@ def test_string_builder_append_after_finish():
sbuilder.append("No effect")
expected = [None, None, "text", None, "other text"]
assert arr.to_pylist() == expected

rok marked this conversation as resolved.
Show resolved Hide resolved

def test_string_view_builder():
    """Append bytes, strings and nulls, then check the finished array."""
    expected = [
        "a byte string",
        "a string",
        "a longer not-inlined string",
        None,
        None,
        "text",
    ]

    sv_builder = StringViewBuilder()
    # Single appends: a bytes value, two str values and the np.nan null marker.
    for item in (b"a byte string", "a string",
                 "a longer not-inlined string", np.nan):
        sv_builder.append(item)
    # Bulk append: one more null (None) followed by a regular value.
    sv_builder.append_values([None, "text"])

    # State of the builder before it is finished.
    assert len(sv_builder) == 6
    assert sv_builder.null_count == 2

    result = sv_builder.finish()
    assert isinstance(result, pa.Array)
    assert result.null_count == 2
    assert result.type == 'string_view'
    assert result.to_pylist() == expected
4 changes: 4 additions & 0 deletions python/pyarrow/tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ def test_set_timezone_db_path_non_windows():
pa.UnionArray,
pa.BinaryArray,
pa.StringArray,
pa.BinaryViewArray,
pa.StringViewArray,
pa.FixedSizeBinaryArray,
pa.DictionaryArray,
pa.Date32Array,
Expand Down Expand Up @@ -221,6 +223,8 @@ def test_set_timezone_db_path_non_windows():
pa.StringScalar,
pa.BinaryScalar,
pa.FixedSizeBinaryScalar,
pa.BinaryViewScalar,
pa.StringViewScalar,
pa.ListScalar,
pa.LargeListScalar,
pa.MapScalar,
Expand Down
28 changes: 26 additions & 2 deletions python/pyarrow/tests/test_scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
(b"bytes", None, pa.BinaryScalar),
("largestring", pa.large_string(), pa.LargeStringScalar),
(b"largebytes", pa.large_binary(), pa.LargeBinaryScalar),
# TODO(GH-39633) pa.scalar(..) requires python->arrow conversion to be implemented
# ("string_view", pa.string_view(), pa.StringViewScalar),
# (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
(b"abc", pa.binary(3), pa.FixedSizeBinaryScalar),
([1, 2, 3], None, pa.ListScalar),
([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar),
Expand Down Expand Up @@ -488,7 +491,8 @@ def test_month_day_nano_interval():
@pytest.mark.parametrize('value', ['foo', 'mañana'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.string(), pa.StringScalar),
(pa.large_string(), pa.LargeStringScalar)
(pa.large_string(), pa.LargeStringScalar),
# (pa.string_view(), pa.StringViewScalar),
])
def test_string(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
Expand All @@ -503,10 +507,30 @@ def test_string(value, ty, scalar_typ):
assert buf.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', ['foo', 'mañana'])
def test_string_view(value):
    """Check a StringViewScalar taken from a one-element built array."""
    # TODO: replace with normal scalar construction
    sv_builder = pa.lib.StringViewBuilder()
    sv_builder.append(value)
    scalar = sv_builder.finish()[0]

    # Scalar type and Python round-trip.
    assert isinstance(scalar, pa.StringViewScalar)
    assert scalar.as_py() == value
    assert scalar.as_py() != 'something'

    # Textual representations.
    assert repr(value) in repr(scalar)
    assert str(scalar) == str(value)

    # Buffer view must hold the encoded form of the value.
    data = scalar.as_buffer()
    assert isinstance(data, pa.Buffer)
    assert data.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', [b'foo', b'bar'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.binary(), pa.BinaryScalar),
(pa.large_binary(), pa.LargeBinaryScalar)
(pa.large_binary(), pa.LargeBinaryScalar),
# (pa.binary_view(), pa.BinaryViewScalar),
])
def test_binary(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def get_many_types():
pa.binary(10),
pa.large_string(),
pa.large_binary(),
pa.string_view(),
pa.binary_view(),
pa.list_(pa.int32()),
pa.list_(pa.int32(), 2),
pa.large_list(pa.uint16()),
Expand Down Expand Up @@ -244,6 +246,12 @@ def test_is_binary_string():
assert types.is_fixed_size_binary(pa.binary(5))
assert not types.is_fixed_size_binary(pa.binary())

assert types.is_string_view(pa.string_view())
assert not types.is_string_view(pa.string())
assert types.is_binary_view(pa.binary_view())
assert not types.is_binary_view(pa.binary())
assert not types.is_binary_view(pa.string_view())


def test_is_temporal_date_time_timestamp():
date_types = [pa.date32(), pa.date64()]
Expand Down
Loading
Loading