Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-39852: [Python] Support creating Binary/StringView arrays from python objects #39853

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,10 @@ class PyValue {
return view.ParseString(obj);
}

// Conversion overload picked when the target type tag is BinaryViewType.
// The type tag and the options argument are unnamed and unused here; the
// Python object is handed to `view.ParseString` and the result of that call
// is returned as this function's Status.
static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) {
return view.ParseString(obj);
}

static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
PyBytesView& view) {
ARROW_RETURN_NOT_OK(view.ParseString(obj));
Expand All @@ -499,8 +503,8 @@ class PyValue {
}

template <typename T>
static enable_if_string<T, Status> Convert(const T*, const O& options, I obj,
PyBytesView& view) {
static enable_if_t<is_string_type<T>::value || is_string_view_type<T>::value, Status>
Convert(const T*, const O& options, I obj, PyBytesView& view) {
if (options.strict) {
// Strict conversion, force output to be unicode / utf8 and validate that
// any binary values are utf8
Expand Down Expand Up @@ -570,18 +574,12 @@ struct PyConverterTrait;

template <typename T>
struct PyConverterTrait<
T,
enable_if_t<(!is_nested_type<T>::value && !is_interval_type<T>::value &&
!is_extension_type<T>::value && !is_binary_view_like_type<T>::value) ||
std::is_same<T, MonthDayNanoIntervalType>::value>> {
T, enable_if_t<(!is_nested_type<T>::value && !is_interval_type<T>::value &&
!is_extension_type<T>::value) ||
std::is_same<T, MonthDayNanoIntervalType>::value>> {
using type = PyPrimitiveConverter<T>;
};

// Partial specialization of PyConverterTrait selected through
// enable_if_binary_view_like<T>. The body declares no `type` member, so no
// converter class can be looked up through this specialization.
// NOTE(review): presumably a placeholder for view types that have no
// Python-to-Arrow converter — confirm against the rest of the file.
template <typename T>
struct PyConverterTrait<T, enable_if_binary_view_like<T>> {
// not implemented
};

template <typename T>
struct PyConverterTrait<T, enable_if_list_like<T>> {
using type = PyListConverter<T>;
Expand Down Expand Up @@ -699,11 +697,22 @@ class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::
PyBytesView view_;
};

// Primary template of OffsetTypeTrait: exposes T's nested `offset_type` as
// the member alias `type`. The second parameter defaults to void, which
// leaves room for partial specializations to be selected via an enable_if
// style condition.
template <typename T, typename Enable = void>
struct OffsetTypeTrait {
using type = typename T::offset_type;
};

// Partial specialization of OffsetTypeTrait selected through
// enable_if_binary_view_like<T>: `type` is fixed to int64_t instead of being
// read from a nested T::offset_type.
// NOTE(review): presumably needed because binary-view-like types declare no
// `offset_type` of their own — confirm against the Arrow type definitions.
template <typename T>
struct OffsetTypeTrait<T, enable_if_binary_view_like<T>> {
using type = int64_t;
};

template <typename T>
class PyPrimitiveConverter<T, enable_if_base_binary<T>>
class PyPrimitiveConverter<
T, enable_if_t<is_base_binary_type<T>::value || is_binary_view_like_type<T>::value>>
: public PrimitiveConverter<T, PyConverter> {
public:
using OffsetType = typename T::offset_type;
using OffsetType = typename OffsetTypeTrait<T>::type;

Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
Expand Down
19 changes: 16 additions & 3 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,6 +763,16 @@ def test_sequence_unicode():
assert arr.to_pylist() == data


@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()])
def test_sequence_unicode_explicit_type(ty):
    """Build an array from unicode strings (with one None) for an explicit type.

    Checks the resulting length, null count, reported type and the
    round trip back to a Python list.
    """
    values = ['foo', 'bar', None, 'mañana']
    result = pa.array(values, type=ty)

    assert len(result) == 4
    assert result.null_count == 1
    assert result.type == ty
    assert result.to_pylist() == values


def check_array_mixed_unicode_bytes(binary_type, string_type):
values = ['qux', b'foo', bytearray(b'barz')]
b_values = [b'qux', b'foo', b'barz']
Expand All @@ -787,6 +797,7 @@ def check_array_mixed_unicode_bytes(binary_type, string_type):
def test_array_mixed_unicode_bytes():
    """Run the mixed unicode/bytes check for each (binary, string) type pair."""
    type_pairs = [
        (pa.binary(), pa.string()),
        (pa.large_binary(), pa.large_string()),
        (pa.binary_view(), pa.string_view()),
    ]
    for binary_type, string_type in type_pairs:
        check_array_mixed_unicode_bytes(binary_type, string_type)


@pytest.mark.large_memory
Expand Down Expand Up @@ -818,7 +829,7 @@ def test_large_binary_value(ty):


@pytest.mark.large_memory
@pytest.mark.parametrize("ty", [pa.binary(), pa.string()])
@pytest.mark.parametrize("ty", [pa.binary(), pa.string(), pa.string_view()])
def test_string_too_large(ty):
# Construct a binary array with a single value larger than 4GB
s = b"0123456789abcdefghijklmnopqrstuvwxyz"
Expand All @@ -836,15 +847,15 @@ def test_sequence_bytes():
u1.decode('utf-8'), # unicode gets encoded,
bytearray(b'bar'),
None]
for ty in [None, pa.binary(), pa.large_binary()]:
for ty in [None, pa.binary(), pa.large_binary(), pa.binary_view()]:
arr = pa.array(data, type=ty)
assert len(arr) == 6
assert arr.null_count == 1
assert arr.type == ty or pa.binary()
assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None]


@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()])
@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()])
def test_sequence_utf8_to_unicode(ty):
# ARROW-1225
data = [b'foo', None, b'bar']
Expand Down Expand Up @@ -2431,6 +2442,8 @@ def test_array_from_pylist_offset_overflow():
pa.binary(3)),
([b"a"], [pa.scalar("a", type=pa.large_binary())], pa.large_binary()),
(["a"], [pa.scalar("a", type=pa.large_string())], pa.large_string()),
([b"a"], [pa.scalar("a", type=pa.binary_view())], pa.binary_view()),
(["a"], [pa.scalar("a", type=pa.string_view())], pa.string_view()),
(
["a"],
[pa.scalar("a", type=pa.dictionary(pa.int64(), pa.string()))],
Expand Down
28 changes: 4 additions & 24 deletions python/pyarrow/tests/test_scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,8 @@
(b"bytes", None, pa.BinaryScalar),
("largestring", pa.large_string(), pa.LargeStringScalar),
(b"largebytes", pa.large_binary(), pa.LargeBinaryScalar),
# TODO(GH-39633) pa.scalar(..) requires python->arrow conversion to be implemented
# ("string_view", pa.string_view(), pa.StringViewScalar),
# (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
("string_view", pa.string_view(), pa.StringViewScalar),
(b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
(b"abc", pa.binary(3), pa.FixedSizeBinaryScalar),
([1, 2, 3], None, pa.ListScalar),
([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar),
Expand Down Expand Up @@ -492,7 +491,7 @@ def test_month_day_nano_interval():
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.string(), pa.StringScalar),
(pa.large_string(), pa.LargeStringScalar),
# (pa.string_view(), pa.StringViewScalar),
(pa.string_view(), pa.StringViewScalar),
])
def test_string(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
Expand All @@ -507,30 +506,11 @@ def test_string(value, ty, scalar_typ):
assert buf.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', ['foo', 'mañana'])
def test_string_view(value):
    """Check a string-view scalar taken from a builder-produced array.

    Verifies the scalar class, conversion back to a Python value, its
    repr/str, and the bytes exposed through its buffer.
    """
    # TODO: replace with normal scalar construction
    view_builder = pa.lib.StringViewBuilder()
    view_builder.append(value)
    built = view_builder.finish()

    scalar = built[0]
    assert isinstance(scalar, pa.StringViewScalar)
    assert scalar.as_py() == value
    assert scalar.as_py() != 'something'
    assert repr(value) in repr(scalar)
    assert str(scalar) == str(value)

    raw = scalar.as_buffer()
    assert isinstance(raw, pa.Buffer)
    assert raw.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', [b'foo', b'bar'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.binary(), pa.BinaryScalar),
(pa.large_binary(), pa.LargeBinaryScalar),
# (pa.binary_view(), pa.BinaryViewScalar),
(pa.binary_view(), pa.BinaryViewScalar),
])
def test_binary(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
Expand Down
Loading