From 8622dd3bdfd2ea407dca8b06d06cf2ac815efa43 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jan 2024 14:41:50 +0100 Subject: [PATCH 1/2] GH-39852: [Python] Support creating Binary/StringView arrays from python objects --- .../src/arrow/python/python_to_arrow.cc | 64 +++++++++++++++---- python/pyarrow/tests/test_convert_builtin.py | 19 +++++- python/pyarrow/tests/test_scalars.py | 28 ++------ 3 files changed, 73 insertions(+), 38 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index d1d94ac17a13e..595fdc2708abe 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -486,6 +486,10 @@ class PyValue { return view.ParseString(obj); } + static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) { + return view.ParseString(obj); + } + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { ARROW_RETURN_NOT_OK(view.ParseString(obj)); @@ -499,8 +503,8 @@ class PyValue { } template - static enable_if_string Convert(const T*, const O& options, I obj, - PyBytesView& view) { + static enable_if_t::value || is_string_view_type::value, Status> + Convert(const T*, const O& options, I obj, PyBytesView& view) { if (options.strict) { // Strict conversion, force output to be unicode / utf8 and validate that // any binary values are utf8 @@ -570,18 +574,12 @@ struct PyConverterTrait; template struct PyConverterTrait< - T, - enable_if_t<(!is_nested_type::value && !is_interval_type::value && - !is_extension_type::value && !is_binary_view_like_type::value) || - std::is_same::value>> { + T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && + !is_extension_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; -template -struct PyConverterTrait> { - // not implemented -}; - template struct PyConverterTrait> { using type = PyListConverter; @@ -745,6 +743,50 @@ class PyPrimitiveConverter> bool observed_binary_ = false; }; +template +class PyPrimitiveConverter::value>> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + } else { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + if (!view_.is_utf8) { + // observed binary value + observed_binary_ = true; + } + // Since we don't know the varying length input size in advance, we need to + // reserve space in the value builder one by one. ReserveData raises CapacityError + // if the value would not fit into the array. + ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes, + static_cast(view_.size)); + } + return Status::OK(); + } + + Result> ToArray() override { + ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::ToArray())); + if (observed_binary_) { + // if we saw any non-unicode, cast results to BinaryArray + auto binary_type = TypeTraits::type_singleton(); + return array->View(binary_type); + } else { + return array; + } + } + + protected: + PyBytesView view_; + bool observed_binary_ = false; +}; + template class PyDictionaryConverter> : public DictionaryConverter { diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 49c4f1a6e79d6..55ea28f50fbb3 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -763,6 +763,16 @@ def test_sequence_unicode(): assert arr.to_pylist() == data +@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()]) +def test_sequence_unicode_explicit_type(ty): + data = ['foo', 'bar', None, 'mañana'] + arr = pa.array(data, type=ty) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == ty + assert arr.to_pylist() == data + + def check_array_mixed_unicode_bytes(binary_type, string_type): values = ['qux', b'foo', bytearray(b'barz')] b_values = [b'qux', b'foo', b'barz'] @@ -787,6 +797,7 @@ def check_array_mixed_unicode_bytes(binary_type, string_type): def test_array_mixed_unicode_bytes(): check_array_mixed_unicode_bytes(pa.binary(), pa.string()) check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string()) + check_array_mixed_unicode_bytes(pa.binary_view(), pa.string_view()) @pytest.mark.large_memory @@ -818,7 +829,7 @@ def test_large_binary_value(ty): @pytest.mark.large_memory -@pytest.mark.parametrize("ty", [pa.binary(), pa.string()]) +@pytest.mark.parametrize("ty", [pa.binary(), pa.string(), pa.string_view()]) def test_string_too_large(ty): # Construct a binary array with a single value larger than 4GB s = b"0123456789abcdefghijklmnopqrstuvwxyz" @@ -836,7 +847,7 @@ def test_sequence_bytes(): u1.decode('utf-8'), # unicode gets encoded, bytearray(b'bar'), None] - for ty in [None, pa.binary(), pa.large_binary()]: + for ty in [None, pa.binary(), pa.large_binary(), pa.binary_view()]: arr = pa.array(data, type=ty) assert len(arr) == 6 assert arr.null_count == 1 @@ -844,7 +855,7 @@ def test_sequence_bytes(): assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None] -@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()]) +@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()]) def test_sequence_utf8_to_unicode(ty): # ARROW-1225 data = [b'foo', None, b'bar'] @@ -2431,6 +2442,8 @@ def test_array_from_pylist_offset_overflow(): pa.binary(3)), ([b"a"], [pa.scalar("a", type=pa.large_binary())], pa.large_binary()), (["a"], [pa.scalar("a", type=pa.large_string())], pa.large_string()), + ([b"a"], [pa.scalar("a", type=pa.binary_view())], pa.binary_view()), + (["a"], [pa.scalar("a", type=pa.string_view())], pa.string_view()), ( ["a"], [pa.scalar("a", type=pa.dictionary(pa.int64(), pa.string()))], diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 4a239b23d5676..eed5f045be945 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -51,9 +51,8 @@ (b"bytes", None, pa.BinaryScalar), ("largestring", pa.large_string(), pa.LargeStringScalar), (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar), - # TODO(GH-39633) pa.scalar(..) requires python->arrow conversion to be implemented - # ("string_view", pa.string_view(), pa.StringViewScalar), - # (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar), + ("string_view", pa.string_view(), pa.StringViewScalar), + (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar), (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar), ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), @@ -492,7 +491,7 @@ def test_month_day_nano_interval(): @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.string(), pa.StringScalar), (pa.large_string(), pa.LargeStringScalar), - # (pa.string_view(), pa.StringViewScalar), + (pa.string_view(), pa.StringViewScalar), ]) def test_string(value, ty, scalar_typ): s = pa.scalar(value, type=ty) @@ -507,30 +506,11 @@ def test_string(value, ty, scalar_typ): assert buf.to_pybytes() == value.encode() -@pytest.mark.parametrize('value', ['foo', 'mañana']) -def test_string_view(value): - # TODO: replace with normal scalar construction - builder = pa.lib.StringViewBuilder() - builder.append(value) - arr = builder.finish() - - s = arr[0] - assert isinstance(s, pa.StringViewScalar) - assert s.as_py() == value - assert s.as_py() != 'something' - assert repr(value) in repr(s) - assert str(s) == str(value) - - buf = s.as_buffer() - assert isinstance(buf, pa.Buffer) - assert buf.to_pybytes() == value.encode() - - @pytest.mark.parametrize('value', [b'foo', b'bar']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.binary(), pa.BinaryScalar), (pa.large_binary(), pa.LargeBinaryScalar), - # (pa.binary_view(), pa.BinaryViewScalar), + (pa.binary_view(), pa.BinaryViewScalar), ]) def test_binary(value, ty, scalar_typ): s = pa.scalar(value, type=ty) From 8de3cfc52a5fe9547f4ffc58bd04cbe9349701c3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jan 2024 18:16:25 +0100 Subject: [PATCH 2/2] reuse base binary converter with templating --- .../src/arrow/python/python_to_arrow.cc | 57 ++++--------------- 1 file changed, 12 insertions(+), 45 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 595fdc2708abe..3c4d59d6594a2 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -697,56 +697,23 @@ class PyPrimitiveConverter:: PyBytesView view_; }; -template -class PyPrimitiveConverter> - : public PrimitiveConverter { - public: - using OffsetType = typename T::offset_type; - - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - this->primitive_builder_->UnsafeAppendNull(); - } else if (arrow::py::is_scalar(value)) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, - arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); - } else { - ARROW_RETURN_NOT_OK( - PyValue::Convert(this->primitive_type_, this->options_, value, view_)); - if (!view_.is_utf8) { - // observed binary value - observed_binary_ = true; - } - // Since we don't know the varying length input size in advance, we need to - // reserve space in the value builder one by one. ReserveData raises CapacityError - // if the value would not fit into the array. - ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); - this->primitive_builder_->UnsafeAppend(view_.bytes, - static_cast(view_.size)); - } - return Status::OK(); - } - - Result> ToArray() override { - ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::ToArray())); - if (observed_binary_) { - // if we saw any non-unicode, cast results to BinaryArray - auto binary_type = TypeTraits::type_singleton(); - return array->View(binary_type); - } else { - return array; - } - } +template +struct OffsetTypeTrait { + using type = typename T::offset_type; +}; - protected: - PyBytesView view_; - bool observed_binary_ = false; +template +struct OffsetTypeTrait> { + using type = int64_t; }; template -class PyPrimitiveConverter::value>> +class PyPrimitiveConverter< + T, enable_if_t::value || is_binary_view_like_type::value>> : public PrimitiveConverter { public: + using OffsetType = typename OffsetTypeTrait::type; + Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { this->primitive_builder_->UnsafeAppendNull(); @@ -766,7 +733,7 @@ class PyPrimitiveConverter::value>> // if the value would not fit into the array. ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); this->primitive_builder_->UnsafeAppend(view_.bytes, - static_cast(view_.size)); + static_cast(view_.size)); } return Status::OK(); }