Skip to content

Commit

Permalink
apacheGH-40092: [Python] Support Binary/StringView conversion to numpy/pandas (apache#40093)
Browse files Browse the repository at this point in the history

Last step for Binary/StringView support in Python (apache#39633), now adding it to the arrow->pandas/numpy conversion code path.
* Closes: apache#40092

Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
jorisvandenbossche authored and zanmato1984 committed Feb 28, 2024
1 parent 5021ff0 commit 0891996
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 3 deletions.
22 changes: 19 additions & 3 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,13 @@ struct WrapBytes<LargeStringType> {
}
};

/// WrapBytes specialization for StringViewType values.
///
/// Converts a raw character buffer into a Python object by delegating to
/// PyUnicode_FromStringAndSize, the same call used for the other string
/// flavours handled by WrapBytes.
template <>
struct WrapBytes<StringViewType> {
  /// \param data   pointer to the first byte of the value
  /// \param length number of bytes to read starting at `data`
  /// \return whatever PyUnicode_FromStringAndSize yields for the buffer,
  ///         forwarded unchanged (NOTE(review): per the CPython docs this is
  ///         NULL with a Python error set on failure -- callers are expected
  ///         to check; confirm at the call site).
  static inline PyObject* Wrap(const char* data, int64_t length) {
    PyObject* py_str = PyUnicode_FromStringAndSize(data, length);
    return py_str;
  }
};

template <>
struct WrapBytes<BinaryType> {
static inline PyObject* Wrap(const char* data, int64_t length) {
Expand All @@ -147,6 +154,13 @@ struct WrapBytes<LargeBinaryType> {
}
};

/// WrapBytes specialization for BinaryViewType values.
///
/// Converts a raw byte buffer into a Python object by delegating to
/// PyBytes_FromStringAndSize, the same call used for the other binary
/// flavours handled by WrapBytes.
template <>
struct WrapBytes<BinaryViewType> {
  /// \param data   pointer to the first byte of the value
  /// \param length number of bytes to read starting at `data`
  /// \return whatever PyBytes_FromStringAndSize yields for the buffer,
  ///         forwarded unchanged (NOTE(review): per the CPython docs this is
  ///         NULL with a Python error set on failure -- callers are expected
  ///         to check; confirm at the call site).
  static inline PyObject* Wrap(const char* data, int64_t length) {
    PyObject* py_bytes = PyBytes_FromStringAndSize(data, length);
    return py_bytes;
  }
};

template <>
struct WrapBytes<FixedSizeBinaryType> {
static inline PyObject* Wrap(const char* data, int64_t length) {
Expand Down Expand Up @@ -1154,7 +1168,8 @@ struct ObjectWriterVisitor {
}

template <typename Type>
enable_if_t<is_base_binary_type<Type>::value || is_fixed_size_binary_type<Type>::value,
enable_if_t<is_base_binary_type<Type>::value || is_binary_view_like_type<Type>::value ||
is_fixed_size_binary_type<Type>::value,
Status>
Visit(const Type& type) {
auto WrapValue = [](const std::string_view& view, PyObject** out) {
Expand Down Expand Up @@ -1355,8 +1370,7 @@ struct ObjectWriterVisitor {
std::is_same<ExtensionType, Type>::value ||
(std::is_base_of<IntervalType, Type>::value &&
!std::is_same<MonthDayNanoIntervalType, Type>::value) ||
std::is_base_of<UnionType, Type>::value ||
std::is_base_of<BinaryViewType, Type>::value,
std::is_base_of<UnionType, Type>::value,
Status>
Visit(const Type& type) {
return Status::NotImplemented("No implemented conversion to object dtype: ",
Expand Down Expand Up @@ -2086,8 +2100,10 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
break;
case Type::STRING: // fall through
case Type::LARGE_STRING: // fall through
case Type::STRING_VIEW: // fall through
case Type::BINARY: // fall through
case Type::LARGE_BINARY:
case Type::BINARY_VIEW:
case Type::NA: // fall through
case Type::FIXED_SIZE_BINARY: // fall through
case Type::STRUCT: // fall through
Expand Down
14 changes: 14 additions & 0 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1760,6 +1760,20 @@ def test_large_string(self):
_check_pandas_roundtrip(
df, schema=pa.schema([('a', pa.large_string())]))

def test_binary_view(self):
    # Round-trip a bytes Series (including an empty value and a null)
    # through the binary_view Arrow type, first as a bare Series ...
    values = pd.Series([b'123', b'', b'a', None])
    _check_series_roundtrip(values, type_=pa.binary_view())

    # ... and then as a single-column DataFrame with an explicit schema.
    frame = pd.DataFrame({'a': values})
    _check_pandas_roundtrip(
        frame,
        schema=pa.schema([('a', pa.binary_view())]),
    )

def test_string_view(self):
    # Round-trip a str Series (including an empty value and a null)
    # through the string_view Arrow type, first as a bare Series ...
    values = pd.Series(['123', '', 'a', None])
    _check_series_roundtrip(values, type_=pa.string_view())

    # ... and then as a single-column DataFrame with an explicit schema.
    frame = pd.DataFrame({'a': values})
    _check_pandas_roundtrip(
        frame,
        schema=pa.schema([('a', pa.string_view())]),
    )

def test_table_empty_str(self):
values = ['', '', '', '', '']
df = pd.DataFrame({'strings': values})
Expand Down

0 comments on commit 0891996

Please sign in to comment.