Skip to content

Commit

Permalink
[Python] Append struct after appending children
Browse files Browse the repository at this point in the history
During conversion from Python to Arrow, when a struct's child
hits a capacity error and chunking is triggered, this can leave
the Finish'd chunk in an invalid state since the struct's length
does not match the length of its children.

This change simply tries to Append the children first, and only
if successful will Append the struct. This is safe because the
order of Append'ing between the struct and its child is not
specified. It is only specified that they must be consistent
with each other.
  • Loading branch information
Mike Lui committed Aug 27, 2023
1 parent c9bfca5 commit 7aa875a
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 6 deletions.
16 changes: 10 additions & 6 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -926,14 +926,14 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
}
switch (input_kind_) {
case InputKind::DICT:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendDict(value);
RETURN_NOT_OK(AppendDict(value));
return this->struct_builder_->Append();
case InputKind::TUPLE:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendTuple(value);
RETURN_NOT_OK(AppendTuple(value));
return this->struct_builder_->Append();
case InputKind::ITEMS:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendItems(value);
RETURN_NOT_OK(AppendItems(value));
return this->struct_builder_->Append();
default:
RETURN_NOT_OK(InferInputKind(value));
return Append(value);
Expand All @@ -944,6 +944,10 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
Status Init(MemoryPool* pool) override {
RETURN_NOT_OK((StructConverter<PyConverter, PyConverterTrait>::Init(pool)));

// This implementation will check the child values before appending itself,
// so no rewind is necessary
this->rewind_on_overflow_ = false;

// Store the field names as a PyObjects for dict matching
num_fields_ = this->struct_type_->num_fields();
bytes_field_names_.reset(PyList_New(num_fields_));
Expand Down
59 changes: 59 additions & 0 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4936,3 +4936,62 @@ def test_array_conversion_for_datetime():

result = arr.to_pandas()
tm.assert_series_equal(result, series)


@pytest.mark.large_memory
def test_nested_chunking_valid():
# GH-32439
# https://github.com/apache/arrow/issues/32439
# Chunking can cause arrays to be in invalid state
# when nested types are involved.
# Here we simply ensure we validate correctly.

x = "0" * 720000000
df = pd.DataFrame({"strings": [x, x, x]})
tab = pa.Table.from_pandas(df)
# we expect to trigger chunking internally
# an assertion failure here may just mean this threshold has changed
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

struct = {"struct_field": x}
df = pd.DataFrame({"structs": [struct, struct, struct]})
tab = pa.Table.from_pandas(df)
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

lists = [x]
df = pd.DataFrame({"lists": [lists, lists, lists]})
tab = pa.Table.from_pandas(df)
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

los = [struct]
df = pd.DataFrame({"los": [los, los, los]})
tab = pa.Table.from_pandas(df)
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

sol = {"struct_field": lists}
df = pd.DataFrame({"sol": [sol, sol, sol]})
tab = pa.Table.from_pandas(df)
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

map_of_los = {"a": los}
df = pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]})
tab = pa.Table.from_pandas(
df,
schema=pa.schema(
[(
"maps",
pa.map_(
pa.string(),
pa.list_(
pa.struct([pa.field("struct_field", pa.string())])
)
)
)]))
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(
self_destruct=True, maps_as_pydicts="strict"), df)

0 comments on commit 7aa875a

Please sign in to comment.