GH-32439: [Python] Fix off by one bug when chunking nested structs #37376

Merged 2 commits on Oct 10, 2023
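As the title and the reordered Append calls below indicate, the struct converter previously appended the parent slot to its StructBuilder before converting the child values; if a child conversion triggered a chunk split, the finished chunk could end up with a parent one element longer than its children, leaving the array in an invalid state. The fix converts the children first and appends the parent slot only once they succeed. A minimal illustration of the invariant this preserves, using only public pyarrow APIs (not code from this PR):

import pyarrow as pa

# A StructArray is valid only if every child is exactly as long as the parent;
# this is the property the off-by-one bug violated at chunk boundaries and the
# property the new tests check with validate(full=True).
ty = pa.struct([("x", pa.float64()), ("y", pa.binary())])
arr = pa.array([{"x": 1.0, "y": b"a"}, {"x": 2.0, "y": b"b"}], type=ty)
arr.validate(full=True)               # raises ArrowInvalid on a broken array
print(len(arr), len(arr.field("x")))  # 2 2 -- parent and child lengths agree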
16 changes: 10 additions & 6 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -926,14 +926,14 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
}
switch (input_kind_) {
case InputKind::DICT:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendDict(value);
+ RETURN_NOT_OK(AppendDict(value));
+ return this->struct_builder_->Append();
case InputKind::TUPLE:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendTuple(value);
+ RETURN_NOT_OK(AppendTuple(value));
+ return this->struct_builder_->Append();
case InputKind::ITEMS:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendItems(value);
+ RETURN_NOT_OK(AppendItems(value));
+ return this->struct_builder_->Append();
default:
RETURN_NOT_OK(InferInputKind(value));
return Append(value);
@@ -944,6 +944,10 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
Status Init(MemoryPool* pool) override {
RETURN_NOT_OK((StructConverter<PyConverter, PyConverterTrait>::Init(pool)));

+ // This implementation will check the child values before appending itself,
+ // so no rewind is necessary
+ this->rewind_on_overflow_ = false;

// Store the field names as a PyObjects for dict matching
num_fields_ = this->struct_type_->num_fields();
bytes_field_names_.reset(PyList_New(num_fields_));
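The Init() change opts out of rewind-on-overflow because, with the reordering above, an overflowing child conversion is detected before the parent slot has been appended, so there is nothing to undo. From the Python side, such an overflow is what turns the conversion result into a ChunkedArray. A small, hedged sketch of that surface behaviour (actually overflowing requires more than 2 GiB of child data, as in the large_memory test added below):

import pandas as pd
import pyarrow as pa

# Sketch only: with small inputs the conversion yields a single StructArray.
# Once the binary children exceed the 2 GiB offset limit, the same call returns
# a ChunkedArray, and every chunk must satisfy the parent/child length invariant.
series = pd.Series([{"struct_field": b"0"}] * 3)
result = pa.array(series, from_pandas=True)
result.validate(full=True)
print(type(result).__name__, len(result))  # StructArray 3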
44 changes: 44 additions & 0 deletions python/pyarrow/tests/test_pandas.py
@@ -1691,6 +1691,7 @@ def test_auto_chunking_pandas_series_of_strings(self, char):
'strings': [[v1]] * 20 + [[v2]] + [[b'x']]
})
arr = pa.array(df['strings'], from_pandas=True)
arr.validate(full=True)
assert isinstance(arr, pa.ChunkedArray)
assert arr.num_chunks == 2
assert len(arr.chunk(0)) == 21
@@ -2381,6 +2382,7 @@ def test_auto_chunking_on_list_overflow(self):
"b": range(n)
})
table = pa.Table.from_pandas(df)
table.validate(full=True)

column_a = table[0]
assert column_a.num_chunks == 2
@@ -2623,6 +2625,7 @@ def test_from_numpy_large(self):
ty = pa.struct([pa.field('x', pa.float64()),
pa.field('y', pa.binary())])
arr = pa.array(data, type=ty, from_pandas=True)
arr.validate(full=True)
assert arr.num_chunks == 2

def iter_chunked_array(arr):
@@ -2655,6 +2658,7 @@ def check(arr, data, mask=None):
# Now with explicit mask
mask = np.random.random_sample(n) < 0.2
arr = pa.array(data, type=ty, mask=mask, from_pandas=True)
arr.validate(full=True)
assert arr.num_chunks == 2

check(arr, data, mask)
@@ -4826,6 +4830,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced():

def assert_roundtrip(series: pd.Series, data) -> None:
array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series, type=ty))
array_roundtrip.validate(full=True)
assert data.equals(array_roundtrip)

assert_roundtrip(series_default, chunked_array)
@@ -4944,3 +4949,42 @@ def test_array_conversion_for_datetime():

result = arr.to_pandas()
tm.assert_series_equal(result, series)


@pytest.mark.large_memory
def test_nested_chunking_valid():
# GH-32439: Chunking can cause arrays to be in invalid state
# when nested types are involved.
# Here we simply ensure we validate correctly.

def roundtrip(df, schema=None):
tab = pa.Table.from_pandas(df, schema=schema)
tab.validate(full=True)
# we expect to trigger chunking internally
# an assertion failure here may just mean this threshold has changed
num_chunks = tab.column(0).num_chunks
assert num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True,
maps_as_pydicts="strict"), df)

x = b"0" * 720000000
roundtrip(pd.DataFrame({"strings": [x, x, x]}))

struct = {"struct_field": x}
roundtrip(pd.DataFrame({"structs": [struct, struct, struct]}))

lists = [x]
roundtrip(pd.DataFrame({"lists": [lists, lists, lists]}))

los = [struct]
roundtrip(pd.DataFrame({"los": [los, los, los]}))

sol = {"struct_field": lists}
roundtrip(pd.DataFrame({"sol": [sol, sol, sol]}))

map_of_los = {"a": los}
map_type = pa.map_(pa.string(),
pa.list_(pa.struct([("struct_field", pa.binary())])))
schema = pa.schema([("maps", map_type)])
roundtrip(pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]}),
schema=schema)
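
For reference when running without the large_memory marker, the same roundtrip shape can be exercised at a small scale; it will not actually trigger chunking (that needs roughly 2 GiB per column), but it shows the Table / validate / num_chunks surface the test relies on. A hedged sketch, not part of the PR:

import pandas as pd
import pandas.testing as tm
import pyarrow as pa

# Scaled-down companion to test_nested_chunking_valid: same roundtrip pattern,
# but far below the offset limit, so the column stays in a single chunk.
df = pd.DataFrame({"structs": [{"struct_field": b"0"}] * 3})
tab = pa.Table.from_pandas(df)
tab.validate(full=True)
assert tab.column(0).num_chunks == 1  # chunking only starts past ~2 GiB of child data
tm.assert_frame_equal(tab.to_pandas(), df)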