apacheGH-32439: [Python] Fix off by one bug when chunking nested stru…

…cts (apache#37376) ### Rationale for this change See: apache#32439 ### What changes are included in this PR? During conversion from Python to Arrow, when a struct's child hits a capacity error and chunking is triggered, this can leave the Finish'd chunk in an invalid state since the struct's length does not match the length of its children. This change simply tries to Append the children first, and only if successful will Append the struct. This is safe because the order of Append'ing between the struct and its child is not specified. It is only specified that they must be consistent with each other. This is per: https://github.com/apache/arrow/blob/86b7a84c9317fa08222eb63f6930bbb54c2e6d0b/cpp/src/arrow/array/builder_nested.h#L507-L508 ### Are these changes tested? A unit test is added that would previously have an invalid data error. ``` > tab = pa.Table.from_pandas(df) pyarrow/tests/test_pandas.py:4970: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ pyarrow/table.pxi:3788: in pyarrow.lib.Table.from_pandas return cls.from_arrays(arrays, schema=schema) pyarrow/table.pxi:3890: in pyarrow.lib.Table.from_arrays result.validate() pyarrow/table.pxi:3170: in pyarrow.lib.Table.validate check_status(self.table.Validate()) # ... FAILED pyarrow/tests/test_pandas.py::test_nested_chunking_valid - pyarrow.lib.ArrowInvalid: Column 0: In chunk 0: Invalid: List child array invalid: Invalid: Struct child array #0 has length smaller than expected for struct array (2 < 3) ``` NOTE: This unit test uses about 7GB of memory (max RSS) on my macbook pro. This might make CI challenging; I'm open to suggestions to limit it. ### Are there any user-facing changes? No * Closes: apache#32439 Lead-authored-by: Mike Lui <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
dgreiss · Feb 17, 2024 · 2ec7b06 · 2ec7b06
1 parent 3a36072
commit 2ec7b06
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 6 deletions.
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -926,14 +926,14 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
     }
     switch (input_kind_) {
       case InputKind::DICT:
-        RETURN_NOT_OK(this->struct_builder_->Append());
-        return AppendDict(value);
+        RETURN_NOT_OK(AppendDict(value));
+        return this->struct_builder_->Append();
       case InputKind::TUPLE:
-        RETURN_NOT_OK(this->struct_builder_->Append());
-        return AppendTuple(value);
+        RETURN_NOT_OK(AppendTuple(value));
+        return this->struct_builder_->Append();
       case InputKind::ITEMS:
-        RETURN_NOT_OK(this->struct_builder_->Append());
-        return AppendItems(value);
+        RETURN_NOT_OK(AppendItems(value));
+        return this->struct_builder_->Append();
       default:
         RETURN_NOT_OK(InferInputKind(value));
         return Append(value);
@@ -944,6 +944,10 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
   Status Init(MemoryPool* pool) override {
     RETURN_NOT_OK((StructConverter<PyConverter, PyConverterTrait>::Init(pool)));
 
+    // This implementation will check the child values before appending itself,
+    // so no rewind is necessary
+    this->rewind_on_overflow_ = false;
+
     // Store the field names as a PyObjects for dict matching
     num_fields_ = this->struct_type_->num_fields();
     bytes_field_names_.reset(PyList_New(num_fields_));

diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
@@ -1691,6 +1691,7 @@ def test_auto_chunking_pandas_series_of_strings(self, char):
             'strings': [[v1]] * 20 + [[v2]] + [[b'x']]
         })
         arr = pa.array(df['strings'], from_pandas=True)
+        arr.validate(full=True)
         assert isinstance(arr, pa.ChunkedArray)
         assert arr.num_chunks == 2
         assert len(arr.chunk(0)) == 21
@@ -2381,6 +2382,7 @@ def test_auto_chunking_on_list_overflow(self):
             "b": range(n)
         })
         table = pa.Table.from_pandas(df)
+        table.validate(full=True)
 
         column_a = table[0]
         assert column_a.num_chunks == 2
@@ -2623,6 +2625,7 @@ def test_from_numpy_large(self):
         ty = pa.struct([pa.field('x', pa.float64()),
                         pa.field('y', pa.binary())])
         arr = pa.array(data, type=ty, from_pandas=True)
+        arr.validate(full=True)
         assert arr.num_chunks == 2
 
         def iter_chunked_array(arr):
@@ -2655,6 +2658,7 @@ def check(arr, data, mask=None):
         # Now with explicit mask
         mask = np.random.random_sample(n) < 0.2
         arr = pa.array(data, type=ty, mask=mask, from_pandas=True)
+        arr.validate(full=True)
         assert arr.num_chunks == 2
 
         check(arr, data, mask)
@@ -4826,6 +4830,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced():
 
     def assert_roundtrip(series: pd.Series, data) -> None:
         array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series, type=ty))
+        array_roundtrip.validate(full=True)
         assert data.equals(array_roundtrip)
 
     assert_roundtrip(series_default, chunked_array)
@@ -4944,3 +4949,42 @@ def test_array_conversion_for_datetime():
 
     result = arr.to_pandas()
     tm.assert_series_equal(result, series)
+
+
+@pytest.mark.large_memory
+def test_nested_chunking_valid():
+    # GH-32439: Chunking can cause arrays to be in invalid state
+    # when nested types are involved.
+    # Here we simply ensure we validate correctly.
+
+    def roundtrip(df, schema=None):
+        tab = pa.Table.from_pandas(df, schema=schema)
+        tab.validate(full=True)
+        # we expect to trigger chunking internally
+        # an assertion failure here may just mean this threshold has changed
+        num_chunks = tab.column(0).num_chunks
+        assert num_chunks > 1
+        tm.assert_frame_equal(tab.to_pandas(self_destruct=True,
+                                            maps_as_pydicts="strict"), df)
+
+    x = b"0" * 720000000
+    roundtrip(pd.DataFrame({"strings": [x, x, x]}))
+
+    struct = {"struct_field": x}
+    roundtrip(pd.DataFrame({"structs": [struct, struct, struct]}))
+
+    lists = [x]
+    roundtrip(pd.DataFrame({"lists": [lists, lists, lists]}))
+
+    los = [struct]
+    roundtrip(pd.DataFrame({"los": [los, los, los]}))
+
+    sol = {"struct_field": lists}
+    roundtrip(pd.DataFrame({"sol": [sol, sol, sol]}))
+
+    map_of_los = {"a": los}
+    map_type = pa.map_(pa.string(),
+                       pa.list_(pa.struct([("struct_field", pa.binary())])))
+    schema = pa.schema([("maps", map_type)])
+    roundtrip(pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]}),
+              schema=schema)