From 1d1d8e513b0c2afb198aaa2cd4a82d633499ca84 Mon Sep 17 00:00:00 2001
From: anjakefala
Date: Thu, 14 Mar 2024 16:51:11 -0400
Subject: [PATCH] GH-40316: [Python] only allocate the ScalarMemoTable when used

---
 .../src/arrow/python/arrow_to_pandas.cc | 61 ++++++++++---------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index 023ba5585e704..bad489c9fdebe 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -622,37 +622,42 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArra
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using Scalar = typename MemoizationTraits<Type>::Scalar;
 
-  ::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
-  std::vector<PyObject*> unique_values;
-  int32_t memo_size = 0;
-
-  auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) {
-    int32_t memo_index;
-    RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index));
-    if (memo_index == memo_size) {
-      // New entry
-      RETURN_NOT_OK(wrap_func(value, out_values));
-      unique_values.push_back(*out_values);
-      ++memo_size;
-    } else {
-      // Duplicate entry
-      Py_INCREF(unique_values[memo_index]);
-      *out_values = unique_values[memo_index];
-    }
-    return Status::OK();
-  };
-
-  auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) {
-    return wrap_func(value, out_values);
-  };
+  std::shared_ptr<::arrow::internal::ScalarMemoTable<Scalar>> memo_table = nullptr;
+  std::shared_ptr<std::vector<PyObject*>> unique_values = nullptr;
+  std::shared_ptr<int32_t> memo_size = std::make_shared<int32_t>(0);
+
+  std::function<Status(const typename MemoizationTraits<Type>::Scalar&, PyObject**)>
+      WrapFunc;
+
+  if (options.deduplicate_objects) {
+    memo_table =
+        std::make_shared<::arrow::internal::ScalarMemoTable<Scalar>>(options.pool);
+    unique_values = std::make_shared<std::vector<PyObject*>>();
+
+    WrapFunc = [&](const Scalar& value, PyObject** out_values) {
+      int32_t memo_index;
+      RETURN_NOT_OK(memo_table->GetOrInsert(value, &memo_index));
+      if (memo_index == *memo_size) {
+        // New entry
+        RETURN_NOT_OK(wrap_func(value, out_values));
+        unique_values->push_back(*out_values);
+        ++(*memo_size);
+      } else {
+        // Duplicate entry
+        Py_INCREF((*unique_values)[memo_index]);
+        *out_values = (*unique_values)[memo_index];
+      }
+      return Status::OK();
+    };
+  } else {
+    WrapFunc = [&](const Scalar& value, PyObject** out_values) {
+      return wrap_func(value, out_values);
+    };
+  }
 
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
-    if (options.deduplicate_objects) {
-      RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values));
-    } else {
-      RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values));
-    }
+    RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapFunc, out_values));
     out_values += arr.length();
   }
   return Status::OK();