From 6c386dab6760961160ddbfe7dcb6952943920828 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 15 May 2024 01:10:22 +0800 Subject: [PATCH] GH-41334: [C++][Acero] Use per-node basis temp vector stack to mitigate overflow (#41335) ### Rationale for this change The risk of temp vector stack overflow still exists as described in #41334 . Many people have agreed on a per-node basis approach: > 1) it doesn't introduce more performance penalty than shared stack; 2) it can mitigate the overflow in a natural way, i.e., expanding the stack size linear to the number of nodes; 3) it requires no more complexity to the existing stack implementation. The full (but long) story is also revealed in the subsequent discussion of this PR. Feel free to scroll down. ### What changes are included in this PR? 1. Change the current shared (per-thread) temp vector stack usage to per-node basis. 2. Make the stack size required by each stack user more explicit. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. * GitHub Issue: #41334 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 3 +- cpp/src/arrow/acero/exec_plan.cc | 2 +- cpp/src/arrow/acero/hash_join_node.cc | 38 +++++++--- cpp/src/arrow/acero/hash_join_node_test.cc | 52 +++++++++++++ cpp/src/arrow/acero/query_context.cc | 12 +-- cpp/src/arrow/acero/query_context.h | 8 +- cpp/src/arrow/acero/swiss_join.cc | 16 ++-- cpp/src/arrow/compute/key_hash_internal.h | 19 +++++ cpp/src/arrow/compute/key_hash_test.cc | 59 ++++++++++++++- cpp/src/arrow/compute/key_map_internal.h | 1 + cpp/src/arrow/compute/light_array_internal.h | 1 + cpp/src/arrow/compute/light_array_test.cc | 1 + cpp/src/arrow/compute/row/compare_internal.h | 10 +++ cpp/src/arrow/compute/row/compare_test.cc | 62 ++++++++++++++- cpp/src/arrow/compute/row/grouper.cc | 1 + cpp/src/arrow/compute/util.cc | 31 -------- cpp/src/arrow/compute/util.h | 73 ------------------ cpp/src/arrow/compute/util_internal.cc | 79 ++++++++++++++++++++ cpp/src/arrow/compute/util_internal.h | 53 +++++++++++++ 19 files changed, 371 insertions(+), 150 deletions(-) create mode 100644 cpp/src/arrow/compute/util_internal.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 5d61112518f5e..0f4824ec99daa 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -716,7 +716,8 @@ set(ARROW_COMPUTE_SRCS compute/row/compare_internal.cc compute/row/grouper.cc compute/row/row_internal.cc - compute/util.cc) + compute/util.cc + compute/util_internal.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_internal_avx2.cc) append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_internal_avx2.cc) diff --git a/cpp/src/arrow/acero/exec_plan.cc b/cpp/src/arrow/acero/exec_plan.cc index 97119726d4b17..d9fb1942fccd8 100644 --- a/cpp/src/arrow/acero/exec_plan.cc +++ b/cpp/src/arrow/acero/exec_plan.cc @@ -128,7 +128,7 @@ struct ExecPlanImpl : public ExecPlan { Future<> scheduler_finished = arrow::util::AsyncTaskScheduler::Make( [this](arrow::util::AsyncTaskScheduler* async_scheduler) { QueryContext* ctx = query_context(); - RETURN_NOT_OK(ctx->Init(ctx->max_concurrency(), async_scheduler)); + RETURN_NOT_OK(ctx->Init(async_scheduler)); #ifdef ARROW_WITH_OPENTELEMETRY if (HasMetadata()) { diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index b49364300dac8..06405f16c8d4c 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -497,11 
+497,11 @@ struct BloomFilterPushdownContext { using BuildFinishedCallback = std::function; using FiltersReceivedCallback = std::function; using FilterFinishedCallback = std::function; - void Init(HashJoinNode* owner, size_t num_threads, - RegisterTaskGroupCallback register_task_group_callback, - StartTaskGroupCallback start_task_group_callback, - FiltersReceivedCallback on_bloom_filters_received, bool disable_bloom_filter, - bool use_sync_execution); + Status Init(HashJoinNode* owner, size_t num_threads, + RegisterTaskGroupCallback register_task_group_callback, + StartTaskGroupCallback start_task_group_callback, + FiltersReceivedCallback on_bloom_filters_received, + bool disable_bloom_filter, bool use_sync_execution); Status StartProducing(size_t thread_index); @@ -559,8 +559,7 @@ struct BloomFilterPushdownContext { std::vector hashes(batch.length); std::vector bv(bit_vector_bytes); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* stack = &tld_[thread_index].stack; // Start with full selection for the current batch memset(selected.data(), 0xff, bit_vector_bytes); @@ -654,7 +653,17 @@ struct BloomFilterPushdownContext { FiltersReceivedCallback all_received_callback_; FilterFinishedCallback on_finished_; } eval_; + + static constexpr auto kTempStackUsage = + Hashing32::kHashBatchTempStackUsage + + (sizeof(uint32_t) + /*extra=*/1) * arrow::util::MiniBatch::kMiniBatchLength; + + struct ThreadLocalData { + arrow::util::TempVectorStack stack; + }; + std::vector tld_; }; + bool HashJoinSchema::HasDictionaries() const { for (int side = 0; side <= 1; ++side) { for (int icol = 0; icol < proj_maps[side].num_cols(HashJoinProjection::INPUT); @@ -930,7 +939,7 @@ class HashJoinNode : public ExecNode, public TracedNode { // we will change it back to just the CPU's thread pool capacity. 
size_t num_threads = (GetCpuThreadPoolCapacity() + io::GetIOThreadPoolCapacity() + 1); - pushdown_context_.Init( + RETURN_NOT_OK(pushdown_context_.Init( this, num_threads, [ctx](std::function fn, std::function on_finished) { @@ -940,7 +949,7 @@ class HashJoinNode : public ExecNode, public TracedNode { return ctx->StartTaskGroup(task_group_id, num_tasks); }, [this](size_t thread_index) { return OnFiltersReceived(thread_index); }, - disable_bloom_filter_, use_sync_execution); + disable_bloom_filter_, use_sync_execution)); RETURN_NOT_OK(impl_->Init( ctx, join_type_, num_threads, &(schema_mgr_->proj_maps[0]), @@ -1037,7 +1046,7 @@ class HashJoinNode : public ExecNode, public TracedNode { BloomFilterPushdownContext pushdown_context_; }; -void BloomFilterPushdownContext::Init( +Status BloomFilterPushdownContext::Init( HashJoinNode* owner, size_t num_threads, RegisterTaskGroupCallback register_task_group_callback, StartTaskGroupCallback start_task_group_callback, @@ -1074,6 +1083,12 @@ void BloomFilterPushdownContext::Init( return eval_.on_finished_(thread_index, std::move(eval_.batches_)); }); start_task_group_callback_ = std::move(start_task_group_callback); + tld_.resize(num_threads); + for (auto& local_data : tld_) { + RETURN_NOT_OK(local_data.stack.Init(ctx_->memory_pool(), kTempStackUsage)); + } + + return Status::OK(); } Status BloomFilterPushdownContext::StartProducing(size_t thread_index) { @@ -1124,8 +1139,7 @@ Status BloomFilterPushdownContext::BuildBloomFilter_exec_task(size_t thread_inde } ARROW_ASSIGN_OR_RAISE(ExecBatch key_batch, ExecBatch::Make(std::move(key_columns))); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* stack = &tld_[thread_index].stack; arrow::util::TempVectorHolder hash_holder( stack, arrow::util::MiniBatch::kMiniBatchLength); uint32_t* hashes = hash_holder.mutable_data(); diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 9c3dbc176ff4f..215b1e4d21125 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -28,6 +28,7 @@ #include "arrow/api.h" #include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" @@ -41,6 +42,7 @@ namespace arrow { using compute::call; using compute::default_exec_context; +using compute::ExecBatchBuilder; using compute::ExecSpan; using compute::field_ref; using compute::SortIndices; @@ -3201,5 +3203,55 @@ TEST(HashJoin, ChainedIntegerHashJoins) { } } +// Test that a large number of joins don't overflow the temp vector stack, like GH-39582 +// and GH-39951. +TEST(HashJoin, ManyJoins) { + // The idea of this case is to create many nested join nodes that may possibly cause + // recursive usage of temp vector stack. To make sure that the recursion happens: + // 1. A left-deep join tree is created so that the left-most (the final probe side) + // table will go through all the hash tables from the right side. + // 2. Left-outer join is used so that every join will increase the cardinality. + // 3. The left-most table contains rows of unique integers from 0 to N. + // 4. Each right table at level i contains two rows of integer i, so that the probing of + // each level will increase the result by one row. + // 5. 
The left-most table is a single batch of enough rows, so that at each level, the + // probing will accumulate enough result rows to have to output to the subsequent level + // before finishing the current batch (releasing the buffer allocated on the temp vector + // stack), which is essentially the recursive usage of the temp vector stack. + + // A fair number of joins to guarantee temp vector stack overflow before GH-41335. + const int num_joins = 64; + + // `ExecBatchBuilder::num_rows_max()` is the number of rows for swiss join to accumulate + // before outputting. + const int num_left_rows = ExecBatchBuilder::num_rows_max(); + ASSERT_OK_AND_ASSIGN( + auto left_batches, + MakeIntegerBatches({[](int row_id) -> int64_t { return row_id; }}, + schema({field("l_key", int32())}), + /*num_batches=*/1, /*batch_size=*/num_left_rows)); + Declaration root{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(left_batches.schema), + std::move(left_batches.batches))}; + + HashJoinNodeOptions join_opts(JoinType::LEFT_OUTER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + + for (int i = 0; i < num_joins; ++i) { + ASSERT_OK_AND_ASSIGN(auto right_batches, + MakeIntegerBatches({[i](int) -> int64_t { return i; }}, + schema({field("r_key", int32())}), + /*num_batches=*/1, /*batch_size=*/2)); + Declaration table{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(right_batches.schema), + std::move(right_batches.batches))}; + + Declaration new_root{"hashjoin", {std::move(root), std::move(table)}, join_opts}; + root = std::move(new_root); + } + + ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/query_context.cc b/cpp/src/arrow/acero/query_context.cc index a27397d12079d..18beb19ab7f8b 100644 --- a/cpp/src/arrow/acero/query_context.cc +++ b/cpp/src/arrow/acero/query_context.cc @@ -40,8 +40,7 @@ QueryContext::QueryContext(QueryOptions opts, ExecContext exec_context) const CpuInfo* QueryContext::cpu_info() const { return CpuInfo::GetInstance(); } int64_t QueryContext::hardware_flags() const { return cpu_info()->hardware_flags(); } -Status QueryContext::Init(size_t max_num_threads, util::AsyncTaskScheduler* scheduler) { - tld_.resize(max_num_threads); +Status QueryContext::Init(util::AsyncTaskScheduler* scheduler) { async_scheduler_ = scheduler; return Status::OK(); } @@ -50,15 +49,6 @@ size_t QueryContext::GetThreadIndex() { return thread_indexer_(); } size_t QueryContext::max_concurrency() const { return thread_indexer_.Capacity(); } -Result QueryContext::GetTempStack(size_t thread_index) { - if (!tld_[thread_index].is_init) { - RETURN_NOT_OK(tld_[thread_index].stack.Init( - memory_pool(), 32 * util::MiniBatch::kMiniBatchLength * sizeof(uint64_t))); - tld_[thread_index].is_init = true; - } - return &tld_[thread_index].stack; -} - Result> QueryContext::BeginExternalTask(std::string_view name) { Future<> completion_future = Future<>::Make(); if (async_scheduler_->AddSimpleTask([completion_future] { return completion_future; }, diff --git a/cpp/src/arrow/acero/query_context.h b/cpp/src/arrow/acero/query_context.h index 9ea11679cba05..3eff299439828 100644 --- a/cpp/src/arrow/acero/query_context.h +++ b/cpp/src/arrow/acero/query_context.h @@ -38,7 +38,7 @@ class ARROW_ACERO_EXPORT QueryContext { QueryContext(QueryOptions opts = {}, ExecContext exec_context = *default_exec_context()); - Status Init(size_t max_num_threads, arrow::util::AsyncTaskScheduler* scheduler); + Status 
Init(arrow::util::AsyncTaskScheduler* scheduler); const ::arrow::internal::CpuInfo* cpu_info() const; int64_t hardware_flags() const; @@ -52,7 +52,6 @@ class ARROW_ACERO_EXPORT QueryContext { size_t GetThreadIndex(); size_t max_concurrency() const; - Result GetTempStack(size_t thread_index); /// \brief Start an external task /// @@ -145,11 +144,6 @@ class ARROW_ACERO_EXPORT QueryContext { std::unique_ptr task_scheduler_ = TaskScheduler::Make(); ThreadIndexer thread_indexer_; - struct ThreadLocalData { - bool is_init = false; - arrow::util::TempVectorStack stack; - }; - std::vector tld_; std::atomic in_flight_bytes_to_disk_{0}; }; diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 542e943c4a82b..17c5212697339 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2470,6 +2470,8 @@ Status JoinProbeProcessor::OnFinished() { class SwissJoin : public HashJoinImpl { public: + static constexpr auto kTempStackUsage = 64 * arrow::util::MiniBatch::kMiniBatchLength; + Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, const HashJoinProjectionMaps* proj_map_left, const HashJoinProjectionMaps* proj_map_right, @@ -2513,6 +2515,7 @@ class SwissJoin : public HashJoinImpl { local_states_.resize(num_threads_); for (int i = 0; i < num_threads_; ++i) { + RETURN_NOT_OK(local_states_[i].stack.Init(pool_, kTempStackUsage)); local_states_[i].hash_table_ready = false; local_states_[i].num_output_batches = 0; local_states_[i].materialize.Init(pool_, proj_map_left, proj_map_right); @@ -2566,8 +2569,7 @@ class SwissJoin : public HashJoinImpl { ExecBatch keypayload_batch; ARROW_ASSIGN_OR_RAISE(keypayload_batch, KeyPayloadFromInput(/*side=*/0, &batch)); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_index].stack; return CancelIfNotOK( probe_processor_.OnNextBatch(thread_index, keypayload_batch, temp_stack, @@ -2679,8 +2681,7 @@ class SwissJoin : public HashJoinImpl { input_batch.values[schema->num_cols(HashJoinProjection::KEY) + icol]; } } - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; RETURN_NOT_OK(CancelIfNotOK(hash_table_build_.PushNextBatch( static_cast(thread_id), key_batch, no_payload ? 
nullptr : &payload_batch, temp_stack))); @@ -2715,8 +2716,7 @@ class SwissJoin : public HashJoinImpl { Status MergeFinished(size_t thread_id) { RETURN_NOT_OK(status()); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; hash_table_build_.FinishPrtnMerge(temp_stack); return CancelIfNotOK(OnBuildHashTableFinished(static_cast(thread_id))); } @@ -2771,8 +2771,7 @@ class SwissJoin : public HashJoinImpl { std::min((task_id + 1) * kNumRowsPerScanTask, hash_table_.num_rows()); // Get thread index and related temp vector stack // - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; // Split into mini-batches // @@ -2949,6 +2948,7 @@ class SwissJoin : public HashJoinImpl { FinishedCallback finished_callback_; struct ThreadLocalState { + arrow::util::TempVectorStack stack; JoinResultMaterialize materialize; std::vector temp_column_arrays; int64_t num_output_batches; diff --git a/cpp/src/arrow/compute/key_hash_internal.h b/cpp/src/arrow/compute/key_hash_internal.h index 7d226f52086b1..1f25beb0e1622 100644 --- a/cpp/src/arrow/compute/key_hash_internal.h +++ b/cpp/src/arrow/compute/key_hash_internal.h @@ -48,6 +48,16 @@ class ARROW_EXPORT Hashing32 { static void HashMultiColumn(const std::vector& cols, LightContext* ctx, uint32_t* out_hash); + // Clarify the max temp stack usage for HashBatch, which might be necessary for the + // caller to be aware of at compile time to reserve enough stack size in advance. The + // HashBatch implementation uses one uint32 temp vector as a buffer for hash, one uint16 + // temp vector as a buffer for null indices and one uint32 temp vector as a buffer for + // null hash, all are of size kMiniBatchLength. Plus extra kMiniBatchLength to cope with + // stack padding and aligning. + static constexpr auto kHashBatchTempStackUsage = + (sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint32_t) + /*extra=*/1) * + util::MiniBatch::kMiniBatchLength; + static Status HashBatch(const ExecBatch& key_batch, uint32_t* hashes, std::vector& column_arrays, int64_t hardware_flags, util::TempVectorStack* temp_stack, @@ -161,6 +171,15 @@ class ARROW_EXPORT Hashing64 { static void HashMultiColumn(const std::vector& cols, LightContext* ctx, uint64_t* hashes); + // Clarify the max temp stack usage for HashBatch, which might be necessary for the + // caller to be aware of at compile time to reserve enough stack size in advance. The + // HashBatch implementation uses one uint16 temp vector as a buffer for null indices and + // one uint64 temp vector as a buffer for null hash, all are of size kMiniBatchLength. + // Plus extra kMiniBatchLength to cope with stack padding and aligning. 
+ static constexpr auto kHashBatchTempStackUsage = + (sizeof(uint16_t) + sizeof(uint64_t) + /*extra=*/1) * + util::MiniBatch::kMiniBatchLength; + static Status HashBatch(const ExecBatch& key_batch, uint64_t* hashes, std::vector& column_arrays, int64_t hardware_flags, util::TempVectorStack* temp_stack, diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index 4e5d869cb7db6..fdf6d2125850a 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -25,12 +25,16 @@ #include "arrow/array/builder_binary.h" #include "arrow/compute/key_hash_internal.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/util/cpu_info.h" #include "arrow/util/pcg_random.h" namespace arrow { +using arrow::random::RandomArrayGenerator; +using arrow::util::MiniBatch; +using arrow::util::TempVectorStack; using internal::checked_pointer_cast; using internal::CpuInfo; @@ -156,7 +160,7 @@ class TestVectorHash { std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { const auto hardware_flags = hardware_flags_for_testing[i]; if (use_32bit_hash) { if (!use_varlen_input) { @@ -192,7 +196,7 @@ class TestVectorHash { // Verify that all implementations (scalar, SIMD) give the same hashes // const auto& hashes_scalar64 = hashes64[0]; - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { for (int j = 0; j < num_rows; ++j) { ASSERT_EQ(hashes64[i][j], hashes_scalar64[j]) << "scalar and simd approaches yielded different hashes"; @@ -280,7 +284,7 @@ void HashFixedLengthFrom(int key_length, int num_rows, int start_row) { std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { const auto hardware_flags = hardware_flags_for_testing[i]; Hashing32::HashFixed(hardware_flags, /*combine_hashes=*/false, num_rows_to_hash, key_length, @@ -292,7 +296,7 @@ void HashFixedLengthFrom(int key_length, int num_rows, int start_row) { } // Verify that all implementations (scalar, SIMD) give the same hashes. - for (int i = 1; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 1; i < hardware_flags_for_testing.size(); ++i) { for (int j = 0; j < num_rows_to_hash; ++j) { ASSERT_EQ(hashes32[i][j], hashes32[0][j]) << "scalar and simd approaches yielded different 32-bit hashes"; @@ -311,5 +315,52 @@ TEST(VectorHash, FixedLengthTailByteSafety) { HashFixedLengthFrom(/*key_length=*/19, /*num_rows=*/64, /*start_row=*/63); } +// Make sure that Hashing32/64::HashBatch uses no more stack space than declared in +// Hashing32/64::kHashBatchTempStackUsage. 
+TEST(VectorHash, HashBatchTempStackUsage) { + for (auto num_rows : + {0, 1, MiniBatch::kMiniBatchLength, MiniBatch::kMiniBatchLength * 64}) { + SCOPED_TRACE("num_rows = " + std::to_string(num_rows)); + + MemoryPool* pool = default_memory_pool(); + RandomArrayGenerator gen(42); + + auto column = gen.Int8(num_rows, 0, 127); + ExecBatch batch({column}, num_rows); + + std::vector column_arrays; + ASSERT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); + + const auto hardware_flags_for_testing = HardwareFlagsForTesting(); + ASSERT_GT(hardware_flags_for_testing.size(), 0); + + { + std::vector hashes(num_rows); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, Hashing32::kHashBatchTempStackUsage)); + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { + SCOPED_TRACE("hashing32 for hardware flags = " + + std::to_string(hardware_flags_for_testing[i])); + ASSERT_OK(Hashing32::HashBatch(batch, hashes.data(), column_arrays, + hardware_flags_for_testing[i], &stack, + /*start_rows=*/0, num_rows)); + } + } + + { + std::vector hashes(num_rows); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, Hashing64::kHashBatchTempStackUsage)); + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { + SCOPED_TRACE("hashing64 for hardware flags = " + + std::to_string(hardware_flags_for_testing[i])); + ASSERT_OK(Hashing64::HashBatch(batch, hashes.data(), column_arrays, + hardware_flags_for_testing[i], &stack, + /*start_rows=*/0, num_rows)); + } + } + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/key_map_internal.h b/cpp/src/arrow/compute/key_map_internal.h index 8e06dc83483aa..a5e784a9e4463 100644 --- a/cpp/src/arrow/compute/key_map_internal.h +++ b/cpp/src/arrow/compute/key_map_internal.h @@ -21,6 +21,7 @@ #include #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/compute/light_array_internal.h b/cpp/src/arrow/compute/light_array_internal.h index 67de71bf56c92..995c4211998e0 100644 --- a/cpp/src/arrow/compute/light_array_internal.h +++ b/cpp/src/arrow/compute/light_array_internal.h @@ -22,6 +22,7 @@ #include "arrow/array.h" #include "arrow/compute/exec.h" #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/type.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 08f36ee606025..cc02d489d138f 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/memory_pool.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index 16002ee5184e9..a5a109b0b516a 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -32,6 +32,16 @@ namespace compute { class ARROW_EXPORT KeyCompare { public: + // Clarify the max temp stack usage for CompareColumnsToRows, which might be necessary + // for the caller to be aware of (possibly at compile time) to reserve enough stack size + // in advance. The CompareColumnsToRows implementation uses three uint8 temp vectors as + // buffers for match vectors, all are of size num_rows. 
Plus extra kMiniBatchLength to + // cope with stack padding and aligning. + constexpr static int64_t CompareColumnsToRowsTempStackUsage(int64_t num_rows) { + return (sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint8_t)) * num_rows + + /*extra=*/util::MiniBatch::kMiniBatchLength; + } + // Returns a single 16-bit selection vector of rows that failed comparison. // If there is input selection on the left, the resulting selection is a filtered image // of input selection. diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 1d8562cd56d3c..4044049b10863 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -19,23 +19,26 @@ #include "arrow/compute/row/compare_internal.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" namespace arrow { namespace compute { using arrow::bit_util::BytesForBits; using arrow::internal::CpuInfo; +using arrow::random::RandomArrayGenerator; using arrow::util::MiniBatch; using arrow::util::TempVectorStack; // Specialized case for GH-39577. TEST(KeyCompare, CompareColumnsToRowsCuriousFSB) { int fsb_length = 9; + int num_rows = 7; + MemoryPool* pool = default_memory_pool(); TempVectorStack stack; - ASSERT_OK(stack.Init(pool, 8 * MiniBatch::kMiniBatchLength * sizeof(uint64_t))); + ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); - int num_rows = 7; auto column_right = ArrayFromJSON(fixed_size_binary(fsb_length), R"([ "000000000", "111111111", @@ -106,5 +109,60 @@ TEST(KeyCompare, CompareColumnsToRowsCuriousFSB) { } } +// Make sure that KeyCompare::CompareColumnsToRows uses no more stack space than declared +// in KeyCompare::CompareColumnsToRowsTempStackUsage(). +TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) { + for (auto num_rows : + {0, 1, MiniBatch::kMiniBatchLength, MiniBatch::kMiniBatchLength * 64}) { + SCOPED_TRACE("num_rows = " + std::to_string(num_rows)); + + MemoryPool* pool = default_memory_pool(); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); + + RandomArrayGenerator gen(42); + + auto column_right = gen.Int8(num_rows, 0, 127); + ExecBatch batch_right({column_right}, num_rows); + + std::vector column_metadatas_right; + ASSERT_OK(ColumnMetadatasFromExecBatch(batch_right, &column_metadatas_right)); + + RowTableMetadata table_metadata_right; + table_metadata_right.FromColumnMetadataVector(column_metadatas_right, + sizeof(uint64_t), sizeof(uint64_t)); + + std::vector column_arrays_right; + ASSERT_OK(ColumnArraysFromExecBatch(batch_right, &column_arrays_right)); + + RowTableImpl row_table; + ASSERT_OK(row_table.Init(pool, table_metadata_right)); + + RowTableEncoder row_encoder; + row_encoder.Init(column_metadatas_right, sizeof(uint64_t), sizeof(uint64_t)); + row_encoder.PrepareEncodeSelected(0, num_rows, column_arrays_right); + + std::vector row_ids_right(num_rows); + std::iota(row_ids_right.begin(), row_ids_right.end(), 0); + ASSERT_OK(row_encoder.EncodeSelected(&row_table, num_rows, row_ids_right.data())); + + auto column_left = gen.Int8(num_rows, 0, 127); + ExecBatch batch_left({column_left}, num_rows); + std::vector column_arrays_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &column_arrays_left)); + + std::vector row_ids_left(num_rows); + std::iota(row_ids_left.begin(), row_ids_left.end(), 0); + + LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; + + uint32_t num_rows_no_match; + std::vector 
row_ids_out(num_rows); + KeyCompare::CompareColumnsToRows(num_rows, NULLPTR, row_ids_left.data(), &ctx, + &num_rows_no_match, row_ids_out.data(), + column_arrays_left, row_table, true, NULLPTR); + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 50ca20bd14f31..3ed5411d0ba02 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -600,6 +600,7 @@ struct GrouperFastImpl : public Grouper { } Status Reset() override { + ARROW_DCHECK_EQ(temp_stack_.AllocatedSize(), 0); rows_.Clean(); rows_minibatch_.Clean(); map_.cleanup(); diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index b0c863b26a062..b90b3a64056bd 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -17,11 +17,7 @@ #include "arrow/compute/util.h" -#include "arrow/table.h" -#include "arrow/util/bit_util.h" -#include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" -#include "arrow/util/tracing_internal.h" #include "arrow/util/ubsan.h" namespace arrow { @@ -31,33 +27,6 @@ using internal::CpuInfo; namespace util { -void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { - int64_t new_top = top_ + EstimatedAllocationSize(num_bytes); - // Stack overflow check (see GH-39582). - // XXX cannot return a regular Status because most consumers do not either. - ARROW_CHECK_LE(new_top, buffer_size_) << "TempVectorStack::alloc overflow"; - *data = buffer_->mutable_data() + top_ + sizeof(uint64_t); - // We set 8 bytes before the beginning of the allocated range and - // 8 bytes after the end to check for stack overflow (which would - // result in those known bytes being corrupted). - reinterpret_cast(buffer_->mutable_data() + top_)[0] = kGuard1; - reinterpret_cast(buffer_->mutable_data() + new_top)[-1] = kGuard2; - *id = num_vectors_++; - top_ = new_top; -} - -void TempVectorStack::release(int id, uint32_t num_bytes) { - ARROW_DCHECK(num_vectors_ == id + 1); - int64_t size = EstimatedAllocationSize(num_bytes); - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == - kGuard2); - ARROW_DCHECK(top_ >= size); - top_ -= size; - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == - kGuard1); - --num_vectors_; -} - namespace bit_util { inline uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 88dce160ce936..d56e398667f66 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -24,17 +24,10 @@ #include #include -#include "arrow/buffer.h" #include "arrow/compute/expression.h" #include "arrow/compute/type_fwd.h" -#include "arrow/memory_pool.h" #include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/util/bit_util.h" #include "arrow/util/cpu_info.h" -#include "arrow/util/mutex.h" -#include "arrow/util/thread_pool.h" -#include "arrow/util/type_fwd.h" #if defined(__clang__) || defined(__GNUC__) #define BYTESWAP(x) __builtin_bswap64(x) @@ -77,72 +70,6 @@ class MiniBatch { static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; }; -/// Storage used to allocate temporary vectors of a batch size. -/// Temporary vectors should resemble allocating temporary variables on the stack -/// but in the context of vectorized processing where we need to store a vector of -/// temporaries instead of a single value. 
-class ARROW_EXPORT TempVectorStack { - template - friend class TempVectorHolder; - - public: - Status Init(MemoryPool* pool, int64_t size) { - num_vectors_ = 0; - top_ = 0; - buffer_size_ = EstimatedAllocationSize(size); - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); - // Ensure later operations don't accidentally read uninitialized memory. - std::memset(buffer->mutable_data(), 0xFF, size); - buffer_ = std::move(buffer); - return Status::OK(); - } - - private: - static int64_t EstimatedAllocationSize(int64_t size) { - return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); - } - - static int64_t PaddedAllocationSize(int64_t num_bytes) { - // Round up allocation size to multiple of 8 bytes - // to avoid returning temp vectors with unaligned address. - // - // Also add padding at the end to facilitate loads and stores - // using SIMD when number of vector elements is not divisible - // by the number of SIMD lanes. - // - return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; - } - void alloc(uint32_t num_bytes, uint8_t** data, int* id); - void release(int id, uint32_t num_bytes); - static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; - static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; - static constexpr int64_t kPadding = 64; - int num_vectors_; - int64_t top_; - std::unique_ptr buffer_; - int64_t buffer_size_; -}; - -template -class TempVectorHolder { - friend class TempVectorStack; - - public: - ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } - T* mutable_data() { return reinterpret_cast(data_); } - TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { - stack_ = stack; - num_elements_ = num_elements; - stack_->alloc(num_elements * sizeof(T), &data_, &id_); - } - - private: - TempVectorStack* stack_; - uint8_t* data_; - int id_; - uint32_t num_elements_; -}; - namespace bit_util { ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags, diff --git a/cpp/src/arrow/compute/util_internal.cc b/cpp/src/arrow/compute/util_internal.cc new file mode 100644 index 0000000000000..cc26982fef110 --- /dev/null +++ b/cpp/src/arrow/compute/util_internal.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/util_internal.h" + +#include "arrow/compute/util.h" +#include "arrow/memory_pool.h" + +namespace arrow { +namespace util { + +Status TempVectorStack::Init(MemoryPool* pool, int64_t size) { + num_vectors_ = 0; + top_ = 0; + buffer_size_ = EstimatedAllocationSize(size); + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); + // Ensure later operations don't accidentally read uninitialized memory. 
+ std::memset(buffer->mutable_data(), 0xFF, size); + buffer_ = std::move(buffer); + return Status::OK(); +} + +int64_t TempVectorStack::PaddedAllocationSize(int64_t num_bytes) { + // Round up allocation size to multiple of 8 bytes + // to avoid returning temp vectors with unaligned address. + // + // Also add padding at the end to facilitate loads and stores + // using SIMD when number of vector elements is not divisible + // by the number of SIMD lanes. + // + return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; +} + +void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { + int64_t estimated_alloc_size = EstimatedAllocationSize(num_bytes); + int64_t new_top = top_ + estimated_alloc_size; + // Stack overflow check (see GH-39582). + // XXX cannot return a regular Status because most consumers do not either. + ARROW_CHECK_LE(new_top, buffer_size_) + << "TempVectorStack::alloc overflow: allocating " << estimated_alloc_size + << " on top of " << top_ << " in stack of size " << buffer_size_; + *data = buffer_->mutable_data() + top_ + sizeof(uint64_t); + // We set 8 bytes before the beginning of the allocated range and + // 8 bytes after the end to check for stack overflow (which would + // result in those known bytes being corrupted). + reinterpret_cast(buffer_->mutable_data() + top_)[0] = kGuard1; + reinterpret_cast(buffer_->mutable_data() + new_top)[-1] = kGuard2; + *id = num_vectors_++; + top_ = new_top; +} + +void TempVectorStack::release(int id, uint32_t num_bytes) { + ARROW_DCHECK(num_vectors_ == id + 1); + int64_t size = EstimatedAllocationSize(num_bytes); + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == + kGuard2); + ARROW_DCHECK(top_ >= size); + top_ -= size; + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == + kGuard1); + --num_vectors_; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h index 87e89a3350721..043ff118062e4 100644 --- a/cpp/src/arrow/compute/util_internal.h +++ b/cpp/src/arrow/compute/util_internal.h @@ -17,6 +17,8 @@ #pragma once +#include "arrow/status.h" +#include "arrow/type_fwd.h" #include "arrow/util/logging.h" namespace arrow { @@ -27,5 +29,56 @@ void CheckAlignment(const void* ptr) { ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); } +/// Storage used to allocate temporary vectors of a batch size. +/// Temporary vectors should resemble allocating temporary variables on the stack +/// but in the context of vectorized processing where we need to store a vector of +/// temporaries instead of a single value. 
+class ARROW_EXPORT TempVectorStack { + template + friend class TempVectorHolder; + + public: + Status Init(MemoryPool* pool, int64_t size); + + int64_t AllocatedSize() const { return top_; } + + private: + static int64_t EstimatedAllocationSize(int64_t size) { + return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); + } + + static int64_t PaddedAllocationSize(int64_t num_bytes); + + void alloc(uint32_t num_bytes, uint8_t** data, int* id); + void release(int id, uint32_t num_bytes); + static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; + static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; + static constexpr int64_t kPadding = 64; + int num_vectors_; + int64_t top_; + std::unique_ptr buffer_; + int64_t buffer_size_; +}; + +template +class TempVectorHolder { + friend class TempVectorStack; + + public: + ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } + T* mutable_data() { return reinterpret_cast(data_); } + TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { + stack_ = stack; + num_elements_ = num_elements; + stack_->alloc(num_elements * sizeof(T), &data_, &id_); + } + + private: + TempVectorStack* stack_; + uint8_t* data_; + int id_; + uint32_t num_elements_; +}; + } // namespace util } // namespace arrow
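
---

Note on the sizing arithmetic: the patch replaces the old hard-coded per-thread stack size (`32 * MiniBatch::kMiniBatchLength * sizeof(uint64_t)` in the removed `QueryContext::GetTempStack`) with explicit per-user constants (`Hashing32/64::kHashBatchTempStackUsage`, `BloomFilterPushdownContext::kTempStackUsage`, `SwissJoin::kTempStackUsage`, `KeyCompare::CompareColumnsToRowsTempStackUsage`). The standalone sketch below merely restates that arithmetic so the numbers can be checked in isolation; it assumes `util::MiniBatch::kMiniBatchLength == 1024`, which is not spelled out in the diff.

```cpp
// Standalone restatement of the temp-stack usage constants introduced by this patch.
// Assumption: util::MiniBatch::kMiniBatchLength == 1024 (not shown in the diff).
#include <cstdint>
#include <cstdio>

constexpr int64_t kMiniBatchLength = 1024;

// Hashing32::HashBatch: one uint32 hash buffer, one uint16 null-index buffer and one
// uint32 null-hash buffer per mini-batch, plus one extra byte per row for padding.
constexpr int64_t kHashing32HashBatchTempStackUsage =
    (sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint32_t) + /*extra=*/1) *
    kMiniBatchLength;

// Hashing64::HashBatch: one uint16 null-index buffer and one uint64 null-hash buffer.
constexpr int64_t kHashing64HashBatchTempStackUsage =
    (sizeof(uint16_t) + sizeof(uint64_t) + /*extra=*/1) * kMiniBatchLength;

// BloomFilterPushdownContext: HashBatch's needs plus its own uint32 hash buffer.
constexpr int64_t kBloomFilterPushdownTempStackUsage =
    kHashing32HashBatchTempStackUsage +
    (sizeof(uint32_t) + /*extra=*/1) * kMiniBatchLength;

// SwissJoin: 64 mini-batch lengths per thread, as declared in swiss_join.cc.
constexpr int64_t kSwissJoinTempStackUsage = 64 * kMiniBatchLength;

// KeyCompare::CompareColumnsToRows: three uint8 match vectors of size num_rows,
// plus one mini-batch length of padding/alignment slack.
constexpr int64_t CompareColumnsToRowsTempStackUsage(int64_t num_rows) {
  return 3 * static_cast<int64_t>(sizeof(uint8_t)) * num_rows + kMiniBatchLength;
}

int main() {
  std::printf("Hashing32::HashBatch           : %lld bytes\n",
              (long long)kHashing32HashBatchTempStackUsage);
  std::printf("Hashing64::HashBatch           : %lld bytes\n",
              (long long)kHashing64HashBatchTempStackUsage);
  std::printf("Bloom filter pushdown          : %lld bytes\n",
              (long long)kBloomFilterPushdownTempStackUsage);
  std::printf("SwissJoin per-thread stack     : %lld bytes\n",
              (long long)kSwissJoinTempStackUsage);
  std::printf("CompareColumnsToRows(1024 rows): %lld bytes\n",
              (long long)CompareColumnsToRowsTempStackUsage(1024));
  return 0;
}
```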
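The overflow check in `TempVectorStack::alloc` (moved unchanged into `util_internal.cc` above) is easy to miss in the diff: every allocation is padded, bracketed by two 64-bit guard words, and bounds-checked against the buffer size, while `release` verifies the guards in LIFO order. The following is a simplified, self-contained rendition of that mechanism, not the Arrow class itself; it drops `MemoryPool`, `ResizableBuffer` and `TempVectorHolder` in favor of a plain `std::vector` so it compiles on its own.

```cpp
// Simplified sketch of the TempVectorStack guard/overflow mechanism.
// Mirrors the logic in util_internal.cc; not the Arrow implementation.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

class MiniTempStack {
 public:
  explicit MiniTempStack(int64_t size)
      : buffer_size_(EstimatedAllocationSize(size)), buffer_(buffer_size_, 0xFF) {}

  // Returns num_bytes of scratch space; fails hard on overflow, like the
  // ARROW_CHECK_LE in TempVectorStack::alloc.
  uint8_t* Alloc(uint32_t num_bytes) {
    const int64_t new_top = top_ + EstimatedAllocationSize(num_bytes);
    assert(new_top <= buffer_size_ && "temp stack overflow");
    uint8_t* data = buffer_.data() + top_ + sizeof(uint64_t);
    // Guard words before and after the allocation detect overruns at release time.
    const uint64_t guard1 = kGuard1;
    const uint64_t guard2 = kGuard2;
    std::memcpy(buffer_.data() + top_, &guard1, sizeof(guard1));
    std::memcpy(buffer_.data() + new_top - static_cast<int64_t>(sizeof(guard2)),
                &guard2, sizeof(guard2));
    top_ = new_top;
    return data;
  }

  // LIFO release: callers must free in reverse order of allocation.
  void Release(uint32_t num_bytes) {
    uint64_t guard = 0;
    std::memcpy(&guard, buffer_.data() + top_ - static_cast<int64_t>(sizeof(guard)),
                sizeof(guard));
    assert(guard == kGuard2 && "guard word clobbered: buffer overrun");
    top_ -= EstimatedAllocationSize(num_bytes);
    std::memcpy(&guard, buffer_.data() + top_, sizeof(guard));
    assert(guard == kGuard1 && "guard word clobbered: buffer underrun");
  }

 private:
  // Round up to 8 bytes, add SIMD padding, then room for the two guard words.
  static int64_t EstimatedAllocationSize(int64_t num_bytes) {
    constexpr int64_t kPadding = 64;
    const int64_t padded = (num_bytes + 7) / 8 * 8 + kPadding;
    return padded + 2 * static_cast<int64_t>(sizeof(uint64_t));
  }

  static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
  static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
  int64_t buffer_size_;
  std::vector<uint8_t> buffer_;
  int64_t top_ = 0;
};

int main() {
  MiniTempStack stack(/*size=*/1 << 16);
  uint8_t* scratch = stack.Alloc(1024);  // e.g. one mini-batch of uint8 values
  std::memset(scratch, 0, 1024);
  stack.Release(1024);
  return 0;
}
```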
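The structural change itself follows one pattern in both `BloomFilterPushdownContext` and `SwissJoin`: instead of borrowing a lazily created per-thread stack from `QueryContext::GetTempStack`, each node-level component declares its worst-case usage as a compile-time constant and initializes its own per-thread stacks in `Init`, so total capacity grows linearly with the number of nodes. A minimal sketch of that ownership pattern is below; `Status`, `TempStack`, and `HashLikeNode` are simplified stand-ins, not the Arrow classes, and the usage figure is an assumed placeholder.

```cpp
// Sketch of the per-node temp-stack ownership pattern introduced by this PR.
// Status, TempStack and HashLikeNode are simplified stand-ins, not Arrow classes.
#include <cstdint>
#include <vector>

struct Status {
  bool ok = true;
  static Status OK() { return {}; }
};

struct TempStack {
  std::vector<uint8_t> buffer;
  Status Init(int64_t size) {
    buffer.assign(static_cast<size_t>(size), 0xFF);
    return Status::OK();
  }
};

// Before: one fixed-size stack per thread, shared by every node in the plan,
// which nested nodes could collectively overflow.
// After: each node sizes and owns its own per-thread stacks at Init time.
class HashLikeNode {
 public:
  // Every stack user advertises its worst-case usage as a compile-time constant.
  static constexpr int64_t kTempStackUsage = 64 * 1024;  // assumed figure

  Status Init(size_t num_threads) {
    tld_.resize(num_threads);
    for (auto& local : tld_) {
      Status st = local.stack.Init(kTempStackUsage);
      if (!st.ok) return st;
    }
    return Status::OK();
  }

  // Execution code grabs the stack for its thread index, replacing the old
  // ctx->GetTempStack(thread_index) call.
  TempStack* stack(size_t thread_index) { return &tld_[thread_index].stack; }

 private:
  struct ThreadLocalData {
    TempStack stack;
  };
  std::vector<ThreadLocalData> tld_;
};

int main() {
  HashLikeNode node;
  node.Init(/*num_threads=*/8);
  TempStack* stack = node.stack(/*thread_index=*/0);  // used during execution
  (void)stack;
  return 0;
}
```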