From 232024e38ba8b73c56a2d804afb8923e43a4cd84 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Mon, 15 Jan 2018 14:16:43 -0500 Subject: [PATCH 01/17] Update BinaryBuilder::Resize(int64_t capacity) in builder.cc When building BinaryArrays with a known size using Resize and Reserve methods, space is also reserved for value_data_builder_ to prevent internal reallocation --- cpp/src/arrow/builder.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index de132b5f6a0d1..51b94374c85fc 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1223,6 +1223,7 @@ Status BinaryBuilder::Resize(int64_t capacity) { DCHECK_LT(capacity, std::numeric_limits::max()); // one more then requested for offsets RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); + RETURN_NOT_OK(value_data_builder_.Resize(capacity)); return ArrayBuilder::Resize(capacity); } From 5b73c1c59971cfed3c8e2d97380e41eae988ec8c Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Mon, 15 Jan 2018 16:01:58 -0500 Subject: [PATCH 02/17] Update again BinaryBuilder::Resize(int64_t capacity) in builder.cc --- cpp/src/arrow/builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 51b94374c85fc..dac02d14e719b 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1223,7 +1223,7 @@ Status BinaryBuilder::Resize(int64_t capacity) { DCHECK_LT(capacity, std::numeric_limits::max()); // one more then requested for offsets RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); - RETURN_NOT_OK(value_data_builder_.Resize(capacity)); + RETURN_NOT_OK(value_data_builder_.Resize(capacity * sizeof(int64_t))); return ArrayBuilder::Resize(capacity); } From 5ebfb320cde650158e770ef3976f203facd538e0 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Tue, 16 Jan 2018 16:40:22 -0500 Subject: [PATCH 03/17] Add capacity() method for TypedBufferBuilder --- cpp/src/arrow/buffer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 450a4c78b5bbb..11cbea0802cef 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -314,6 +314,7 @@ class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { const T* data() const { return reinterpret_cast(data_); } int64_t length() const { return size_ / sizeof(T); } + int64_t capacity() const { return capacity_ / sizeof(T); } }; /// \brief Allocate a fixed size mutable buffer from a memory pool From e0434e61de1ff26df1856a57460cb95fb6863b25 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Tue, 16 Jan 2018 16:46:55 -0500 Subject: [PATCH 04/17] Add ReserveData(int64_t) and value_data_capacity() for methods for BinaryBuilder --- cpp/src/arrow/builder.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index ce7b8cd197da3..8f0daa67a8075 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -682,10 +682,13 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; + Status ReserveData(int64_t bytes) override; Status FinishInternal(std::shared_ptr* out) override; /// \return size of values buffer so far int64_t value_data_length() const { return value_data_builder_.length(); } + /// \return capacity of values buffer + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } /// Temporary access to a value. /// From de318f47e0f0995b425ca298585cbcba0b1ca7f3 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Tue, 16 Jan 2018 16:51:29 -0500 Subject: [PATCH 05/17] Implement ReserveData(int64_t) method for BinaryBuilder --- cpp/src/arrow/builder.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index dac02d14e719b..712e4194526df 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1222,10 +1222,14 @@ Status BinaryBuilder::Init(int64_t elements) { Status BinaryBuilder::Resize(int64_t capacity) { DCHECK_LT(capacity, std::numeric_limits::max()); // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); - RETURN_NOT_OK(value_data_builder_.Resize(capacity * sizeof(int64_t))); + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); } + +Status BinaryBuilder::ReserveData(int64_t capacity) { + DCHECK_LT(capacity, std::numeric_limits::max()); + return value_data_builder_.Resize(capacity * sizeof(int64_t)); +} Status BinaryBuilder::AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); From b002e0bd4835dd44fd836a4aa1fdaffa05e77b55 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Tue, 16 Jan 2018 17:02:01 -0500 Subject: [PATCH 06/17] Remove override keyword from ReserveData(int64_t) method for BinaryBuilder --- cpp/src/arrow/builder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 8f0daa67a8075..045334e5e4e2e 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -682,7 +682,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - Status ReserveData(int64_t bytes) override; + Status ReserveData(int64_t bytes); Status FinishInternal(std::shared_ptr* out) override; /// \return size of values buffer so far From 8dd5eaa9e44e64b9a98f34705d10d354c5f606bf Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Wed, 17 Jan 2018 15:56:16 -0500 Subject: [PATCH 07/17] Update builder.cc --- cpp/src/arrow/builder.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 712e4194526df..65bad76d7c51a 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1227,8 +1227,11 @@ Status BinaryBuilder::Resize(int64_t capacity) { } Status BinaryBuilder::ReserveData(int64_t capacity) { - DCHECK_LT(capacity, std::numeric_limits::max()); - return value_data_builder_.Resize(capacity * sizeof(int64_t)); + if(value_data_length.length() + capacity > std::numeric_limits::max()) { + return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 in length for binary data"); + } + + return value_data_builder_.Resize(value_data_length.length() + capacity); } Status BinaryBuilder::AppendNextOffset() { From 9b5e805966f5a824193c6ab80da65ba5d1e76b26 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Wed, 17 Jan 2018 16:24:48 -0500 Subject: [PATCH 08/17] Update ReserveData(int64_t) method signature for BinaryBuilder Replace parameter name from "bytes" to "capacity" to avoid confusion. --- cpp/src/arrow/builder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 045334e5e4e2e..59293bff4b8c2 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -682,7 +682,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - Status ReserveData(int64_t bytes); + Status ReserveData(int64_t capacity); Status FinishInternal(std::shared_ptr* out) override; /// \return size of values buffer so far From 5a5593e5305ec3f5475bc96f00e89902d83c9766 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Wed, 17 Jan 2018 16:40:52 -0500 Subject: [PATCH 09/17] Update again ReserveData(int64_t) method for BinaryBuilder --- cpp/src/arrow/builder.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 65bad76d7c51a..a6be2e707a223 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1227,11 +1227,11 @@ Status BinaryBuilder::Resize(int64_t capacity) { } Status BinaryBuilder::ReserveData(int64_t capacity) { - if(value_data_length.length() + capacity > std::numeric_limits::max()) { + if(value_data_length() + capacity > std::numeric_limits::max()) { return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 in length for binary data"); } - return value_data_builder_.Resize(value_data_length.length() + capacity); + return value_data_builder_.Resize(value_data_length() + capacity); } Status BinaryBuilder::AppendNextOffset() { From 15e045cc0d8c9aba4d967b9287378ecd0fedae9d Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Thu, 18 Jan 2018 11:44:50 -0500 Subject: [PATCH 10/17] Add test case for array-test.cc Add TestCapacityReserve to test space reservation for BinaryBuilder --- cpp/src/arrow/array-test.cc | 49 +++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 7ff3261ecba5e..19f3e73e31775 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1154,6 +1154,55 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { } } } + +TEST_F(TestBinaryBuilder, TestCapacityReserve) { + vector strings = {"a", "bb", "cc", "ddddd", "eeeee"}; + int N = static_cast(strings.size()); + int length = 0 + int data_length = 0; + int capacity = N; + + ASSERT_OK(builder_->Reserve(capacity)); + ASSERT_OK(builder_->ReserveData(capacity)); + + ASSERT_EQ(static_cast(builder_->length()), length); + ASSERT_EQ(static_cast(builder_->capacity()), capacity); + ASSERT_EQ(static_cast(builder_->value_data_length()), data_length); + ASSERT_EQ(static_cast(builder_->value_data_capacity()), capacity); + + for(const string& str : strings) { + ASSERT_OK(builder_->Append(str)); + + length++; + data_length += static_cast(str.size()); + + ASSERT_EQ(static_cast(builder_->length()), length); + ASSERT_EQ(static_cast(builder_->capacity()), capacity); + ASSERT_EQ(static_cast(builder_->value_data_length()), data_length); + if(data_length <= capacity) { + ASSERT_EQ(static_cast(builder_->value_data_capacity()), capacity); + } else { + ASSERT_EQ(static_cast(builder_->value_data_capacity()), data_length); + } + } + + int extra_capacity = 10; + + ASSERT_OK(builder_->Reserve(extra_capacity)); + ASSERT_OK(builder_->ReserveData(extra_capacity)); + + ASSERT_EQ(static_cast(builder_->length()), length); + ASSERT_EQ(static_cast(builder_->capacity()), length + extra_capacity); + ASSERT_EQ(static_cast(builder_->value_data_length()), data_length); + ASSERT_EQ(static_cast(builder_->value_data_capacity()), data_length + extra_capacity); + + Done(); + + ASSERT_EQ(N, result_->length()); + ASSERT_EQ(0, result_->null_count()); + ASSERT_EQ(data_length, result_->value_data()->size()); + ASSERT_EQ(data_length + extra_capacity, result_->value_data()->capacity()); +} TEST_F(TestBinaryBuilder, TestZeroLength) { // All buffers are null From bbc6527065c0f43f82056ccc2a494e8a81a3c412 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Thu, 18 Jan 2018 17:14:23 -0500 Subject: [PATCH 11/17] ARROW-1945: [C++] Update test case for BinaryBuild data value space reservation --- cpp/src/arrow/array-test.cc | 47 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 19f3e73e31775..a77615ae0c956 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1157,51 +1157,50 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { TEST_F(TestBinaryBuilder, TestCapacityReserve) { vector strings = {"a", "bb", "cc", "ddddd", "eeeee"}; - int N = static_cast(strings.size()); - int length = 0 - int data_length = 0; - int capacity = N; + int64_t N = static_cast(strings.size()); + int64_t length = 0 + int64_t data_length = 0; + int64_t capacity = N; ASSERT_OK(builder_->Reserve(capacity)); ASSERT_OK(builder_->ReserveData(capacity)); - ASSERT_EQ(static_cast(builder_->length()), length); - ASSERT_EQ(static_cast(builder_->capacity()), capacity); - ASSERT_EQ(static_cast(builder_->value_data_length()), data_length); - ASSERT_EQ(static_cast(builder_->value_data_capacity()), capacity); + ASSERT_EQ(builder_->length(), length); + ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(capacity)); + ASSERT_EQ(builder_->value_data_length(), data_length); + ASSERT_EQ(builder_->value_data_capacity(), capacity); for(const string& str : strings) { ASSERT_OK(builder_->Append(str)); - length++; data_length += static_cast(str.size()); - ASSERT_EQ(static_cast(builder_->length()), length); - ASSERT_EQ(static_cast(builder_->capacity()), capacity); - ASSERT_EQ(static_cast(builder_->value_data_length()), data_length); - if(data_length <= capacity) { - ASSERT_EQ(static_cast(builder_->value_data_capacity()), capacity); + ASSERT_EQ(builder_->length(), length); + ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(capacity)); + ASSERT_EQ(builder_->value_data_length(), data_length); + if (data_length <= capacity) { + ASSERT_EQ(builder_->value_data_capacity(), capacity); } else { - ASSERT_EQ(static_cast(builder_->value_data_capacity()), data_length); + ASSERT_EQ(builder_->value_data_capacity(), data_length); } } - int extra_capacity = 10; + int extra_capacity = 20; ASSERT_OK(builder_->Reserve(extra_capacity)); ASSERT_OK(builder_->ReserveData(extra_capacity)); - ASSERT_EQ(static_cast(builder_->length()), length); - ASSERT_EQ(static_cast(builder_->capacity()), length + extra_capacity); - ASSERT_EQ(static_cast(builder_->value_data_length()), data_length); - ASSERT_EQ(static_cast(builder_->value_data_capacity()), data_length + extra_capacity); + ASSERT_EQ(builder_->length(), length); + ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(length + extra_capacity)); + ASSERT_EQ(builder_->value_data_length(), data_length); + ASSERT_EQ(builder_->value_data_capacity(), data_length + extra_capacity); Done(); - ASSERT_EQ(N, result_->length()); - ASSERT_EQ(0, result_->null_count()); - ASSERT_EQ(data_length, result_->value_data()->size()); - ASSERT_EQ(data_length + extra_capacity, result_->value_data()->capacity()); + ASSERT_EQ(result_->length(), N); + ASSERT_EQ(result_->null_count(), 0); + ASSERT_EQ(result_->value_data()->size(), data_length); + ASSERT_EQ(result_->value_data()->capacity(), BitUtil::RoundUpToMultipleOf64(data_length + extra_capacity)); } TEST_F(TestBinaryBuilder, TestZeroLength) { From 18f90fb80024a034205b06819b12dc8bacd26766 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Thu, 18 Jan 2018 17:56:47 -0500 Subject: [PATCH 12/17] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data data_capacity_ represents the indicated capacity for value_data_builder and it is always smaller than or equal to the actual capacity of underlying value_data_builder (data_capacity_ <= value_data_builder.capacity()). That's because when we say: ReserveData(capacity); The new capacity is max(data_capacity_, data length + capacity), and data_capacity_ is set to be equal to new capacity but underlying buffer size is set to BitUtil::RoundUpToMultipleOf64(new capacity) to ensure that the capacity of the buffer is a multiple of 64 bytes as defined in Layout.md. That's why data_capacity_ is needed to show the indicated capacity of the BinaryBuilder, just like ArrayBuilder::capacity_ indicates the indicated capacity of ArrayBuilder. A safety check is added in BinaryBuilder::Append() to update data_capacity_ if data length is greater than data_capacity_. The reason is that data_capacity is updated in ResearveData(). But if users make mistakes to append too much data, data length might be larger than data_capacity_ (data length <= actual capacity of underlying value_data_builder). If this happens data_capacity_ is set equal to data length to avoid confusion. --- cpp/src/arrow/builder.cc | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index a6be2e707a223..63588867dfce0 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1208,7 +1208,7 @@ ArrayBuilder* ListBuilder::value_builder() const { // String and binary BinaryBuilder::BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} + : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool), data_capacity_(0) {} BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} @@ -1227,11 +1227,15 @@ Status BinaryBuilder::Resize(int64_t capacity) { } Status BinaryBuilder::ReserveData(int64_t capacity) { - if(value_data_length() + capacity > std::numeric_limits::max()) { + if (value_data_length() + capacity > data_capacity_) { + if (value_data_length() + capacity > std::numeric_limits::max()) { return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 in length for binary data"); - } - - return value_data_builder_.Resize(value_data_length() + capacity); + } + + RETURN_NOT_OK(value_data_builder_.Resize(value_data_length() + capacity)); + data_capacity_ = value_data_length() + capacity; + } + return Status::OK(); } Status BinaryBuilder::AppendNextOffset() { @@ -1249,6 +1253,9 @@ Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { RETURN_NOT_OK(Reserve(1)); RETURN_NOT_OK(AppendNextOffset()); RETURN_NOT_OK(value_data_builder_.Append(value, length)); + if (data_capacity_ < value_data_length()) { + data_capacity_ = value_data_length(); + } UnsafeAppendToBitmap(true); return Status::OK(); } From 0b078955eeb18e8b039c51bb25850f63b947b3cd Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Thu, 18 Jan 2018 18:04:23 -0500 Subject: [PATCH 13/17] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data --- cpp/src/arrow/builder.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 59293bff4b8c2..a708122c2930b 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -688,7 +688,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { /// \return size of values buffer so far int64_t value_data_length() const { return value_data_builder_.length(); } /// \return capacity of values buffer - int64_t value_data_capacity() const { return value_data_builder_.capacity(); } + int64_t value_data_capacity() const { return data_capacity_; } /// Temporary access to a value. /// @@ -699,6 +699,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; + int64_t data_capacity_; static constexpr int64_t kMaximumCapacity = std::numeric_limits::max() - 1; Status AppendNextOffset(); From d3c8202b5fe85a5cd1087ec84f85addac2003b1e Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Thu, 18 Jan 2018 18:08:50 -0500 Subject: [PATCH 14/17] ARROW-1945: [C++] Fix a small typo --- cpp/src/arrow/array-test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index a77615ae0c956..743b40c79f1d1 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1158,7 +1158,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { TEST_F(TestBinaryBuilder, TestCapacityReserve) { vector strings = {"a", "bb", "cc", "ddddd", "eeeee"}; int64_t N = static_cast(strings.size()); - int64_t length = 0 + int64_t length = 0; int64_t data_length = 0; int64_t capacity = N; From bc5db7d39a38724cd723435e78b233e17719e0c3 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Fri, 19 Jan 2018 19:16:21 -0500 Subject: [PATCH 15/17] ARROW-1712: [C++] Remove unneeded data member in BinaryBuilder and modify test case --- cpp/src/arrow/array-test.cc | 55 ++++++++++++------------------------- cpp/src/arrow/builder.cc | 17 ++++-------- cpp/src/arrow/builder.h | 7 +++-- 3 files changed, 28 insertions(+), 51 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 743b40c79f1d1..e350b842fb2e4 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1156,51 +1156,32 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { } TEST_F(TestBinaryBuilder, TestCapacityReserve) { - vector strings = {"a", "bb", "cc", "ddddd", "eeeee"}; - int64_t N = static_cast(strings.size()); + vector strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", "dddddddddddddddddddd", "eeeeeeeeee"}; + int N = static_cast(strings.size()); + int reps = 10; int64_t length = 0; - int64_t data_length = 0; - int64_t capacity = N; - - ASSERT_OK(builder_->Reserve(capacity)); + int64_t capacity = 1000; + + ASSERT_OK(builder_->ReserveData(capacity)); - ASSERT_EQ(builder_->length(), length); - ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(capacity)); - ASSERT_EQ(builder_->value_data_length(), data_length); - ASSERT_EQ(builder_->value_data_capacity(), capacity); + ASSERT_EQ(length, builder_->value_data_length()); + ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(capacity), builder_->value_data_capacity()); - for(const string& str : strings) { - ASSERT_OK(builder_->Append(str)); - length++; - data_length += static_cast(str.size()); + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + ASSERT_OK(builder_->Append(strings[i])); + length += static_cast(strings[i].size()); - ASSERT_EQ(builder_->length(), length); - ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(capacity)); - ASSERT_EQ(builder_->value_data_length(), data_length); - if (data_length <= capacity) { - ASSERT_EQ(builder_->value_data_capacity(), capacity); - } else { - ASSERT_EQ(builder_->value_data_capacity(), data_length); + ASSERT_EQ(length, builder_->value_data_length()); + ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(capacity), builder_->value_data_capacity()); } } - - int extra_capacity = 20; - - ASSERT_OK(builder_->Reserve(extra_capacity)); - ASSERT_OK(builder_->ReserveData(extra_capacity)); - - ASSERT_EQ(builder_->length(), length); - ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(length + extra_capacity)); - ASSERT_EQ(builder_->value_data_length(), data_length); - ASSERT_EQ(builder_->value_data_capacity(), data_length + extra_capacity); - Done(); - - ASSERT_EQ(result_->length(), N); - ASSERT_EQ(result_->null_count(), 0); - ASSERT_EQ(result_->value_data()->size(), data_length); - ASSERT_EQ(result_->value_data()->capacity(), BitUtil::RoundUpToMultipleOf64(data_length + extra_capacity)); + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(0, result_->null_count()); + ASSERT_EQ(reps * 60, result_->value_data()->size()); + ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(capacity), result_->value_data()->capacity()); } TEST_F(TestBinaryBuilder, TestZeroLength) { diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 63588867dfce0..f85da527f1b19 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1208,7 +1208,7 @@ ArrayBuilder* ListBuilder::value_builder() const { // String and binary BinaryBuilder::BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool), data_capacity_(0) {} + : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} @@ -1226,14 +1226,12 @@ Status BinaryBuilder::Resize(int64_t capacity) { return ArrayBuilder::Resize(capacity); } -Status BinaryBuilder::ReserveData(int64_t capacity) { - if (value_data_length() + capacity > data_capacity_) { - if (value_data_length() + capacity > std::numeric_limits::max()) { +Status BinaryBuilder::ReserveData(int64_t elements) { + if (value_data_length() + elements > value_data_capacity()) { + if (value_data_length() + elements > std::numeric_limits::max()) { return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 in length for binary data"); - } - - RETURN_NOT_OK(value_data_builder_.Resize(value_data_length() + capacity)); - data_capacity_ = value_data_length() + capacity; + } + RETURN_NOT_OK(value_data_builder_.Reserve(elements)); } return Status::OK(); } @@ -1253,9 +1251,6 @@ Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { RETURN_NOT_OK(Reserve(1)); RETURN_NOT_OK(AppendNextOffset()); RETURN_NOT_OK(value_data_builder_.Append(value, length)); - if (data_capacity_ < value_data_length()) { - data_capacity_ = value_data_length(); - } UnsafeAppendToBitmap(true); return Status::OK(); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index a708122c2930b..886fdc7226eb6 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -682,13 +682,15 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - Status ReserveData(int64_t capacity); + /// Ensures there is enough space for adding the number of value elements + /// by checking value buffer capacity and resizing if necessary. + Status ReserveData(int64_t elements); Status FinishInternal(std::shared_ptr* out) override; /// \return size of values buffer so far int64_t value_data_length() const { return value_data_builder_.length(); } /// \return capacity of values buffer - int64_t value_data_capacity() const { return data_capacity_; } + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } /// Temporary access to a value. /// @@ -699,7 +701,6 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; - int64_t data_capacity_; static constexpr int64_t kMaximumCapacity = std::numeric_limits::max() - 1; Status AppendNextOffset(); From d4bbd151c7628d8c6dfa51e7a243765fd4a10961 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Mon, 22 Jan 2018 16:29:43 -0500 Subject: [PATCH 16/17] ARROW-1712: [C++] Modify test case for BinaryBuilder::ReserveData() and change arguments for offsets_builder_.Resize() --- cpp/src/arrow/array-test.cc | 10 ++++++++-- cpp/src/arrow/builder.cc | 6 +++--- cpp/src/arrow/builder.h | 4 ++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index e350b842fb2e4..a2be323a13385 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1162,7 +1162,6 @@ TEST_F(TestBinaryBuilder, TestCapacityReserve) { int64_t length = 0; int64_t capacity = 1000; - ASSERT_OK(builder_->ReserveData(capacity)); ASSERT_EQ(length, builder_->value_data_length()); @@ -1177,11 +1176,18 @@ TEST_F(TestBinaryBuilder, TestCapacityReserve) { ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(capacity), builder_->value_data_capacity()); } } + + int extra_capacity = 500; + ASSERT_OK(builder_->ReserveData(extra_capacity)); + + ASSERT_EQ(length, builder_->value_data_length()); + ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(length + extra_capacity), builder_->value_data_capacity()); + Done(); ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(0, result_->null_count()); ASSERT_EQ(reps * 60, result_->value_data()->size()); - ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(capacity), result_->value_data()->capacity()); + ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(length + extra_capacity), result_->value_data()->capacity()); } TEST_F(TestBinaryBuilder, TestZeroLength) { diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index f85da527f1b19..e5765a028ad5a 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1165,13 +1165,13 @@ Status ListBuilder::Init(int64_t elements) { DCHECK_LT(elements, std::numeric_limits::max()); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets - return offsets_builder_.Resize((elements + 1) * sizeof(int64_t)); + return offsets_builder_.Resize((elements + 1) * sizeof(int32_t)); } Status ListBuilder::Resize(int64_t capacity) { DCHECK_LT(capacity, std::numeric_limits::max()); // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); } @@ -1216,7 +1216,7 @@ Status BinaryBuilder::Init(int64_t elements) { DCHECK_LT(elements, std::numeric_limits::max()); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets - return offsets_builder_.Resize((elements + 1) * sizeof(int64_t)); + return offsets_builder_.Resize((elements + 1) * sizeof(int32_t)); } Status BinaryBuilder::Resize(int64_t capacity) { diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 886fdc7226eb6..49c43b27ef6a4 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -682,8 +682,8 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - /// Ensures there is enough space for adding the number of value elements - /// by checking value buffer capacity and resizing if necessary. + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations Status ReserveData(int64_t elements); Status FinishInternal(std::shared_ptr* out) override; From 707b67bf7c11f67c25d65945d7e14aafefef57fc Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Tue, 23 Jan 2018 13:53:08 -0500 Subject: [PATCH 17/17] ARROW-1712: [C++] Fix lint errors --- cpp/src/arrow/array-test.cc | 30 +++++++++++++++++------------- cpp/src/arrow/builder.cc | 6 +++--- cpp/src/arrow/builder.h | 2 +- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index a2be323a13385..c53da8591e94e 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1154,40 +1154,44 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { } } } - + TEST_F(TestBinaryBuilder, TestCapacityReserve) { - vector strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", "dddddddddddddddddddd", "eeeeeeeeee"}; + vector strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", "dddddddddd"}; int N = static_cast(strings.size()); - int reps = 10; + int reps = 15; int64_t length = 0; int64_t capacity = 1000; - + int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity); + ASSERT_OK(builder_->ReserveData(capacity)); - + ASSERT_EQ(length, builder_->value_data_length()); - ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(capacity), builder_->value_data_capacity()); - + ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); + for (int j = 0; j < reps; ++j) { for (int i = 0; i < N; ++i) { ASSERT_OK(builder_->Append(strings[i])); length += static_cast(strings[i].size()); - + ASSERT_EQ(length, builder_->value_data_length()); - ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(capacity), builder_->value_data_capacity()); + ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); } } - + int extra_capacity = 500; + expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity); + ASSERT_OK(builder_->ReserveData(extra_capacity)); ASSERT_EQ(length, builder_->value_data_length()); - ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(length + extra_capacity), builder_->value_data_capacity()); + ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); Done(); + ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(0, result_->null_count()); - ASSERT_EQ(reps * 60, result_->value_data()->size()); - ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(length + extra_capacity), result_->value_data()->capacity()); + ASSERT_EQ(reps * 40, result_->value_data()->size()); + ASSERT_EQ(expected_capacity, result_->value_data()->capacity()); } TEST_F(TestBinaryBuilder, TestZeroLength) { diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index e5765a028ad5a..db901526fc2ee 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1225,14 +1225,14 @@ Status BinaryBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); } - + Status BinaryBuilder::ReserveData(int64_t elements) { if (value_data_length() + elements > value_data_capacity()) { if (value_data_length() + elements > std::numeric_limits::max()) { - return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 in length for binary data"); + return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary"); } RETURN_NOT_OK(value_data_builder_.Reserve(elements)); - } + } return Status::OK(); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 49c43b27ef6a4..d1611f60cd924 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -682,7 +682,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - /// \brief Ensures there is enough allocated capacity to append the indicated + /// \brief Ensures there is enough allocated capacity to append the indicated /// number of bytes to the value data buffer without additional allocations Status ReserveData(int64_t elements); Status FinishInternal(std::shared_ptr* out) override;