From 676c32ccea6274c75b2750453c1ddbc5f645c037 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 17 Oct 2016 21:18:30 -0400 Subject: [PATCH] ARROW-317: Add Slice, Copy methods to Buffer There's also a little bit of naming cleanup in `bit-util.h`, pardon the diff noise. Author: Wes McKinney Closes #177 from wesm/ARROW-317 and squashes the following commits: 0666b22 [Wes McKinney] Fix up pyarrow usage of BitUtil 3ab4e7a [Wes McKinney] Add Slice, Copy methods to Buffer cb9519d [Wes McKinney] Use more conforming names in bit-util.h --- cpp/src/arrow/array.cc | 3 +- cpp/src/arrow/array.h | 2 +- cpp/src/arrow/builder.cc | 12 ++++---- cpp/src/arrow/column-benchmark.cc | 2 +- cpp/src/arrow/ipc/adapter.cc | 5 ++-- cpp/src/arrow/ipc/test-common.h | 3 +- cpp/src/arrow/test-util.h | 6 ++-- cpp/src/arrow/types/list.cc | 2 +- cpp/src/arrow/types/primitive-test.cc | 16 +++++------ cpp/src/arrow/types/primitive.cc | 13 +++++---- cpp/src/arrow/types/primitive.h | 12 ++++---- cpp/src/arrow/util/bit-util-test.cc | 36 +++++++++++------------ cpp/src/arrow/util/bit-util.cc | 10 +++---- cpp/src/arrow/util/bit-util.h | 29 +++++++++---------- cpp/src/arrow/util/buffer-test.cc | 41 +++++++++++++++++++++++++++ cpp/src/arrow/util/buffer.cc | 28 +++++++++++++++++- cpp/src/arrow/util/buffer.h | 23 +++++++++++---- python/src/pyarrow/adapters/pandas.cc | 20 ++++++------- 18 files changed, 173 insertions(+), 90 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index d6b081f315532..e432a53781f17 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -19,6 +19,7 @@ #include +#include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" @@ -43,7 +44,7 @@ bool Array::EqualsExact(const Array& other) const { return false; } if (null_count_ > 0) { - return null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); + return null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); } return true; } diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index c7ffb23ca18a1..ff37323f60519 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -45,7 +45,7 @@ class ARROW_EXPORT Array { // Determine if a slot is null. For inner loops. Does *not* boundscheck bool IsNull(int i) const { - return null_count_ > 0 && util::bit_not_set(null_bitmap_data_, i); + return null_count_ > 0 && BitUtil::BitNotSet(null_bitmap_data_, i); } int32_t length() const { return length_; } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 1fba96169228f..151b257a3d894 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -31,7 +31,7 @@ Status ArrayBuilder::AppendToBitmap(bool is_valid) { // TODO(emkornfield) doubling isn't great default allocation practice // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md // fo discussion - RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); + RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); } UnsafeAppendToBitmap(is_valid); return Status::OK(); @@ -45,7 +45,7 @@ Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int32_t length) } Status ArrayBuilder::Init(int32_t capacity) { - int32_t to_alloc = util::ceil_byte(capacity) / 8; + int32_t to_alloc = BitUtil::CeilByte(capacity) / 8; null_bitmap_ = std::make_shared(pool_); RETURN_NOT_OK(null_bitmap_->Resize(to_alloc)); // Buffers might allocate more then necessary to satisfy padding requirements @@ -58,7 +58,7 @@ Status ArrayBuilder::Init(int32_t capacity) { Status ArrayBuilder::Resize(int32_t new_bits) { if (!null_bitmap_) { return Init(new_bits); } - int32_t new_bytes = util::ceil_byte(new_bits) / 8; + int32_t new_bytes = BitUtil::CeilByte(new_bits) / 8; int32_t old_bytes = null_bitmap_->size(); RETURN_NOT_OK(null_bitmap_->Resize(new_bytes)); null_bitmap_data_ = null_bitmap_->mutable_data(); @@ -82,7 +82,7 @@ Status ArrayBuilder::Advance(int32_t elements) { Status ArrayBuilder::Reserve(int32_t elements) { if (length_ + elements > capacity_) { // TODO(emkornfield) power of 2 growth is potentially suboptimal - int32_t new_capacity = util::next_power2(length_ + elements); + int32_t new_capacity = BitUtil::NextPower2(length_ + elements); return Resize(new_capacity); } return Status::OK(); @@ -96,7 +96,7 @@ Status ArrayBuilder::SetNotNull(int32_t length) { void ArrayBuilder::UnsafeAppendToBitmap(bool is_valid) { if (is_valid) { - util::set_bit(null_bitmap_data_, length_); + BitUtil::SetBit(null_bitmap_data_, length_); } else { ++null_count_; } @@ -118,7 +118,7 @@ void ArrayBuilder::UnsafeSetNotNull(int32_t length) { const int32_t new_length = length + length_; // TODO(emkornfield) Optimize for large values of length? for (int32_t i = length_; i < new_length; ++i) { - util::set_bit(null_bitmap_data_, i); + BitUtil::SetBit(null_bitmap_data_, i); } length_ = new_length; } diff --git a/cpp/src/arrow/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc index edea0948860de..f429a813c6f20 100644 --- a/cpp/src/arrow/column-benchmark.cc +++ b/cpp/src/arrow/column-benchmark.cc @@ -29,7 +29,7 @@ std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { auto data = std::make_shared(pool); auto null_bitmap = std::make_shared(pool); data->Resize(length * sizeof(typename ArrayType::value_type)); - null_bitmap->Resize(util::bytes_for_bits(length)); + null_bitmap->Resize(BitUtil::BytesForBits(length)); return std::make_shared(length, data, 10, null_bitmap); } } // anonymous namespace diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index f84cb264f70e1..74786bf85ffb4 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -37,6 +37,7 @@ #include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" #include "arrow/util/status.h" @@ -49,7 +50,7 @@ namespace ipc { namespace { Status CheckMultipleOf64(int64_t size) { - if (util::is_multiple_of_64(size)) { return Status::OK(); } + if (BitUtil::IsMultipleOf64(size)) { return Status::OK(); } return Status::Invalid( "Attempted to write a buffer that " "wasn't a multiple of 64 bytes"); @@ -155,7 +156,7 @@ class RecordBatchWriter { // The buffer might be null if we are handling zero row lengths. if (buffer) { size = buffer->size(); - padding = util::RoundUpToMultipleOf64(size) - size; + padding = BitUtil::RoundUpToMultipleOf64(size) - size; } // TODO(wesm): We currently have no notion of shared memory page id's, diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 13bbbebde8aa1..784e238e977c7 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -31,6 +31,7 @@ #include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" @@ -263,7 +264,7 @@ Status MakeStruct(std::shared_ptr* out) { std::vector null_bytes(list_batch->num_rows(), 1); null_bytes[0] = 0; std::shared_ptr null_bitmask; - RETURN_NOT_OK(util::bytes_to_bits(null_bytes, &null_bitmask)); + RETURN_NOT_OK(BitUtil::BytesToBits(null_bytes, &null_bitmask)); ArrayPtr with_nulls( new StructArray(type, list_batch->num_rows(), columns, 1, null_bitmask)); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index e632ffb1d892d..ac56f5ed0871c 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -69,7 +69,7 @@ class TestBase : public ::testing::Test { auto data = std::make_shared(pool_); auto null_bitmap = std::make_shared(pool_); EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); - EXPECT_OK(null_bitmap->Resize(util::bytes_for_bits(length))); + EXPECT_OK(null_bitmap->Resize(BitUtil::BytesForBits(length))); return std::make_shared(length, data, 10, null_bitmap); } @@ -152,7 +152,7 @@ static inline int bitmap_popcount(const uint8_t* data, int length) { // versions of popcount but the code complexity is likely not worth it) const int loop_tail_index = fast_counts * pop_len; for (int i = loop_tail_index; i < length; ++i) { - if (util::get_bit(data, i)) { ++count; } + if (BitUtil::GetBit(data, i)) { ++count; } } return count; @@ -170,7 +170,7 @@ std::shared_ptr bytes_to_null_buffer(const std::vector& bytes) std::shared_ptr out; // TODO(wesm): error checking - util::bytes_to_bits(bytes, &out); + BitUtil::BytesToBits(bytes, &out); return out; } diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index ef2ec22cb5336..4b1e821472795 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -30,7 +30,7 @@ bool ListArray::EqualsExact(const ListArray& other) const { bool equal_null_bitmap = true; if (null_count_ > 0) { equal_null_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); } if (!equal_null_bitmap) { return false; } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 121bd4794f291..e47f6dc74fb7e 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -236,7 +236,7 @@ void TestPrimitiveBuilder::Check( for (int i = 0; i < result->length(); ++i) { if (nullable) { ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; } - bool actual = util::get_bit(result->raw_data(), i); + bool actual = BitUtil::GetBit(result->raw_data(), i); ASSERT_EQ(static_cast(draws_[i]), actual) << i; } ASSERT_TRUE(result->EqualsExact(*expected.get())); @@ -258,8 +258,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestInit) { int n = 1000; ASSERT_OK(this->builder_->Reserve(n)); - ASSERT_EQ(util::next_power2(n), this->builder_->capacity()); - ASSERT_EQ(util::next_power2(TypeTraits::bytes_required(n)), + ASSERT_EQ(BitUtil::NextPower2(n), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(TypeTraits::bytes_required(n)), this->builder_->data()->size()); // unsure if this should go in all builder classes @@ -409,10 +409,10 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { } ASSERT_EQ(size, this->builder_->length()); - ASSERT_EQ(util::next_power2(size), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity()); ASSERT_EQ(size, this->builder_nn_->length()); - ASSERT_EQ(util::next_power2(size), this->builder_nn_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(size), this->builder_nn_->capacity()); this->Check(this->builder_, true); this->Check(this->builder_nn_, false); @@ -444,7 +444,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) { ASSERT_OK(this->builder_nn_->Append(draws.data() + K, size - K)); ASSERT_EQ(size, this->builder_->length()); - ASSERT_EQ(util::next_power2(size), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity()); this->Check(this->builder_, true); this->Check(this->builder_nn_, false); @@ -472,7 +472,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { ASSERT_EQ(cap, this->builder_->capacity()); ASSERT_EQ(TypeTraits::bytes_required(cap), this->builder_->data()->size()); - ASSERT_EQ(util::bytes_for_bits(cap), this->builder_->null_bitmap()->size()); + ASSERT_EQ(BitUtil::BytesForBits(cap), this->builder_->null_bitmap()->size()); } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { @@ -484,7 +484,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) { ASSERT_OK(this->builder_->Advance(100)); ASSERT_OK(this->builder_->Reserve(kMinBuilderCapacity)); - ASSERT_EQ(util::next_power2(kMinBuilderCapacity + 100), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity()); } } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 3a05ccfdf1861..d2288bafa71da 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -19,6 +19,7 @@ #include +#include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" @@ -41,7 +42,7 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { if (null_count_ > 0) { bool equal_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, util::ceil_byte(length_) / 8); + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::CeilByte(length_) / 8); if (!equal_bitmap) { return false; } const uint8_t* this_data = raw_data_; @@ -156,9 +157,9 @@ Status PrimitiveBuilder::Append( if ((valid_bytes != nullptr) && !valid_bytes[i]) continue; if (values[i] > 0) { - util::set_bit(raw_data_, length_ + i); + BitUtil::SetBit(raw_data_, length_ + i); } else { - util::clear_bit(raw_data_, length_ + i); + BitUtil::ClearBit(raw_data_, length_ + i); } } @@ -196,20 +197,20 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const { if (null_count_ > 0) { bool equal_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); if (!equal_bitmap) { return false; } const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && util::get_bit(this_data, i) != util::get_bit(other_data, i)) { + if (!IsNull(i) && BitUtil::GetBit(this_data, i) != BitUtil::GetBit(other_data, i)) { return false; } } return true; } else { - return data_->Equals(*other.data_, util::bytes_for_bits(length_)); + return data_->Equals(*other.data_, BitUtil::BytesForBits(length_)); } } diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index f21470d96e45b..c71df584ffe3f 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -173,7 +173,7 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { // Does not capacity-check; make sure to call Reserve beforehand void UnsafeAppend(value_type val) { - util::set_bit(null_bitmap_data_, length_); + BitUtil::SetBit(null_bitmap_data_, length_); raw_data_[length_++] = val; } @@ -290,7 +290,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { const uint8_t* raw_data() const { return reinterpret_cast(raw_data_); } - bool Value(int i) const { return util::get_bit(raw_data(), i); } + bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); } }; template <> @@ -298,7 +298,7 @@ struct TypeTraits { typedef BooleanArray ArrayType; static inline int bytes_required(int elements) { - return util::bytes_for_bits(elements); + return BitUtil::BytesForBits(elements); } }; @@ -314,11 +314,11 @@ class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { // Scalar append Status Append(bool val) { Reserve(1); - util::set_bit(null_bitmap_data_, length_); + BitUtil::SetBit(null_bitmap_data_, length_); if (val) { - util::set_bit(raw_data_, length_); + BitUtil::SetBit(raw_data_, length_); } else { - util::clear_bit(raw_data_, length_); + BitUtil::ClearBit(raw_data_, length_); } ++length_; return Status::OK(); diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index e1d8a0808b41a..cfdee04f6e2ea 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -22,33 +22,33 @@ namespace arrow { TEST(UtilTests, TestIsMultipleOf64) { - using util::is_multiple_of_64; - EXPECT_TRUE(is_multiple_of_64(64)); - EXPECT_TRUE(is_multiple_of_64(0)); - EXPECT_TRUE(is_multiple_of_64(128)); - EXPECT_TRUE(is_multiple_of_64(192)); - EXPECT_FALSE(is_multiple_of_64(23)); - EXPECT_FALSE(is_multiple_of_64(32)); + using BitUtil::IsMultipleOf64; + EXPECT_TRUE(IsMultipleOf64(64)); + EXPECT_TRUE(IsMultipleOf64(0)); + EXPECT_TRUE(IsMultipleOf64(128)); + EXPECT_TRUE(IsMultipleOf64(192)); + EXPECT_FALSE(IsMultipleOf64(23)); + EXPECT_FALSE(IsMultipleOf64(32)); } TEST(UtilTests, TestNextPower2) { - using util::next_power2; + using BitUtil::NextPower2; - ASSERT_EQ(8, next_power2(6)); - ASSERT_EQ(8, next_power2(8)); + ASSERT_EQ(8, NextPower2(6)); + ASSERT_EQ(8, NextPower2(8)); - ASSERT_EQ(1, next_power2(1)); - ASSERT_EQ(256, next_power2(131)); + ASSERT_EQ(1, NextPower2(1)); + ASSERT_EQ(256, NextPower2(131)); - ASSERT_EQ(1024, next_power2(1000)); + ASSERT_EQ(1024, NextPower2(1000)); - ASSERT_EQ(4096, next_power2(4000)); + ASSERT_EQ(4096, NextPower2(4000)); - ASSERT_EQ(65536, next_power2(64000)); + ASSERT_EQ(65536, NextPower2(64000)); - ASSERT_EQ(1LL << 32, next_power2((1LL << 32) - 1)); - ASSERT_EQ(1LL << 31, next_power2((1LL << 31) - 1)); - ASSERT_EQ(1LL << 62, next_power2((1LL << 62) - 1)); + ASSERT_EQ(1LL << 32, NextPower2((1LL << 32) - 1)); + ASSERT_EQ(1LL << 31, NextPower2((1LL << 31) - 1)); + ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1)); } } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 475576e87cadd..7e1cb1867171e 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -24,20 +24,20 @@ namespace arrow { -void util::bytes_to_bits(const std::vector& bytes, uint8_t* bits) { +void BitUtil::BytesToBits(const std::vector& bytes, uint8_t* bits) { for (size_t i = 0; i < bytes.size(); ++i) { - if (bytes[i] > 0) { set_bit(bits, i); } + if (bytes[i] > 0) { SetBit(bits, i); } } } -Status util::bytes_to_bits( +Status BitUtil::BytesToBits( const std::vector& bytes, std::shared_ptr* out) { - int bit_length = util::bytes_for_bits(bytes.size()); + int bit_length = BitUtil::BytesForBits(bytes.size()); auto buffer = std::make_shared(); RETURN_NOT_OK(buffer->Resize(bit_length)); memset(buffer->mutable_data(), 0, bit_length); - bytes_to_bits(bytes, buffer->mutable_data()); + BytesToBits(bytes, buffer->mutable_data()); *out = buffer; return Status::OK(); diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index c33ef272f05e2..13b7e19593d93 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -30,39 +30,39 @@ namespace arrow { class Buffer; class Status; -namespace util { +namespace BitUtil { -static inline int64_t ceil_byte(int64_t size) { +static inline int64_t CeilByte(int64_t size) { return (size + 7) & ~7; } -static inline int64_t bytes_for_bits(int64_t size) { - return ceil_byte(size) / 8; +static inline int64_t BytesForBits(int64_t size) { + return CeilByte(size) / 8; } -static inline int64_t ceil_2bytes(int64_t size) { +static inline int64_t Ceil2Bytes(int64_t size) { return (size + 15) & ~15; } static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; -static inline bool get_bit(const uint8_t* bits, int i) { +static inline bool GetBit(const uint8_t* bits, int i) { return static_cast(bits[i / 8] & kBitmask[i % 8]); } -static inline bool bit_not_set(const uint8_t* bits, int i) { +static inline bool BitNotSet(const uint8_t* bits, int i) { return (bits[i / 8] & kBitmask[i % 8]) == 0; } -static inline void clear_bit(uint8_t* bits, int i) { +static inline void ClearBit(uint8_t* bits, int i) { bits[i / 8] &= ~kBitmask[i % 8]; } -static inline void set_bit(uint8_t* bits, int i) { +static inline void SetBit(uint8_t* bits, int i) { bits[i / 8] |= kBitmask[i % 8]; } -static inline int64_t next_power2(int64_t n) { +static inline int64_t NextPower2(int64_t n) { n--; n |= n >> 1; n |= n >> 2; @@ -74,7 +74,7 @@ static inline int64_t next_power2(int64_t n) { return n; } -static inline bool is_multiple_of_64(int64_t n) { +static inline bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; } @@ -90,11 +90,10 @@ inline int64_t RoundUpToMultipleOf64(int64_t num) { return num; } -void bytes_to_bits(const std::vector& bytes, uint8_t* bits); -ARROW_EXPORT Status bytes_to_bits(const std::vector&, std::shared_ptr*); - -} // namespace util +void BytesToBits(const std::vector& bytes, uint8_t* bits); +ARROW_EXPORT Status BytesToBits(const std::vector&, std::shared_ptr*); +} // namespace BitUtil } // namespace arrow #endif // ARROW_UTIL_BIT_UTIL_H diff --git a/cpp/src/arrow/util/buffer-test.cc b/cpp/src/arrow/util/buffer-test.cc index cc4ec98e4fb29..095b07b7ab309 100644 --- a/cpp/src/arrow/util/buffer-test.cc +++ b/cpp/src/arrow/util/buffer-test.cc @@ -31,6 +31,18 @@ namespace arrow { class TestBuffer : public ::testing::Test {}; +TEST_F(TestBuffer, IsMutableFlag) { + Buffer buf(nullptr, 0); + + ASSERT_FALSE(buf.is_mutable()); + + MutableBuffer mbuf(nullptr, 0); + ASSERT_TRUE(mbuf.is_mutable()); + + PoolBuffer pbuf; + ASSERT_TRUE(pbuf.is_mutable()); +} + TEST_F(TestBuffer, Resize) { PoolBuffer buf; @@ -96,4 +108,33 @@ TEST_F(TestBuffer, EqualsWithSameBuffer) { pool->Free(rawBuffer, bufferSize); } +TEST_F(TestBuffer, Copy) { + std::string data_str = "some data to copy"; + + auto data = reinterpret_cast(data_str.c_str()); + + Buffer buf(data, data_str.size()); + + std::shared_ptr out; + + ASSERT_OK(buf.Copy(5, 4, &out)); + + Buffer expected(data + 5, 4); + ASSERT_TRUE(out->Equals(expected)); +} + +TEST_F(TestBuffer, SliceBuffer) { + std::string data_str = "some data to slice"; + + auto data = reinterpret_cast(data_str.c_str()); + + auto buf = std::make_shared(data, data_str.size()); + + std::shared_ptr out = SliceBuffer(buf, 5, 4); + Buffer expected(data + 5, 4); + ASSERT_TRUE(out->Equals(expected)); + + ASSERT_EQ(2, buf.use_count()); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 6faa048e4e52b..a230259e5930d 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -36,6 +36,32 @@ Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, int64_t si Buffer::~Buffer() {} +Status Buffer::Copy( + int64_t start, int64_t nbytes, MemoryPool* pool, std::shared_ptr* out) const { + // Sanity checks + DCHECK_LT(start, size_); + DCHECK_LE(nbytes, size_ - start); + + auto new_buffer = std::make_shared(pool); + RETURN_NOT_OK(new_buffer->Resize(nbytes)); + + std::memcpy(new_buffer->mutable_data(), data() + start, nbytes); + + *out = new_buffer; + return Status::OK(); +} + +Status Buffer::Copy(int64_t start, int64_t nbytes, std::shared_ptr* out) const { + return Copy(start, nbytes, default_memory_pool(), out); +} + +std::shared_ptr SliceBuffer( + const std::shared_ptr& buffer, int64_t offset, int64_t length) { + DCHECK_LT(offset, buffer->size()); + DCHECK_LE(length, buffer->size() - offset); + return std::make_shared(buffer, offset, length); +} + std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } @@ -52,7 +78,7 @@ PoolBuffer::~PoolBuffer() { Status PoolBuffer::Reserve(int64_t new_capacity) { if (!mutable_data_ || new_capacity > capacity_) { uint8_t* new_data; - new_capacity = util::RoundUpToMultipleOf64(new_capacity); + new_capacity = BitUtil::RoundUpToMultipleOf64(new_capacity); if (mutable_data_) { RETURN_NOT_OK(pool_->Allocate(new_capacity, &new_data)); memcpy(new_data, mutable_data_, size_); diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index bc0df86221c45..04ad6c2dffde4 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -43,7 +43,8 @@ class Status; // The following invariant is always true: Size < Capacity class ARROW_EXPORT Buffer : public std::enable_shared_from_this { public: - Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size), capacity_(size) {} + Buffer(const uint8_t* data, int64_t size) + : is_mutable_(false), data_(data), size_(size), capacity_(size) {} virtual ~Buffer(); // An offset into data that is owned by another buffer, but we want to be @@ -57,6 +58,8 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this { std::shared_ptr get_shared_ptr() { return shared_from_this(); } + bool is_mutable() const { return is_mutable_; } + // Return true if both buffers are the same size and contain the same bytes // up to the number of compared bytes bool Equals(const Buffer& other, int64_t nbytes) const { @@ -71,18 +74,22 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this { (data_ == other.data_ || !memcmp(data_, other.data_, size_))); } + // Copy section of buffer into a new Buffer + Status Copy(int64_t start, int64_t nbytes, MemoryPool* pool, + std::shared_ptr* out) const; + + // Default memory pool + Status Copy(int64_t start, int64_t nbytes, std::shared_ptr* out) const; + int64_t capacity() const { return capacity_; } const uint8_t* data() const { return data_; } int64_t size() const { return size_; } - // Returns true if this Buffer is referencing memory (possibly) owned by some - // other buffer - bool is_shared() const { return static_cast(parent_); } - const std::shared_ptr parent() const { return parent_; } protected: + bool is_mutable_; const uint8_t* data_; int64_t size_; int64_t capacity_; @@ -94,10 +101,16 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this { DISALLOW_COPY_AND_ASSIGN(Buffer); }; +// Construct a view on passed buffer at the indicated offset and length. This +// function cannot fail and does not error checking (except in debug builds) +std::shared_ptr SliceBuffer( + const std::shared_ptr& buffer, int64_t offset, int64_t length); + // A Buffer whose contents can be mutated. May or may not own its data. class ARROW_EXPORT MutableBuffer : public Buffer { public: MutableBuffer(uint8_t* data, int64_t size) : Buffer(data, size) { + is_mutable_ = true; mutable_data_ = data; } diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 5902b8341696d..7e70be75da5fc 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -44,7 +44,7 @@ using arrow::Field; using arrow::DataType; using arrow::Status; -namespace util = arrow::util; +namespace BitUtil = arrow::BitUtil; // ---------------------------------------------------------------------- // Serialization @@ -148,7 +148,7 @@ class ArrowSerializer { } Status InitNullBitmap() { - int null_bytes = util::bytes_for_bits(length_); + int null_bytes = BitUtil::BytesForBits(length_); null_bitmap_ = std::make_shared(pool_); RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); @@ -206,7 +206,7 @@ class ArrowSerializer { PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - int nbytes = util::bytes_for_bits(length_); + int nbytes = BitUtil::BytesForBits(length_); auto data = std::make_shared(pool_); RETURN_NOT_OK(data->Resize(nbytes)); uint8_t* bitmap = data->mutable_data(); @@ -215,12 +215,12 @@ class ArrowSerializer { int64_t null_count = 0; for (int64_t i = 0; i < length_; ++i) { if (objects[i] == Py_True) { - util::set_bit(bitmap, i); - util::set_bit(null_bitmap_data_, i); + BitUtil::SetBit(bitmap, i); + BitUtil::SetBit(null_bitmap_data_, i); } else if (objects[i] != Py_False) { ++null_count; } else { - util::set_bit(null_bitmap_data_, i); + BitUtil::SetBit(null_bitmap_data_, i); } } @@ -253,7 +253,7 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap if (mask_values[i]) { ++null_count; } else { - util::set_bit(bitmap, i); + BitUtil::SetBit(bitmap, i); } } return null_count; @@ -272,7 +272,7 @@ static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) if (traits::isnull(values[i])) { ++null_count; } else { - util::set_bit(bitmap, i); + BitUtil::SetBit(bitmap, i); } } @@ -402,7 +402,7 @@ inline Status ArrowSerializer::ConvertData() { return Status::Invalid("no support for strided data yet"); } - int nbytes = util::bytes_for_bits(length_); + int nbytes = BitUtil::BytesForBits(length_); auto buffer = std::make_shared(pool_); RETURN_NOT_OK(buffer->Resize(nbytes)); @@ -413,7 +413,7 @@ inline Status ArrowSerializer::ConvertData() { memset(bitmap, 0, nbytes); for (int i = 0; i < length_; ++i) { if (values[i] > 0) { - util::set_bit(bitmap, i); + BitUtil::SetBit(bitmap, i); } }