From 98be016303241c62b52a094c09df90cdd4ed8196 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 2 Mar 2016 17:01:07 -0800 Subject: [PATCH] ARROW-20: Add null_count_ member to Array containers, remove nullable member --- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/array-test.cc | 57 ++++++++-------- cpp/src/arrow/array.cc | 11 ++-- cpp/src/arrow/array.h | 37 +++++++---- cpp/src/arrow/builder.cc | 35 +++++----- cpp/src/arrow/builder.h | 29 ++++---- cpp/src/arrow/test-util.h | 10 +++ cpp/src/arrow/type.h | 12 ++-- cpp/src/arrow/types/collection.h | 2 +- cpp/src/arrow/types/datetime.h | 12 ++-- cpp/src/arrow/types/json.h | 4 +- cpp/src/arrow/types/list-test.cc | 12 +--- cpp/src/arrow/types/list.h | 46 ++++++------- cpp/src/arrow/types/primitive-test.cc | 34 +++++----- cpp/src/arrow/types/primitive.cc | 11 ++-- cpp/src/arrow/types/primitive.h | 95 +++++++++++++++------------ cpp/src/arrow/types/string-test.cc | 31 ++++----- cpp/src/arrow/types/string.cc | 2 +- cpp/src/arrow/types/string.h | 43 ++++++------ cpp/src/arrow/types/struct-test.cc | 6 +- cpp/src/arrow/types/struct.h | 5 +- cpp/src/arrow/types/test-common.h | 4 +- cpp/src/arrow/types/union.h | 10 ++- cpp/src/arrow/util/bit-util.cc | 4 +- cpp/src/arrow/util/bit-util.h | 4 +- 25 files changed, 265 insertions(+), 253 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d2c840abfe823..f0eb73dc41371 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -92,7 +92,7 @@ endif() # For CMAKE_BUILD_TYPE=Release # -O3: Enable all compiler optimizations # -g: Enable symbols for profiler tools (TODO: remove for shipping) -set(CXX_FLAGS_DEBUG "-ggdb") +set(CXX_FLAGS_DEBUG "-ggdb -O0") set(CXX_FLAGS_FASTDEBUG "-ggdb -O1") set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG") diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 16afb9bef348c..df827aaa113aa 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -20,7 +20,6 @@ #include #include #include -#include #include #include "arrow/array.h" @@ -32,60 +31,60 @@ #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" -using std::string; -using std::vector; - namespace arrow { static TypePtr int32 = TypePtr(new Int32Type()); -static TypePtr int32_nn = TypePtr(new Int32Type(false)); - class TestArray : public ::testing::Test { public: void SetUp() { pool_ = GetDefaultMemoryPool(); - - auto data = std::make_shared(pool_); - auto nulls = std::make_shared(pool_); - - ASSERT_OK(data->Resize(400)); - ASSERT_OK(nulls->Resize(128)); - - arr_.reset(new Int32Array(100, data, nulls)); } protected: MemoryPool* pool_; - std::unique_ptr arr_; }; -TEST_F(TestArray, TestNullable) { - std::shared_ptr tmp = arr_->data(); - std::unique_ptr arr_nn(new Int32Array(100, tmp)); +TEST_F(TestArray, TestNullCount) { + auto data = std::make_shared(pool_); + auto nulls = std::make_shared(pool_); - ASSERT_TRUE(arr_->nullable()); - ASSERT_FALSE(arr_nn->nullable()); + std::unique_ptr arr(new Int32Array(100, data, 10, nulls)); + ASSERT_EQ(10, arr->null_count()); + + std::unique_ptr arr_no_nulls(new Int32Array(100, data)); + ASSERT_EQ(0, arr_no_nulls->null_count()); } TEST_F(TestArray, TestLength) { - ASSERT_EQ(arr_->length(), 100); + auto data = std::make_shared(pool_); + std::unique_ptr arr(new Int32Array(100, data)); + ASSERT_EQ(arr->length(), 100); } TEST_F(TestArray, TestIsNull) { - vector nulls = {1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 0, 1}; + std::vector nulls = {1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 0, 1}; + int32_t null_count = 0; + for (uint8_t x : nulls) { + if (x > 0) ++null_count; + } - std::shared_ptr null_buf = bytes_to_null_buffer(nulls.data(), nulls.size()); + std::shared_ptr null_buf = bytes_to_null_buffer(nulls.data(), + nulls.size()); std::unique_ptr arr; - arr.reset(new Array(int32, nulls.size(), null_buf)); + arr.reset(new Array(int32, nulls.size(), null_count, null_buf)); + + ASSERT_EQ(null_count, arr->null_count()); + ASSERT_EQ(5, null_buf->size()); + + ASSERT_TRUE(arr->nulls()->Equals(*null_buf.get())); - ASSERT_EQ(null_buf->size(), 5); for (size_t i = 0; i < nulls.size(); ++i) { ASSERT_EQ(static_cast(nulls[i]), arr->IsNull(i)); } diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 1726a2f27d82d..ee4ef66d11e26 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -17,6 +17,8 @@ #include "arrow/array.h" +#include + #include "arrow/util/buffer.h" namespace arrow { @@ -24,18 +26,17 @@ namespace arrow { // ---------------------------------------------------------------------- // Base array class -Array::Array(const TypePtr& type, int64_t length, +Array::Array(const TypePtr& type, int32_t length, int32_t null_count, const std::shared_ptr& nulls) { - Init(type, length, nulls); + Init(type, length, null_count, nulls); } -void Array::Init(const TypePtr& type, int64_t length, +void Array::Init(const TypePtr& type, int32_t length, int32_t null_count, const std::shared_ptr& nulls) { type_ = type; length_ = length; + null_count_ = null_count; nulls_ = nulls; - - nullable_ = type->nullable; if (nulls_) { null_bits_ = nulls_->data(); } diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 0eaa28d528e37..3d748c1bad6f8 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -30,38 +30,49 @@ namespace arrow { class Buffer; // Immutable data array with some logical type and some length. Any memory is -// owned by the respective Buffer instance (or its parents). May or may not be -// nullable. +// owned by the respective Buffer instance (or its parents). // -// The base class only has a null array (if the data type is nullable) +// The base class is only required to have a nulls buffer if the null count is +// greater than 0 // // Any buffers used to initialize the array have their references "stolen". If // you wish to use the buffer beyond the lifetime of the array, you need to // explicitly increment its reference count class Array { public: - Array() : length_(0), nulls_(nullptr), null_bits_(nullptr) {} - Array(const TypePtr& type, int64_t length, + Array() : + null_count_(0), + length_(0), + nulls_(nullptr), + null_bits_(nullptr) {} + + Array(const TypePtr& type, int32_t length, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr); virtual ~Array() {} - void Init(const TypePtr& type, int64_t length, const std::shared_ptr& nulls); + void Init(const TypePtr& type, int32_t length, int32_t null_count, + const std::shared_ptr& nulls); - // Determine if a slot if null. For inner loops. Does *not* boundscheck - bool IsNull(int64_t i) const { - return nullable_ && util::get_bit(null_bits_, i); + // Determine if a slot is null. For inner loops. Does *not* boundscheck + bool IsNull(int i) const { + return null_count_ > 0 && util::get_bit(null_bits_, i); } - int64_t length() const { return length_;} - bool nullable() const { return nullable_;} + int32_t length() const { return length_;} + int32_t null_count() const { return null_count_;} + const TypePtr& type() const { return type_;} TypeEnum type_enum() const { return type_->type;} + const std::shared_ptr& nulls() const { + return nulls_; + } + protected: TypePtr type_; - bool nullable_; - int64_t length_; + int32_t null_count_; + int32_t length_; std::shared_ptr nulls_; const uint8_t* null_bits_; diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index cb85067315099..ba70add155186 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -25,34 +25,29 @@ namespace arrow { -Status ArrayBuilder::Init(int64_t capacity) { +Status ArrayBuilder::Init(int32_t capacity) { capacity_ = capacity; - - if (nullable_) { - int64_t to_alloc = util::ceil_byte(capacity) / 8; - nulls_ = std::make_shared(pool_); - RETURN_NOT_OK(nulls_->Resize(to_alloc)); - null_bits_ = nulls_->mutable_data(); - memset(null_bits_, 0, to_alloc); - } + int32_t to_alloc = util::ceil_byte(capacity) / 8; + nulls_ = std::make_shared(pool_); + RETURN_NOT_OK(nulls_->Resize(to_alloc)); + null_bits_ = nulls_->mutable_data(); + memset(null_bits_, 0, to_alloc); return Status::OK(); } -Status ArrayBuilder::Resize(int64_t new_bits) { - if (nullable_) { - int64_t new_bytes = util::ceil_byte(new_bits) / 8; - int64_t old_bytes = nulls_->size(); - RETURN_NOT_OK(nulls_->Resize(new_bytes)); - null_bits_ = nulls_->mutable_data(); - if (old_bytes < new_bytes) { - memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes); - } +Status ArrayBuilder::Resize(int32_t new_bits) { + int32_t new_bytes = util::ceil_byte(new_bits) / 8; + int32_t old_bytes = nulls_->size(); + RETURN_NOT_OK(nulls_->Resize(new_bytes)); + null_bits_ = nulls_->mutable_data(); + if (old_bytes < new_bytes) { + memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes); } return Status::OK(); } -Status ArrayBuilder::Advance(int64_t elements) { - if (nullable_ && length_ + elements > capacity_) { +Status ArrayBuilder::Advance(int32_t elements) { + if (length_ + elements > capacity_) { return Status::Invalid("Builder must be expanded"); } length_ += elements; diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 456bb04ae090a..491b9133d2cca 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -32,7 +32,7 @@ class Array; class MemoryPool; class PoolBuffer; -static constexpr int64_t MIN_BUILDER_CAPACITY = 1 << 8; +static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8; // Base class for all data array builders class ArrayBuilder { @@ -40,8 +40,9 @@ class ArrayBuilder { explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) : pool_(pool), type_(type), - nullable_(type_->nullable), - nulls_(nullptr), null_bits_(nullptr), + nulls_(nullptr), + null_count_(0), + null_bits_(nullptr), length_(0), capacity_(0) {} @@ -57,21 +58,21 @@ class ArrayBuilder { return children_.size(); } - int64_t length() const { return length_;} - int64_t capacity() const { return capacity_;} - bool nullable() const { return nullable_;} + int32_t length() const { return length_;} + int32_t null_count() const { return null_count_;} + int32_t capacity() const { return capacity_;} // Allocates requires memory at this level, but children need to be // initialized independently - Status Init(int64_t capacity); + Status Init(int32_t capacity); - // Resizes the nulls array (if nullable) - Status Resize(int64_t new_bits); + // Resizes the nulls array + Status Resize(int32_t new_bits); // For cases where raw data was memcpy'd into the internal buffers, allows us // to advance the length of the builder. It is your responsibility to use // this function responsibly. - Status Advance(int64_t elements); + Status Advance(int32_t elements); const std::shared_ptr& nulls() const { return nulls_;} @@ -83,15 +84,15 @@ class ArrayBuilder { MemoryPool* pool_; TypePtr type_; - bool nullable_; - // If the type is not nullable, then null_ is nullptr after initialization + // When nulls are first appended to the builder, the null bitmap is allocated std::shared_ptr nulls_; + int32_t null_count_; uint8_t* null_bits_; // Array length, so far. Also, the index of the next element to be added - int64_t length_; - int64_t capacity_; + int32_t length_; + int32_t capacity_; // Child value array builders. These are owned by this class std::vector > children_; diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 2233a4f832a8c..0898c8e3e3aa3 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -84,6 +84,16 @@ void random_nulls(int64_t n, double pct_null, std::vector* nulls) { } } +static inline int null_count(const std::vector& nulls) { + int result = 0; + for (size_t i = 0; i < nulls.size(); ++i) { + if (nulls[i] > 0) { + ++result; + } + } + return result; +} + std::shared_ptr bytes_to_null_buffer(uint8_t* bytes, int length) { std::shared_ptr out; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 220f99f4e885a..12f19604c688d 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -57,11 +57,9 @@ struct LayoutType { // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. a // timestamp encoded as an int64) -// -// Any data type can be nullable enum class TypeEnum: char { - // A degerate NULL type represented as 0 bytes/bits + // A degenerate NULL type represented as 0 bytes/bits NA = 0, // Little-endian integer types @@ -138,14 +136,12 @@ enum class TypeEnum: char { struct DataType { TypeEnum type; - bool nullable; - explicit DataType(TypeEnum type, bool nullable = true) - : type(type), nullable(nullable) {} + explicit DataType(TypeEnum type) + : type(type) {} virtual bool Equals(const DataType* other) { - return (this == other) || (this->type == other->type && - this->nullable == other->nullable); + return this == other || this->type == other->type; } virtual std::string ToString() const = 0; diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h index 59ba61419417a..094b63f28988a 100644 --- a/cpp/src/arrow/types/collection.h +++ b/cpp/src/arrow/types/collection.h @@ -29,7 +29,7 @@ template struct CollectionType : public DataType { std::vector child_types_; - explicit CollectionType(bool nullable = true) : DataType(T, nullable) {} + CollectionType() : DataType(T) {} const TypePtr& child(int i) const { return child_types_[i]; diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index b4d62523c413a..d90883cb01871 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -31,12 +31,12 @@ struct DateType : public DataType { Unit unit; - explicit DateType(Unit unit = Unit::DAY, bool nullable = true) - : DataType(TypeEnum::DATE, nullable), + explicit DateType(Unit unit = Unit::DAY) + : DataType(TypeEnum::DATE), unit(unit) {} DateType(const DateType& other) - : DateType(other.unit, other.nullable) {} + : DateType(other.unit) {} static char const *name() { return "date"; @@ -58,12 +58,12 @@ struct TimestampType : public DataType { Unit unit; - explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true) - : DataType(TypeEnum::TIMESTAMP, nullable), + explicit TimestampType(Unit unit = Unit::MILLI) + : DataType(TypeEnum::TIMESTAMP), unit(unit) {} TimestampType(const TimestampType& other) - : TimestampType(other.unit, other.nullable) {} + : TimestampType(other.unit) {} static char const *name() { return "timestamp"; diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h index 91fd132408fe6..6c2b097a737c7 100644 --- a/cpp/src/arrow/types/json.h +++ b/cpp/src/arrow/types/json.h @@ -28,8 +28,8 @@ struct JSONScalar : public DataType { static TypePtr dense_type; static TypePtr sparse_type; - explicit JSONScalar(bool dense = true, bool nullable = true) - : DataType(TypeEnum::JSON_SCALAR, nullable), + explicit JSONScalar(bool dense = true) + : DataType(TypeEnum::JSON_SCALAR), dense(dense) {} }; diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index abfc8a31b0daa..1d9ddbe607a41 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -44,11 +44,7 @@ TEST(TypesTest, TestListType) { std::shared_ptr vt = std::make_shared(); ListType list_type(vt); - ListType list_type_nn(vt, false); - ASSERT_EQ(list_type.type, TypeEnum::LIST); - ASSERT_TRUE(list_type.nullable); - ASSERT_FALSE(list_type_nn.nullable); ASSERT_EQ(list_type.name(), string("list")); ASSERT_EQ(list_type.ToString(), string("list")); @@ -132,8 +128,8 @@ TEST_F(TestListBuilder, TestBasics) { Done(); - ASSERT_TRUE(result_->nullable()); - ASSERT_TRUE(result_->values()->nullable()); + ASSERT_EQ(1, result_->null_count()); + ASSERT_EQ(0, result_->values()->null_count()); ASSERT_EQ(3, result_->length()); vector ex_offsets = {0, 3, 3, 7}; @@ -153,10 +149,6 @@ TEST_F(TestListBuilder, TestBasics) { } } -TEST_F(TestListBuilder, TestBasicsNonNullable) { -} - - TEST_F(TestListBuilder, TestZeroLength) { // All buffers are null Done(); diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 4ca0f13d53c6f..4190b53df01cd 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -40,8 +40,8 @@ struct ListType : public DataType { // List can contain any other logical value type TypePtr value_type; - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(TypeEnum::LIST, nullable), + explicit ListType(const TypePtr& value_type) + : DataType(TypeEnum::LIST), value_type(value_type) {} static char const *name() { @@ -56,21 +56,25 @@ class ListArray : public Array { public: ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} - ListArray(const TypePtr& type, int64_t length, std::shared_ptr offsets, - const ArrayPtr& values, std::shared_ptr nulls = nullptr) { - Init(type, length, offsets, values, nulls); + ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, + const ArrayPtr& values, + int32_t null_count = 0, + std::shared_ptr nulls = nullptr) { + Init(type, length, offsets, values, null_count, nulls); } virtual ~ListArray() {} - void Init(const TypePtr& type, int64_t length, std::shared_ptr offsets, - const ArrayPtr& values, std::shared_ptr nulls = nullptr) { + void Init(const TypePtr& type, int32_t length, std::shared_ptr offsets, + const ArrayPtr& values, + int32_t null_count = 0, + std::shared_ptr nulls = nullptr) { offset_buf_ = offsets; offsets_ = offsets == nullptr? nullptr : reinterpret_cast(offset_buf_->data()); values_ = values; - Array::Init(type, length, nulls); + Array::Init(type, length, null_count, nulls); } // Return a shared pointer in case the requestor desires to share ownership @@ -108,7 +112,7 @@ class ListBuilder : public Int32Builder { value_builder_.reset(value_builder); } - Status Init(int64_t elements) { + Status Init(int32_t elements) { // One more than requested. // // XXX: This is slightly imprecise, because we might trigger null mask @@ -116,7 +120,7 @@ class ListBuilder : public Int32Builder { return Int32Builder::Init(elements + 1); } - Status Resize(int64_t capacity) { + Status Resize(int32_t capacity) { // Need space for the end offset RETURN_NOT_OK(Int32Builder::Resize(capacity + 1)); @@ -129,18 +133,15 @@ class ListBuilder : public Int32Builder { // // If passed, null_bytes is of equal length to values, and any nonzero byte // will be considered as a null for that slot - Status Append(T* values, int64_t length, uint8_t* null_bytes = nullptr) { + Status Append(T* values, int32_t length, uint8_t* null_bytes = nullptr) { if (length_ + length > capacity_) { - int64_t new_capacity = util::next_power2(length_ + length); + int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } memcpy(raw_buffer() + length_, values, length * elsize_); - if (nullable_ && null_bytes != nullptr) { - // If null_bytes is all not null, then none of the values are null - for (int i = 0; i < length; ++i) { - util::set_bit(null_bits_, length_ + i, static_cast(null_bytes[i])); - } + if (null_bytes != nullptr) { + AppendNulls(null_bytes, length); } length_ += length; @@ -159,9 +160,10 @@ class ListBuilder : public Int32Builder { raw_buffer()[length_] = child_values->length(); } - out->Init(type_, length_, values_, ArrayPtr(child_values), nulls_); + out->Init(type_, length_, values_, ArrayPtr(child_values), + null_count_, nulls_); values_ = nulls_ = nullptr; - capacity_ = length_ = 0; + capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -181,10 +183,10 @@ class ListBuilder : public Int32Builder { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - if (nullable_) { - util::set_bit(null_bits_, length_, is_null); + if (is_null) { + ++null_count_; + util::set_bit(null_bits_, length_); } - raw_buffer()[length_++] = value_builder_->length(); return Status::OK(); } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 3484294a39f9a..93634432d5ccb 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -53,15 +53,12 @@ TEST(TypesTest, TestBytesType) { #define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ TEST(TypesTest, TestPrimitive_##ENUM) { \ KLASS tp; \ - KLASS tp_nn(false); \ \ ASSERT_EQ(tp.type, TypeEnum::ENUM); \ ASSERT_EQ(tp.name(), string(NAME)); \ - ASSERT_TRUE(tp.nullable); \ - ASSERT_FALSE(tp_nn.nullable); \ \ - KLASS tp_copy = tp_nn; \ - ASSERT_FALSE(tp_copy.nullable); \ + KLASS tp_copy = tp; \ + ASSERT_EQ(tp_copy.type, TypeEnum::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); @@ -100,17 +97,16 @@ class TestPrimitiveBuilder : public TestBuilder { TestBuilder::SetUp(); type_ = Attrs::type(); - type_nn_ = Attrs::type(false); ArrayBuilder* tmp; ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_.reset(static_cast(tmp)); - ASSERT_OK(make_builder(pool_, type_nn_, &tmp)); + ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_nn_.reset(static_cast(tmp)); } - void RandomData(int64_t N, double pct_null = 0.1) { + void RandomData(int N, double pct_null = 0.1) { Attrs::draw(N, &draws_); random_nulls(N, pct_null, &nulls_); } @@ -118,28 +114,33 @@ class TestPrimitiveBuilder : public TestBuilder { void CheckNullable() { ArrayType result; ArrayType expected; - int64_t size = builder_->length(); + int size = builder_->length(); - auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), + auto ex_data = std::make_shared( + reinterpret_cast(draws_.data()), size * sizeof(T)); auto ex_nulls = bytes_to_null_buffer(nulls_.data(), size); - expected.Init(size, ex_data, ex_nulls); + int32_t ex_null_count = null_count(nulls_); + + expected.Init(size, ex_data, ex_null_count, ex_nulls); ASSERT_OK(builder_->Transfer(&result)); // Builder is now reset ASSERT_EQ(0, builder_->length()); ASSERT_EQ(0, builder_->capacity()); + ASSERT_EQ(0, builder_->null_count()); ASSERT_EQ(nullptr, builder_->buffer()); ASSERT_TRUE(result.Equals(expected)); + ASSERT_EQ(ex_null_count, result.null_count()); } void CheckNonNullable() { ArrayType result; ArrayType expected; - int64_t size = builder_nn_->length(); + int size = builder_nn_->length(); auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), size * sizeof(T)); @@ -153,6 +154,7 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(nullptr, builder_nn_->buffer()); ASSERT_TRUE(result.Equals(expected)); + ASSERT_EQ(0, result.null_count()); } protected: @@ -171,14 +173,14 @@ class TestPrimitiveBuilder : public TestBuilder { typedef CapType##Type Type; \ typedef c_type T; \ \ - static TypePtr type(bool nullable = true) { \ - return TypePtr(new Type(nullable)); \ + static TypePtr type() { \ + return TypePtr(new Type()); \ } #define PINT_DECL(CapType, c_type, LOWER, UPPER) \ struct P##CapType { \ PTYPE_DECL(CapType, c_type); \ - static void draw(int64_t N, vector* draws) { \ + static void draw(int N, vector* draws) { \ randint(N, LOWER, UPPER, draws); \ } \ } @@ -208,7 +210,7 @@ TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); TYPED_TEST(TestPrimitiveBuilder, TestInit) { DECL_T(); - int64_t n = 1000; + int n = 1000; ASSERT_OK(this->builder_->Init(n)); ASSERT_EQ(n, this->builder_->capacity()); ASSERT_EQ(n * sizeof(T), this->builder_->buffer()->size()); diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 2612e8ca7fd4a..c86260b0fc641 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -26,20 +26,23 @@ namespace arrow { // ---------------------------------------------------------------------- // Primitive array base -void PrimitiveArray::Init(const TypePtr& type, int64_t length, +void PrimitiveArray::Init(const TypePtr& type, int32_t length, const std::shared_ptr& data, + int32_t null_count, const std::shared_ptr& nulls) { - Array::Init(type, length, nulls); + Array::Init(type, length, null_count, nulls); data_ = data; raw_data_ = data == nullptr? nullptr : data_->data(); } bool PrimitiveArray::Equals(const PrimitiveArray& other) const { if (this == &other) return true; - if (type_->nullable != other.type_->nullable) return false; + if (null_count_ != other.null_count_) { + return false; + } bool equal_data = data_->Equals(*other.data_, length_); - if (type_->nullable) { + if (null_count_ > 0) { return equal_data && nulls_->Equals(*other.nulls_, util::ceil_byte(length_) / 8); } else { diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index c5ae0f78a991b..aa8f351202a20 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -36,24 +36,24 @@ class MemoryPool; template struct PrimitiveType : public DataType { - explicit PrimitiveType(bool nullable = true) - : DataType(Derived::type_enum, nullable) {} + PrimitiveType() + : DataType(Derived::type_enum) {} virtual std::string ToString() const { return std::string(static_cast(this)->name()); } }; -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr TypeEnum type_enum = TypeEnum::ENUM; \ - static constexpr int size = SIZE; \ - \ - explicit TYPENAME(bool nullable = true) \ - : PrimitiveType(nullable) {} \ - \ - static const char* name() { \ - return NAME; \ +#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ + typedef C_TYPE c_type; \ + static constexpr TypeEnum type_enum = TypeEnum::ENUM; \ + static constexpr int size = SIZE; \ + \ + TYPENAME() \ + : PrimitiveType() {} \ + \ + static const char* name() { \ + return NAME; \ } @@ -64,7 +64,9 @@ class PrimitiveArray : public Array { virtual ~PrimitiveArray() {} - void Init(const TypePtr& type, int64_t length, const std::shared_ptr& data, + void Init(const TypePtr& type, int32_t length, + const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr); const std::shared_ptr& data() const { return data_;} @@ -84,15 +86,17 @@ class PrimitiveArrayImpl : public PrimitiveArray { PrimitiveArrayImpl() : PrimitiveArray() {} - PrimitiveArrayImpl(int64_t length, const std::shared_ptr& data, + PrimitiveArrayImpl(int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { - Init(length, data, nulls); + Init(length, data, null_count, nulls); } - void Init(int64_t length, const std::shared_ptr& data, + void Init(int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { - TypePtr type(new TypeClass(nulls != nullptr)); - PrimitiveArray::Init(type, length, data, nulls); + TypePtr type(new TypeClass()); + PrimitiveArray::Init(type, length, data, null_count, nulls); } bool Equals(const PrimitiveArrayImpl& other) const { @@ -101,7 +105,7 @@ class PrimitiveArrayImpl : public PrimitiveArray { const T* raw_data() const { return reinterpret_cast(raw_data_);} - T Value(int64_t i) const { + T Value(int i) const { return raw_data()[i]; } @@ -124,7 +128,7 @@ class PrimitiveBuilder : public ArrayBuilder { virtual ~PrimitiveBuilder() {} - Status Resize(int64_t capacity) { + Status Resize(int32_t capacity) { // XXX: Set floor size for now if (capacity < MIN_BUILDER_CAPACITY) { capacity = MIN_BUILDER_CAPACITY; @@ -135,27 +139,26 @@ class PrimitiveBuilder : public ArrayBuilder { } else { RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); RETURN_NOT_OK(values_->Resize(capacity * elsize_)); - capacity_ = capacity; } + capacity_ = capacity; return Status::OK(); } - Status Init(int64_t capacity) { + Status Init(int32_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); - values_ = std::make_shared(pool_); return values_->Resize(capacity * elsize_); } - Status Reserve(int64_t elements) { + Status Reserve(int32_t elements) { if (length_ + elements > capacity_) { - int64_t new_capacity = util::next_power2(length_ + elements); + int32_t new_capacity = util::next_power2(length_ + elements); return Resize(new_capacity); } return Status::OK(); } - Status Advance(int64_t elements) { + Status Advance(int32_t elements) { return ArrayBuilder::Advance(elements); } @@ -165,8 +168,9 @@ class PrimitiveBuilder : public ArrayBuilder { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - if (nullable_) { - util::set_bit(null_bits_, length_, is_null); + if (is_null) { + ++null_count_; + util::set_bit(null_bits_, length_); } raw_buffer()[length_++] = val; return Status::OK(); @@ -176,42 +180,49 @@ class PrimitiveBuilder : public ArrayBuilder { // // If passed, null_bytes is of equal length to values, and any nonzero byte // will be considered as a null for that slot - Status Append(const T* values, int64_t length, uint8_t* null_bytes = nullptr) { + Status Append(const T* values, int32_t length, + const uint8_t* null_bytes = nullptr) { if (length_ + length > capacity_) { - int64_t new_capacity = util::next_power2(length_ + length); + int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } memcpy(raw_buffer() + length_, values, length * elsize_); - if (nullable_ && null_bytes != nullptr) { - // If null_bytes is all not null, then none of the values are null - for (int64_t i = 0; i < length; ++i) { - util::set_bit(null_bits_, length_ + i, static_cast(null_bytes[i])); - } + if (null_bytes != nullptr) { + AppendNulls(null_bytes, length); } length_ += length; return Status::OK(); } - Status AppendNull() { - if (!nullable_) { - return Status::Invalid("not nullable"); + // Write nulls as uint8_t* into pre-allocated memory + void AppendNulls(const uint8_t* null_bytes, int32_t length) { + // If null_bytes is all not null, then none of the values are null + for (int i = 0; i < length; ++i) { + if (static_cast(null_bytes[i])) { + ++null_count_; + util::set_bit(null_bits_, length_ + i); + } } + } + + Status AppendNull() { if (length_ == capacity_) { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - util::set_bit(null_bits_, length_++, true); + ++null_count_; + util::set_bit(null_bits_, length_++); return Status::OK(); } // Initialize an array type instance with the results of this builder // Transfers ownership of all buffers Status Transfer(PrimitiveArray* out) { - out->Init(type_, length_, values_, nulls_); + out->Init(type_, length_, values_, null_count_, nulls_); values_ = nulls_ = nullptr; - capacity_ = length_ = 0; + capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -236,7 +247,7 @@ class PrimitiveBuilder : public ArrayBuilder { protected: std::shared_ptr values_; - int64_t elsize_; + int elsize_; }; } // namespace arrow diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index a2d87ead59c59..e1dcebe97f013 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -39,7 +39,6 @@ TEST(TypesTest, TestCharType) { CharType t1(5); ASSERT_EQ(t1.type, TypeEnum::CHAR); - ASSERT_TRUE(t1.nullable); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.ToString(), std::string("char(5)")); @@ -47,7 +46,6 @@ TEST(TypesTest, TestCharType) { // Test copy constructor CharType t2 = t1; ASSERT_EQ(t2.type, TypeEnum::CHAR); - ASSERT_TRUE(t2.nullable); ASSERT_EQ(t2.size, 5); } @@ -56,7 +54,6 @@ TEST(TypesTest, TestVarcharType) { VarcharType t1(5); ASSERT_EQ(t1.type, TypeEnum::VARCHAR); - ASSERT_TRUE(t1.nullable); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.physical_type.size, 6); @@ -65,19 +62,14 @@ TEST(TypesTest, TestVarcharType) { // Test copy constructor VarcharType t2 = t1; ASSERT_EQ(t2.type, TypeEnum::VARCHAR); - ASSERT_TRUE(t2.nullable); ASSERT_EQ(t2.size, 5); ASSERT_EQ(t2.physical_type.size, 6); } TEST(TypesTest, TestStringType) { StringType str; - StringType str_nn(false); - ASSERT_EQ(str.type, TypeEnum::STRING); ASSERT_EQ(str.name(), std::string("string")); - ASSERT_TRUE(str.nullable); - ASSERT_FALSE(str_nn.nullable); } // ---------------------------------------------------------------------- @@ -96,7 +88,7 @@ class TestStringContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - int64_t nchars = chars_.size(); + int nchars = chars_.size(); value_buf_ = to_buffer(chars_); values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); @@ -104,7 +96,9 @@ class TestStringContainer : public ::testing::Test { offsets_buf_ = to_buffer(offsets_); nulls_buf_ = bytes_to_null_buffer(nulls_.data(), nulls_.size()); - strings_.Init(length_, offsets_buf_, values_, nulls_buf_); + null_count_ = null_count(nulls_); + + strings_.Init(length_, offsets_buf_, values_, null_count_, nulls_buf_); } protected: @@ -118,7 +112,8 @@ class TestStringContainer : public ::testing::Test { std::shared_ptr offsets_buf_; std::shared_ptr nulls_buf_; - int64_t length_; + int null_count_; + int length_; ArrayPtr values_; StringArray strings_; @@ -127,7 +122,7 @@ class TestStringContainer : public ::testing::Test { TEST_F(TestStringContainer, TestArrayBasics) { ASSERT_EQ(length_, strings_.length()); - ASSERT_TRUE(strings_.nullable()); + ASSERT_EQ(1, strings_.null_count()); } TEST_F(TestStringContainer, TestType) { @@ -149,7 +144,8 @@ TEST_F(TestStringContainer, TestListFunctions) { TEST_F(TestStringContainer, TestDestructor) { - auto arr = std::make_shared(length_, offsets_buf_, values_, nulls_buf_); + auto arr = std::make_shared(length_, offsets_buf_, values_, + null_count_, nulls_buf_); } TEST_F(TestStringContainer, TestGetString) { @@ -189,10 +185,6 @@ class TestStringBuilder : public TestBuilder { std::unique_ptr result_; }; -TEST_F(TestStringBuilder, TestAttrs) { - ASSERT_FALSE(builder_->value_builder()->nullable()); -} - TEST_F(TestStringBuilder, TestScalarAppend) { std::vector strings = {"a", "bb", "", "", "ccc"}; std::vector is_null = {0, 0, 0, 1, 0}; @@ -212,10 +204,11 @@ TEST_F(TestStringBuilder, TestScalarAppend) { Done(); ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps * null_count(is_null), result_->null_count()); ASSERT_EQ(reps * 6, result_->values()->length()); - int64_t length; - int64_t pos = 0; + int32_t length; + int32_t pos = 0; for (int i = 0; i < N * reps; ++i) { if (is_null[i % N]) { ASSERT_TRUE(result_->IsNull(i)); diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index f3dfbdc50f7a4..dea42e102b0d0 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -35,6 +35,6 @@ std::string VarcharType::ToString() const { return s.str(); } -TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type(false)); +TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type()); } // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index d0690d9a7d2a4..084562530a8fc 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -40,13 +40,13 @@ struct CharType : public DataType { BytesType physical_type; - explicit CharType(int size, bool nullable = true) - : DataType(TypeEnum::CHAR, nullable), + explicit CharType(int size) + : DataType(TypeEnum::CHAR), size(size), physical_type(BytesType(size)) {} CharType(const CharType& other) - : CharType(other.size, other.nullable) {} + : CharType(other.size) {} virtual std::string ToString() const; }; @@ -58,12 +58,12 @@ struct VarcharType : public DataType { BytesType physical_type; - explicit VarcharType(int size, bool nullable = true) - : DataType(TypeEnum::VARCHAR, nullable), + explicit VarcharType(int size) + : DataType(TypeEnum::VARCHAR), size(size), physical_type(BytesType(size + 1)) {} VarcharType(const VarcharType& other) - : VarcharType(other.size, other.nullable) {} + : VarcharType(other.size) {} virtual std::string ToString() const; }; @@ -73,11 +73,11 @@ static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); // String is a logical type consisting of a physical list of 1-byte values struct StringType : public DataType { - explicit StringType(bool nullable = true) - : DataType(TypeEnum::STRING, nullable) {} + StringType() + : DataType(TypeEnum::STRING) {} StringType(const StringType& other) - : StringType(other.nullable) {} + : StringType() {} const LayoutPtr& physical_type() { return physical_string; @@ -98,17 +98,19 @@ class StringArray : public ListArray { public: StringArray() : ListArray(), bytes_(nullptr), raw_bytes_(nullptr) {} - StringArray(int64_t length, const std::shared_ptr& offsets, + StringArray(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { - Init(length, offsets, values, nulls); + Init(length, offsets, values, null_count, nulls); } - void Init(const TypePtr& type, int64_t length, + void Init(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { - ListArray::Init(type, length, offsets, values, nulls); + ListArray::Init(type, length, offsets, values, null_count, nulls); // TODO: type validation for values array @@ -117,23 +119,24 @@ class StringArray : public ListArray { raw_bytes_ = bytes_->raw_data(); } - void Init(int64_t length, const std::shared_ptr& offsets, + void Init(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { - TypePtr type(new StringType(nulls != nullptr)); - Init(type, length, offsets, values, nulls); + TypePtr type(new StringType()); + Init(type, length, offsets, values, null_count, nulls); } // Compute the pointer t - const uint8_t* GetValue(int64_t i, int64_t* out_length) const { + const uint8_t* GetValue(int i, int32_t* out_length) const { int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; return raw_bytes_ + pos; } // Construct a std::string - std::string GetString(int64_t i) const { - int64_t nchars; + std::string GetString(int i) const { + int32_t nchars; const uint8_t* str = GetValue(i, &nchars); return std::string(reinterpret_cast(str), nchars); } @@ -161,7 +164,7 @@ class StringBuilder : public ListBuilder { value.size()); } - Status Append(const uint8_t* value, int64_t length); + Status Append(const uint8_t* value, int32_t length); Status Append(const std::vector& values, uint8_t* null_bytes); diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 644b5457d5851..1a9fc6be4a5ce 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -43,11 +43,7 @@ TEST(TestStructType, Basics) { vector fields = {f0, f1, f2}; - StructType struct_type(fields, true); - StructType struct_type_nn(fields, false); - - ASSERT_TRUE(struct_type.nullable); - ASSERT_FALSE(struct_type_nn.nullable); + StructType struct_type(fields); ASSERT_TRUE(struct_type.field(0).Equals(f0)); ASSERT_TRUE(struct_type.field(1).Equals(f1)); diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 7d8885b830dba..afba19a7e4699 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -29,9 +29,8 @@ namespace arrow { struct StructType : public DataType { std::vector fields_; - StructType(const std::vector& fields, - bool nullable = true) - : DataType(TypeEnum::STRUCT, nullable) { + explicit StructType(const std::vector& fields) + : DataType(TypeEnum::STRUCT) { fields_ = fields; } diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 3ecb0dec7c04a..1744efce7d631 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -36,15 +36,13 @@ class TestBuilder : public ::testing::Test { void SetUp() { pool_ = GetDefaultMemoryPool(); type_ = TypePtr(new UInt8Type()); - type_nn_ = TypePtr(new UInt8Type(false)); builder_.reset(new UInt8Builder(pool_, type_)); - builder_nn_.reset(new UInt8Builder(pool_, type_nn_)); + builder_nn_.reset(new UInt8Builder(pool_, type_)); } protected: MemoryPool* pool_; TypePtr type_; - TypePtr type_nn_; unique_ptr builder_; unique_ptr builder_nn_; }; diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index 7b66c3b88bf3c..62a3d1c10355d 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -33,9 +33,8 @@ class Buffer; struct DenseUnionType : public CollectionType { typedef CollectionType Base; - DenseUnionType(const std::vector& child_types, - bool nullable = true) - : Base(nullable) { + explicit DenseUnionType(const std::vector& child_types) : + Base() { child_types_ = child_types; } @@ -46,9 +45,8 @@ struct DenseUnionType : public CollectionType { struct SparseUnionType : public CollectionType { typedef CollectionType Base; - SparseUnionType(const std::vector& child_types, - bool nullable = true) - : Base(nullable) { + explicit SparseUnionType(const std::vector& child_types) : + Base() { child_types_ = child_types; } diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index dbac0a42527be..292cb33887ffa 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -25,7 +25,9 @@ namespace arrow { void util::bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits) { for (int i = 0; i < length; ++i) { - set_bit(bits, i, static_cast(bytes[i])); + if (static_cast(bytes[i])) { + set_bit(bits, i); + } } } diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 9ae6127c5ea9c..841f617a3139c 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -41,8 +41,8 @@ static inline bool get_bit(const uint8_t* bits, int i) { return bits[i / 8] & (1 << (i % 8)); } -static inline void set_bit(uint8_t* bits, int i, bool is_set) { - bits[i / 8] |= (1 << (i % 8)) * is_set; +static inline void set_bit(uint8_t* bits, int i) { + bits[i / 8] |= 1 << (i % 8); } static inline int64_t next_power2(int64_t n) {