From 612f37120d131240a753804cd4e721c72f5b1964 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 18 Sep 2023 12:10:43 -0400 Subject: [PATCH] GH-37710: [C++][Integration] Add C++ Utf8View implementation --- cpp/src/arrow/array/array_base.cc | 4 + cpp/src/arrow/array/array_binary.cc | 32 ++ cpp/src/arrow/array/array_binary.h | 70 +++++ cpp/src/arrow/array/array_binary_test.cc | 224 ++++++++++++-- cpp/src/arrow/array/array_test.cc | 8 +- cpp/src/arrow/array/array_view_test.cc | 32 ++ cpp/src/arrow/array/assume_aligned.h | 40 +++ cpp/src/arrow/array/builder_base.cc | 17 +- cpp/src/arrow/array/builder_binary.cc | 76 ++++- cpp/src/arrow/array/builder_binary.h | 243 ++++++++++++++- cpp/src/arrow/array/builder_dict.cc | 6 + cpp/src/arrow/array/builder_dict.h | 10 + cpp/src/arrow/array/concatenate.cc | 39 +++ cpp/src/arrow/array/concatenate_test.cc | 23 ++ cpp/src/arrow/array/data.cc | 46 ++- cpp/src/arrow/array/data.h | 8 +- cpp/src/arrow/array/dict_internal.h | 31 ++ cpp/src/arrow/array/util.cc | 286 +++++++++++++++++- cpp/src/arrow/array/util.h | 22 ++ cpp/src/arrow/array/validate.cc | 139 +++++++-- cpp/src/arrow/buffer_builder.h | 7 +- cpp/src/arrow/builder.cc | 14 +- cpp/src/arrow/compare.cc | 40 ++- cpp/src/arrow/compute/kernels/vector_hash.cc | 95 ++---- .../engine/substrait/expression_internal.cc | 9 + .../arrow/engine/substrait/type_internal.cc | 7 + cpp/src/arrow/integration/json_internal.cc | 200 ++++++++++-- cpp/src/arrow/ipc/feather.cc | 4 +- cpp/src/arrow/ipc/feather_test.cc | 6 +- cpp/src/arrow/ipc/json_simple.cc | 4 + cpp/src/arrow/ipc/json_simple_test.cc | 10 +- cpp/src/arrow/ipc/metadata_internal.cc | 48 ++- cpp/src/arrow/ipc/metadata_internal.h | 6 +- cpp/src/arrow/ipc/read_write_test.cc | 91 +++--- cpp/src/arrow/ipc/reader.cc | 36 ++- cpp/src/arrow/ipc/test_common.cc | 53 ++-- cpp/src/arrow/ipc/test_common.h | 2 +- cpp/src/arrow/ipc/writer.cc | 24 +- cpp/src/arrow/ipc/writer.h | 1 + cpp/src/arrow/json/converter.cc | 2 + 
cpp/src/arrow/json/test_common.h | 10 +- cpp/src/arrow/scalar.cc | 19 +- cpp/src/arrow/scalar.h | 43 ++- cpp/src/arrow/testing/gtest_util.h | 7 + cpp/src/arrow/testing/random.cc | 31 +- cpp/src/arrow/testing/random.h | 16 + cpp/src/arrow/testing/random_test.cc | 1 + cpp/src/arrow/type.cc | 40 ++- cpp/src/arrow/type.h | 106 ++++++- cpp/src/arrow/type_fwd.h | 21 ++ cpp/src/arrow/type_test.cc | 12 + cpp/src/arrow/type_traits.cc | 2 + cpp/src/arrow/type_traits.h | 53 +++- cpp/src/arrow/util/binary_view_util.h | 207 +++++++++++++ cpp/src/arrow/util/sort.h | 21 +- cpp/src/arrow/util/string.cc | 10 + cpp/src/arrow/util/string.h | 4 +- cpp/src/arrow/visit_data_inline.h | 58 +++- cpp/src/arrow/visitor.cc | 8 +- cpp/src/arrow/visitor.h | 6 + cpp/src/arrow/visitor_generate.h | 2 + cpp/src/generated/File_generated.h | 10 +- cpp/src/generated/Message_generated.h | 45 ++- cpp/src/generated/Schema_generated.h | 157 +++++++++- cpp/src/generated/SparseTensor_generated.h | 18 ++ cpp/src/generated/Tensor_generated.h | 16 + cpp/src/generated/feather_generated.h | 7 + cpp/src/parquet/column_writer.cc | 2 + 68 files changed, 2604 insertions(+), 343 deletions(-) create mode 100644 cpp/src/arrow/array/assume_aligned.h create mode 100644 cpp/src/arrow/util/binary_view_util.h diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index f7b8d7954e1cf..a97332ae8e200 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -87,6 +87,10 @@ struct ScalarFromArraySlotImpl { return Finish(a.GetString(index_)); } + Status Visit(const BinaryViewArray& a) { + return Finish(a.GetString(index_)); + } + Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); } diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index 9466b5a48f9d7..490164507e3b8 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ 
b/cpp/src/arrow/array/array_binary.cc @@ -24,6 +24,7 @@ #include "arrow/array/validate.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -89,6 +90,37 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } +BinaryViewArray::BinaryViewArray(const std::shared_ptr& data) { + ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); + SetData(data); +} + +BinaryViewArray::BinaryViewArray(int64_t length, std::shared_ptr headers, + BufferVector char_buffers, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) + : PrimitiveArray(binary_view(), length, std::move(headers), std::move(null_bitmap), + null_count, offset) { + data_->buffers.resize(char_buffers.size() + 2); + std::move(char_buffers.begin(), char_buffers.end(), data_->buffers.begin() + 2); +} + +std::string_view BinaryViewArray::GetView(int64_t i) const { + const auto& s = raw_values()[i]; + if (raw_pointers_) { + return util::FromRawPointerBinaryView(s); + } + const std::shared_ptr* data_buffers = data_->buffers.data() + 2; + return util::FromIndexOffsetBinaryView(s, data_buffers); +} + +StringViewArray::StringViewArray(const std::shared_ptr& data) { + ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); + SetData(data); +} + +Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } + FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& data) { SetData(data); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 7e58a96ff841a..dfa56bf8e7e28 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -217,6 +218,75 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { Status ValidateUTF8() const; }; 
+// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// BinaryViewType::c_type struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public PrimitiveArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator; + using c_type = BinaryViewType::c_type; + + explicit BinaryViewArray(const std::shared_ptr& data); + + BinaryViewArray(int64_t length, std::shared_ptr headers, + BufferVector char_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const c_type* raw_values() const { + return reinterpret_cast(raw_values_) + data_->offset; + } + + // For API compatibility with BinaryArray etc. + std::string_view GetView(int64_t i) const; + std::string GetString(int64_t i) const { return std::string{GetView(i)}; } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + bool has_raw_pointers() const { return raw_pointers_; } + + protected: + using PrimitiveArray::PrimitiveArray; + + void SetData(const std::shared_ptr& data) { + PrimitiveArray::SetData(data); + raw_pointers_ = + internal::checked_cast(*type()).has_raw_pointers(); + } + + bool raw_pointers_ = false; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// BinaryViewType::c_type to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(const std::shared_ptr& data); + + StringViewArray(int64_t length, std::shared_ptr data, BufferVector char_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, 
int64_t offset = 0) + : BinaryViewArray(length, std::move(data), std::move(char_buffers), + std::move(null_bitmap), null_count, offset) { + data_->type = utf8_view(); + } + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 3bc9bb91a022a..e527115e13fc4 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -27,17 +27,21 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" +#include "arrow/array/validate.h" #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -365,38 +369,212 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); } +// Produce an Array of index/offset views from a std::vector of index/offset +// BinaryViewType::c_type +Result> MakeBinaryViewArray( + BufferVector char_buffers, const std::vector& headers, + bool validate = true) { + auto length = static_cast(headers.size()); + auto headers_buf = Buffer::FromVector(headers); + auto arr = std::make_shared(length, std::move(headers_buf), + std::move(char_buffers)); + if (validate) { + RETURN_NOT_OK(arr->ValidateFull()); + } + return arr; +} + +// Produce an Array of 
index/offset views from a std::vector of raw pointer +// BinaryViewType::c_type +Result> MakeBinaryViewArrayFromRaw( + BufferVector char_buffers, const std::vector& raw, + bool validate = true) { + auto raw_buf = Buffer::FromVector(raw); + StringViewArray raw_arr{static_cast(raw.size()), std::move(raw_buf), + char_buffers}; + raw_arr.data()->type = utf8_view(/*has_raw_pointers=*/true); + + ARROW_ASSIGN_OR_RAISE(auto io_buf, + AllocateBuffer(raw.size() * sizeof(BinaryViewType::c_type))); + RETURN_NOT_OK(internal::SwapStringHeaderPointers( + *raw_arr.data(), io_buf->mutable_data_as())); + + auto arr = std::make_shared(raw.size(), std::move(io_buf), + std::move(char_buffers)); + if (validate) { + RETURN_NOT_OK(arr->ValidateFull()); + } + return arr; +} + +TEST(StringViewArray, Validate) { + // Since this is a test of validation, we need to be able to construct invalid arrays. + auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + // empty array is valid + EXPECT_THAT(MakeBinaryViewArray({}, {}), Ok()); + + // empty array with some character buffers is valid + EXPECT_THAT(MakeBinaryViewArray({buffer_s, buffer_y}, {}), Ok()); + + // inline views need not have a corresponding buffer + EXPECT_THAT(MakeBinaryViewArray({}, + { + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + util::ToInlineBinaryView("inline me"), + }), + Ok()); + + // non-inline views are expected to reference only buffers managed by the array + EXPECT_THAT(MakeBinaryViewArray( + {buffer_s, buffer_y}, + {util::ToIndexOffsetBinaryView("supe", buffer_s->size(), 0, 0), + util::ToIndexOffsetBinaryView("yyyy", buffer_y->size(), 1, 0)}), + Ok()); + + // views may not reference char buffers not present in the array + EXPECT_THAT(MakeBinaryViewArray( + {}, {util::ToIndexOffsetBinaryView("supe", buffer_s->size(), 0, 0)}), + Raises(StatusCode::IndexError)); + // ... 
or ranges which overflow the referenced char buffer + EXPECT_THAT(MakeBinaryViewArray({buffer_s}, {util::ToIndexOffsetBinaryView( + "supe", buffer_s->size() + 50, 0, 0)}), + Raises(StatusCode::IndexError)); + + // Additionally, the prefixes of non-inline views must match the character buffer + EXPECT_THAT(MakeBinaryViewArray( + {buffer_s, buffer_y}, + {util::ToIndexOffsetBinaryView("SUPE", buffer_s->size(), 0, 0), + util::ToIndexOffsetBinaryView("yyyy", buffer_y->size(), 1, 0)}), + Raises(StatusCode::Invalid)); + + // Invalid string views which are masked by a null bit do not cause validation to fail + auto invalid_but_masked = + MakeBinaryViewArray({buffer_s}, + {util::ToIndexOffsetBinaryView("SUPE", buffer_s->size(), 0, 0), + util::ToIndexOffsetBinaryView("yyyy", 50, 40, 30)}, + /*validate=*/false) + .ValueOrDie() + ->data(); + invalid_but_masked->null_count = 2; + invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2); + EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok()); + + // overlapping views are allowed + EXPECT_THAT(MakeBinaryViewArray( + {buffer_s}, + { + util::ToIndexOffsetBinaryView("supe", buffer_s->size(), 0, 0), + util::ToIndexOffsetBinaryView("uper", buffer_s->size() - 1, 0, 1), + util::ToIndexOffsetBinaryView("perc", buffer_s->size() - 2, 0, 2), + util::ToIndexOffsetBinaryView("erca", buffer_s->size() - 3, 0, 3), + }), + Ok()); +} + +TEST(StringViewArray, BinaryViewArrayFromRawPointerViews) { + auto Roundtrip = [&](Result> maybe_arr) { + ARROW_ASSIGN_OR_RAISE(auto arr, maybe_arr); + + std::vector raw(arr->length()); + RETURN_NOT_OK(internal::SwapStringHeaderPointers(*arr->data(), raw.data())); + for (size_t i = 0; i < raw.size(); ++i) { + if (util::FromRawPointerBinaryView(raw[i]) != arr->GetView(i)) { + return Status::Invalid("Produced incorrect raw pointer headers"); + } + } + + BufferVector char_buffers{arr->data()->buffers.begin() + 2, + arr->data()->buffers.end()}; + ARROW_ASSIGN_OR_RAISE(auto round_tripped, + 
MakeBinaryViewArrayFromRaw(std::move(char_buffers), raw)); + + if (round_tripped->Equals(arr)) { + return Status::OK(); + } + return Status::Invalid("not equal"); + }; + + EXPECT_THAT(Roundtrip(MakeBinaryViewArray({}, + { + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + util::ToInlineBinaryView("inline me"), + })), + Ok()); + + auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + EXPECT_THAT(Roundtrip(MakeBinaryViewArray( + {buffer_s, buffer_y}, + { + util::ToIndexOffsetBinaryView("supe", buffer_s->size(), 0, 0), + util::ToInlineBinaryView("hello"), + util::ToIndexOffsetBinaryView("yyyy", buffer_y->size(), 1, 0), + util::ToInlineBinaryView("world"), + util::ToIndexOffsetBinaryView("uper", buffer_s->size() - 1, 0, 1), + })), + Ok()); + + // use a larger number of buffers to test the binary search case + BufferVector buffers; + std::vector headers; + for (size_t i = 0; i < 40; ++i) { + buffers.push_back(Buffer::FromString(std::string(13, 'c'))); + headers.push_back(util::ToIndexOffsetBinaryView("cccc", 13, i, 0)); + } + EXPECT_THAT(Roundtrip(MakeBinaryViewArray(buffers, headers)), Ok()); + + EXPECT_THAT( + MakeBinaryViewArrayFromRaw( + {buffer_s, buffer_y}, + { + util::ToRawPointerBinaryView("not inlined, outside buffers"), + }), + Raises(StatusCode::IndexError, + testing::HasSubstr("pointed outside the provided character buffers"))); +} + template class TestUTF8Array : public ::testing::Test { public: using TypeClass = T; - using offset_type = typename TypeClass::offset_type; using ArrayType = typename TypeTraits::ArrayType; - Status ValidateUTF8(int64_t length, std::vector offsets, - std::string_view data, int64_t offset = 0) { - ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared(data), - /*null_bitmap=*/nullptr, /*null_count=*/0, offset); - return arr.ValidateUTF8(); + std::shared_ptr type() const { + if constexpr 
(is_binary_view_like_type::value) { + return TypeClass::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } } - Status ValidateUTF8(const std::string& json) { - auto ty = TypeTraits::type_singleton(); - auto arr = ArrayFromJSON(ty, json); - return checked_cast(*arr).ValidateUTF8(); + Status ValidateUTF8(const Array& arr) { + return checked_cast(arr).ValidateUTF8(); + } + + Status ValidateUTF8(std::vector values) { + std::shared_ptr arr; + ArrayFromVector(type(), values, &arr); + return ValidateUTF8(*arr); } void TestValidateUTF8() { - ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])")); - ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff + ASSERT_OK( + ValidateUTF8(*ArrayFromJSON(type(), R"(["Voix", "ambiguë", "d’un", "cœur"])"))); + ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff - ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4")); + ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"})); // More tests in TestValidateData() above // (ValidateFull() calls ValidateUTF8() internally) } }; -TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes); +TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes); TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); } @@ -880,14 +1058,24 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { public: using TypeClass = T; - void SetUp() override { type_ = TypeTraits::type_singleton(); } + void SetUp() override { + if constexpr (is_binary_view_like_type::value) { + type_ = TypeClass::is_utf8 ? 
utf8_view() : binary_view(); + } else { + type_ = TypeTraits::type_singleton(); + } + } void TestBasics() { - auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])"); + auto array = ArrayFromJSON( + type_, + R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])"); BinaryAppender appender; ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); - ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"})); + ASSERT_THAT(appender.data, + ::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me", + "allocate_me_aaaaa", "allocate_me_bbbb"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning } @@ -904,7 +1092,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes); +TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes); TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 0b82a82fbdb26..892f534f696b6 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -382,10 +382,12 @@ static std::vector> TestArrayUtilitiesAgainstTheseType float64(), binary(), large_binary(), + binary_view(), fixed_size_binary(3), decimal(16, 4), utf8(), large_utf8(), + utf8_view(), list(utf8()), list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull large_list(large_utf8()), @@ -596,12 +598,15 @@ static ScalarVector GetScalars() { std::make_shared(60, duration(TimeUnit::SECOND)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), std::make_shared(Decimal256(10), decimal(76, 38)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), + 
std::make_shared(Buffer::FromString("long string; not inlined")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), @@ -642,13 +647,14 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { for (int64_t length : {16}) { for (auto scalar : scalars) { + ARROW_SCOPED_TRACE("scalar type: ", scalar->type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, length)); ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), 0); // test case for ARROW-13321 - for (int64_t i : std::vector{0, length / 2, length - 1}) { + for (int64_t i : {int64_t{0}, length / 2, length - 1}) { ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i)); AssertScalarsEqual(*s, *scalar, /*verbose=*/true); } diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index 07dc3014e4029..97110ea97f3fc 100644 --- a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -126,6 +126,38 @@ TEST(TestArrayView, StringAsBinary) { CheckView(expected, arr); } +TEST(TestArrayView, StringViewAsBinaryView) { + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = ArrayFromJSON(utf8_view(), json); + auto expected = ArrayFromJSON(binary_view(), json); + CheckView(arr, expected); + CheckView(expected, arr); + } +} + +TEST(TestArrayView, StringViewAsBinaryViewInStruct) { + auto padl = ArrayFromJSON(list(int16()), "[[0, -1], [], [42]]"); + auto padr = ArrayFromJSON(utf8(), R"(["foox", "barz", null])"); + + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = + StructArray::Make({padl, ArrayFromJSON(utf8_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + auto expected = + StructArray::Make({padl, ArrayFromJSON(binary_view(), json), padr}, {"", 
"", ""}) + .ValueOrDie(); + + CheckView(arr, expected); + CheckView(expected, arr); + } +} + TEST(TestArrayView, PrimitiveWrongSize) { auto arr = ArrayFromJSON(int16(), "[0, -1, 42]"); CheckViewFails(arr, int8()); diff --git a/cpp/src/arrow/array/assume_aligned.h b/cpp/src/arrow/array/assume_aligned.h new file mode 100644 index 0000000000000..d8c1646f3137a --- /dev/null +++ b/cpp/src/arrow/array/assume_aligned.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/type.h" + +namespace arrow::util { + +template +[[nodiscard]] constexpr P* AssumeAligned(P* ptr) { +#if __has_builtin(__builtin_assume_aligned) + return static_cast(__builtin_assume_aligned(ptr, N)); +#else + return ptr; +#endif +} + +template +[[nodiscard]] constexpr P* AssumeAlignedAs(P* ptr) { + return AssumeAligned(ptr); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 3000aea3e189a..76f0d9fc3f679 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -96,10 +96,7 @@ namespace { template struct AppendScalarImpl { template - enable_if_t::value || is_decimal_type::value || - is_fixed_size_binary_type::value, - Status> - Visit(const T&) { + Status HandleFixedWidth(const T&) { auto builder = checked_cast::BuilderType*>(builder_); RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); @@ -117,7 +114,17 @@ struct AppendScalarImpl { } template - enable_if_base_binary Visit(const T&) { + enable_if_t::value, Status> Visit(const T& t) { + return HandleFixedWidth(t); + } + + Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); } + + template + enable_if_t::value || is_string_like_type::value, Status> + Visit(const T&) { int64_t data_size = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { const auto& scalar = checked_cast::ScalarType&>(*it); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 571f450aab9c1..d0ae88851fd2f 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -40,6 +40,72 @@ namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// 
Binary/StringView +BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : BinaryViewBuilder(pool) { + ARROW_CHECK(!checked_cast(*type).has_raw_pointers()); +} + +Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) { + auto bitmap = array.GetValues(0, 0); + auto values = array.GetValues(1) + offset; + + int64_t out_of_line_total = 0; + for (int64_t i = 0; i < length; i++) { + if (!values[i].is_inline()) { + out_of_line_total += static_cast(values[i].size()); + } + } + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + + for (int64_t i = 0; i < length; i++) { + if (bitmap && !bit_util::GetBit(bitmap, array.offset + offset + i)) { + UnsafeAppendNull(); + continue; + } + + if (checked_cast(array.type)->has_raw_pointers()) { + UnsafeAppend(util::FromRawPointerBinaryView(values[i])); + } else { + UnsafeAppend(util::FromIndexOffsetBinaryView( + values[i], array.buffers[2].data_as>())); + } + } + return Status::OK(); +} + +Status BinaryViewBuilder::FinishInternal(std::shared_ptr* out) { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + BufferVector buffers = {null_bitmap, data}; + for (auto&& buffer : data_heap_builder_.Finish()) { + buffers.push_back(std::move(buffer)); + } + *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_); + capacity_ = length_ = null_count_ = 0; + Reset(); + return Status::OK(); +} + +Status BinaryViewBuilder::ReserveData(int64_t length) { + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 4GB"); + } + return data_heap_builder_.Reserve(length); +} + +void BinaryViewBuilder::Reset() { + ArrayBuilder::Reset(); + data_builder_.Reset(); + data_heap_builder_.Reset(); +} + // 
---------------------------------------------------------------------- // Fixed width binary @@ -125,8 +191,8 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { std::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { const uint8_t* data_ptr = byte_builder_.data(); - return std::string_view(reinterpret_cast(data_ptr + i * byte_width_), - byte_width_); + return {reinterpret_cast(data_ptr + i * byte_width_), + static_cast(byte_width_)}; } // ---------------------------------------------------------------------- @@ -173,10 +239,10 @@ Status ChunkedStringBuilder::Finish(ArrayVector* out) { RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); // Change data type to string/utf8 - for (size_t i = 0; i < out->size(); ++i) { - std::shared_ptr data = (*out)[i]->data(); + for (auto& chunk : *out) { + std::shared_ptr data = chunk->data()->Copy(); data->type = ::arrow::utf8(); - (*out)[i] = std::make_shared(data); + chunk = std::make_shared(std::move(data)); } return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index b0c4fe2fc81fd..6a055137a2ef8 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -36,6 +36,7 @@ #include "arrow/buffer_builder.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -204,10 +205,10 @@ class BaseBinaryBuilder } } } else { - for (std::size_t i = 0; i < values.size(); ++i) { + for (const auto& value : values) { UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend( - reinterpret_cast(values[i].data()), values[i].size()); + value_data_builder_.UnsafeAppend(reinterpret_cast(value.data()), + value.size()); } } @@ -463,6 +464,238 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { std::shared_ptr type() const override { return large_utf8(); } }; +// 
---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// These builders do not support building raw pointer string view arrays. + +namespace internal { + +// We allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continue filling underfull blocks +// after encountering a large string that required allocating a new block. +class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 1 << 20; // 1MB + + StringHeapBuilder(MemoryPool* pool, int64_t alignment, + int64_t blocksize = kDefaultBlocksize) + : pool_(pool), blocksize_(blocksize), alignment_(alignment) {} + + void UnsafeAppend(BinaryViewType::c_type* v) { + memcpy(current_out_buffer_, v->raw.data, v->size()); + v->io = {v->size(), v->raw.prefix, static_cast(blocks_.size() - 1), + current_offset_}; + current_out_buffer_ += v->size(); + current_remaining_bytes_ -= v->size(); + current_offset_ += static_cast(v->size()); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (num_bytes > current_remaining_bytes_) { + // Ensure the buffer is fully overwritten to avoid leaking uninitialized + // bytes from the allocator + if (current_remaining_bytes_ > 0) { + std::memset(current_out_buffer_, 0, current_remaining_bytes_); + blocks_.back() = SliceBuffer(blocks_.back(), 0, + blocks_.back()->size() - current_remaining_bytes_); + } + current_remaining_bytes_ = num_bytes > blocksize_ ? 
num_bytes : blocksize_; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, + AllocateBuffer(current_remaining_bytes_, alignment_, pool_)); + current_offset_ = 0; + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + std::vector> Finish() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + MemoryPool* pool_; + const int64_t blocksize_; + int64_t alignment_; + std::vector> blocks_; + + uint32_t current_offset_ = 0; + uint8_t* current_out_buffer_ = NULLPTR; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool); + + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool, alignment) {} + + int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 4GB"); + } + auto s = util::ToRawPointerBinaryView(value, length); + if (!s.is_inline()) { + ARROW_RETURN_NOT_OK(ReserveData(length)); + } + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(s); + return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return 
Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 4GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + UnsafeAppend(util::ToRawPointerBinaryView(value, length)); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element (length-0 inline string) + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + 
data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override { return binary_view(); } + + protected: + void UnsafeAppend(BinaryViewType::c_type value) { + UnsafeAppendToBitmap(true); + if (!value.is_inline()) { + // String is stored out-of-line + data_heap_builder_.UnsafeAppend(&value); + } + data_builder_.UnsafeAppend(value); + } + + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits::max(); + } + + TypedBufferBuilder data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr type() const override { return utf8_view(); } +}; + // ---------------------------------------------------------------------- // FixedSizeBinaryBuilder @@ -498,7 +731,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Append(const Buffer& s) { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(std::string_view(s)); + UnsafeAppend(s); return Status::OK(); } @@ -549,7 +782,7 @@ class ARROW_EXPORT 
FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } - void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view(s)); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); } void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 061fb600412fd..c99a6faceebf1 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -193,6 +193,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu return impl_->GetOrInsert(value, out); } +Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value, + int32_t* out) { + // Create BinaryArray dictionary for now + return impl_->GetOrInsert(value, out); +} + Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert(value, out); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf309915b..25e39d8db04d2 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -60,6 +60,12 @@ struct DictionaryValue> { BinaryType, LargeBinaryType>::type; }; +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + template struct DictionaryValue> { using type = std::string_view; @@ -115,6 +121,10 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + // TODO: Consider working BinaryViewType::c_type throughout the hashing machinery to + // benefit from faster comparisons, reduced need to allocate memory + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); + class DictionaryMemoTableImpl; std::unique_ptr impl_; 
}; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index f7549fa9d1d1a..d87703d0d8a8f 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -230,6 +230,41 @@ class ConcatenateImpl { return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + for (const auto& in_data : in_) { + auto begin = in_data->buffers.begin() + 2; + auto end = in_data->buffers.end(); + + for (auto it = begin; it != end; ++it) { + out_->buffers.push_back(*it); + } + } + + ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, BinaryViewType::kSize)); + ARROW_ASSIGN_OR_RAISE(auto header_buffer, ConcatenateBuffers(header_buffers, pool_)); + + if (!type.has_raw_pointers()) { + auto* s = header_buffer->mutable_data_as(); + + size_t preceding_buffer_count = 0; + + int64_t i = in_[0]->length; + for (size_t in_index = 1; in_index < in_.size(); ++in_index) { + preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2; + + for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) { + if (s[i].is_inline()) continue; + s[i].io.buffer_index += preceding_buffer_count; + } + } + } + + out_->buffers[1] = std::move(header_buffer); + return Status::OK(); + } + Status Visit(const ListType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); @@ -606,6 +641,10 @@ Result> Concatenate(const ArrayVector& arrays, MemoryPool return Status::Invalid("Must pass at least one array"); } + if (arrays.size() == 1) { + return arrays[0]; + } + // gather ArrayData of input arrays ArrayDataVector data(arrays.size()); for (size_t i = 0; i < arrays.size(); ++i) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 4c03fab731ffe..6cefd5d779d4c 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -92,8 +92,14 @@ 
class ConcatenateTest : public ::testing::Test { for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); + ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); + ASSERT_OK(expected->ValidateFull()); auto slices = this->Slices(array, offsets); + for (auto slice : slices) { + ASSERT_OK(slice->ValidateFull()); + } + ASSERT_OK(expected->ValidateFull()); ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); AssertArraysEqual(*expected, *actual); if (actual->data()->buffers[0]) { @@ -155,6 +161,23 @@ TEST_F(ConcatenateTest, StringType) { }); } +TEST_F(ConcatenateTest, StringViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + ASSERT_OK((**out).ValidateFull()); + }); + + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + const ArrayData& io = *(*out)->data(); + auto raw_buf = AllocateBuffer(io.buffers[1]->size()).ValueOrDie(); + ABORT_NOT_OK(internal::SwapStringHeaderPointers( + io, raw_buf->mutable_data_as())); + (*out)->data()->buffers[1] = std::move(raw_buf); + (*out)->data()->type = utf8_view(/*has_raw_pointers=*/true); + }); +} + TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 79595ab7c7c31..5db4fab32c494 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -31,6 +31,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -187,7 +188,7 @@ void 
ArraySpan::SetMembers(const ArrayData& data) { } this->offset = data.offset; - for (int i = 0; i < static_cast(data.buffers.size()); ++i) { + for (int i = 0; i < std::min(static_cast(data.buffers.size()), 3); ++i) { const std::shared_ptr& buffer = data.buffers[i]; // It is the invoker-of-kernels's responsibility to ensure that // const buffers are not written to accidentally. @@ -200,7 +201,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { Type::type type_id = this->type->id(); if (type_id == Type::EXTENSION) { - const ExtensionType* ext_type = checked_cast(this->type); + auto* ext_type = checked_cast(this->type); type_id = ext_type->storage_type()->id(); } @@ -215,6 +216,14 @@ void ArraySpan::SetMembers(const ArrayData& data) { this->buffers[i] = {}; } + if (type_id == Type::STRING_VIEW || type_id == Type::BINARY_VIEW) { + // store the span of character buffers in the third buffer + this->buffers[2].data = + const_cast(reinterpret_cast(data.buffers.data() + 2)); + this->buffers[2].size = + static_cast(data.buffers.size() - 2) * sizeof(std::shared_ptr); + } + if (type_id == Type::DICTIONARY) { this->child_data.resize(1); this->child_data[0].SetMembers(*data.dictionary); @@ -247,6 +256,8 @@ int GetNumBuffers(const DataType& type) { case Type::LARGE_BINARY: case Type::STRING: case Type::LARGE_STRING: + case Type::STRING_VIEW: + case Type::BINARY_VIEW: case Type::DENSE_UNION: return 3; case Type::EXTENSION: @@ -351,6 +362,22 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } this->buffers[2].data = const_cast(data_buffer); this->buffers[2].size = data_size; + } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { + const auto& scalar = checked_cast(value); + + this->buffers[1].size = BinaryViewType::kSize; + this->buffers[1].data = scalar.scratch_space_; + auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type; + if (scalar.is_valid) { + *view = checked_cast(type)->has_raw_pointers() + ? 
util::ToRawPointerBinaryView(std::string_view{*scalar.value}) + : util::ToIndexOffsetBinaryView(std::string_view{*scalar.value}, 0, 0); + this->buffers[2].data = + const_cast(reinterpret_cast(&scalar.value)); + this->buffers[2].size = sizeof(std::shared_ptr); + } else { + *view = {}; + } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); @@ -722,7 +749,8 @@ struct ViewDataImpl { } RETURN_NOT_OK(CheckInputAvailable()); - const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx]; + const auto& in_layout = in_layouts[in_layout_idx]; + const auto& in_spec = in_layout.buffers[in_buffer_idx]; if (out_spec != in_spec) { return InvalidView("incompatible layouts"); } @@ -733,6 +761,18 @@ struct ViewDataImpl { DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx); out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); ++in_buffer_idx; + + if (in_buffer_idx == in_layout.buffers.size()) { + if (out_layout.variadic_spec != in_layout.variadic_spec) { + return InvalidView("incompatible layouts"); + } + + if (in_layout.variadic_spec) { + for (; in_buffer_idx < in_data_item->buffers.size(); ++in_buffer_idx) { + out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); + } + } + } AdjustInputPointer(); } diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 8c6b250b71adf..db4c485534938 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -388,6 +388,7 @@ struct ARROW_EXPORT ArraySpan { ArraySpan(const ArrayData& data) { // NOLINT implicit conversion SetMembers(data); } + /// Warning: this produces an ArraySpan which cannot be safely moved/copied! 
explicit ArraySpan(const Scalar& data) { FillFromScalar(data); } /// If dictionary-encoded, put dictionary in the first entry @@ -395,6 +396,7 @@ struct ARROW_EXPORT ArraySpan { /// \brief Populate ArraySpan to look like an array of length 1 pointing at /// the data members of a Scalar value + /// Warning: this produces an ArraySpan which cannot be safely moved/copied! void FillFromScalar(const Scalar& value); void SetMembers(const ArrayData& data); @@ -472,10 +474,12 @@ struct ARROW_EXPORT ArraySpan { void SetSlice(int64_t offset, int64_t length) { this->offset = offset; this->length = length; - if (this->type->id() != Type::NA) { + if (this->type->id() == Type::NA) { + this->null_count = this->length; + } else if (this->MayHaveNulls()) { this->null_count = kUnknownNullCount; } else { - this->null_count = this->length; + this->null_count = 0; } } diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index 5245c8d0ff313..836ea8dbc3a8b 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -156,6 +156,37 @@ struct DictionaryTraits> { } }; +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static_assert(std::is_same_v>); + + // Instead of defining a custom memo table for StringView we reuse BinaryType's, + // then convert to string view when we copy data out of the memo table. 
+ static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + DCHECK(type->id() == Type::STRING_VIEW || type->id() == Type::BINARY_VIEW); + + BinaryViewBuilder builder(pool); + RETURN_NOT_OK(builder.Resize(memo_table.size() - start_offset)); + RETURN_NOT_OK(builder.ReserveData(memo_table.values_size())); + memo_table.VisitValues(static_cast(start_offset), + [&](std::string_view s) { builder.UnsafeAppend(s); }); + RETURN_NOT_OK(builder.FinishInternal(out)); + if (checked_cast(*type).has_raw_pointers()) { + // the builder produces index/offset string views, so swap to raw pointers + RETURN_NOT_OK( + SwapStringHeaderPointers(**out, (*out)->GetMutableValues(1))); + } + (*out)->type = type; + return Status::OK(); + } +}; + template struct DictionaryTraits> { using MemoTableType = typename HashTraits::MemoTableType; diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index e84ab404ad6e3..bd4d8ce6a84bb 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -43,6 +43,9 @@ #include "arrow/util/decimal.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" +#include "arrow/util/span.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -271,6 +274,17 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + if (type.has_raw_pointers()) { + return Status::Invalid("Swapping endianness of ", type); + } + + // XXX: This requires knowledge of whether the array is being swapped to native endian + // or from it so that we know what size to trust when deciding whether something is an + // inline view. 
+ return Status::NotImplemented("Swapping endianness of ", type); + } + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); @@ -377,6 +391,10 @@ class NullArrayFactory { return MaxOf(sizeof(typename T::offset_type) * (length_ + 1)); } + Status Visit(const BinaryViewType& type) { + return MaxOf(sizeof(BinaryViewType::c_type) * length_); + } + Status Visit(const FixedSizeListType& type) { return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_)); } @@ -496,6 +514,11 @@ class NullArrayFactory { return Status::OK(); } + Status Visit(const BinaryViewType&) { + out_->buffers.resize(2, buffer_); + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& type) { out_->buffers.resize(2, buffer_); @@ -598,6 +621,11 @@ class RepeatedArrayFactory { RepeatedArrayFactory(MemoryPool* pool, const Scalar& scalar, int64_t length) : pool_(pool), scalar_(scalar), length_(length) {} + template + const auto& scalar() const { + return checked_cast::ScalarType&>(scalar_); + } + Result> Create() { RETURN_NOT_OK(VisitTypeInline(*scalar_.type, this)); return out_; @@ -619,7 +647,7 @@ class RepeatedArrayFactory { template enable_if_t::value || is_temporal_type::value, Status> Visit( const T&) { - auto value = checked_cast::ScalarType&>(scalar_).value; + auto value = scalar().value; return FinishFixedWidth(&value, sizeof(value)); } @@ -630,8 +658,7 @@ class RepeatedArrayFactory { template enable_if_decimal Visit(const T&) { - using ScalarType = typename TypeTraits::ScalarType; - auto value = checked_cast(scalar_).value.ToBytes(); + auto value = scalar().value.ToBytes(); return FinishFixedWidth(value.data(), value.size()); } @@ -642,29 +669,38 @@ class RepeatedArrayFactory { template enable_if_base_binary Visit(const T&) { - std::shared_ptr value = - checked_cast::ScalarType&>(scalar_).value; + const std::shared_ptr& value = scalar().value; std::shared_ptr values_buffer, offsets_buffer; 
RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer)); auto size = static_cast(value->size()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared::ArrayType>(length_, offsets_buffer, - values_buffer); + out_ = std::make_shared::ArrayType>( + length_, std::move(offsets_buffer), std::move(values_buffer)); + return Status::OK(); + } + + template + enable_if_binary_view_like Visit(const T& type) { + std::string_view value{*scalar().value}; + auto s = type.has_raw_pointers() ? util::ToRawPointerBinaryView(value) + : util::ToIndexOffsetBinaryView(value, 0, 0); + + RETURN_NOT_OK(FinishFixedWidth(&s, sizeof(s))); + if (!s.is_inline()) { + out_->data()->buffers.push_back(scalar().value); + } return Status::OK(); } template enable_if_var_size_list Visit(const T& type) { - using ScalarType = typename TypeTraits::ScalarType; using ArrayType = typename TypeTraits::ArrayType; - auto value = checked_cast(scalar_).value; - - ArrayVector values(length_, value); + ArrayVector values(length_, scalar().value); ARROW_ASSIGN_OR_RAISE(auto value_array, Concatenate(values, pool_)); std::shared_ptr offsets_buffer; - auto size = static_cast(value->length()); + auto size = static_cast(scalar().value->length()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); out_ = @@ -951,5 +987,231 @@ std::vector RechunkArraysConsistently( return rechunked_groups; } +namespace { +Status FromRawPointerStringHeaders(const ArraySpan& raw, + util::span> char_buffers, + BinaryViewType::c_type* io) { + DCHECK_NE(char_buffers.size(), 0); + + auto IsInBuffer = [](const Buffer& buffer, BinaryViewType::c_type s) { + return buffer.data() <= s.raw.data && + buffer.data() + buffer.size() >= s.raw.data + s.size(); + }; + + auto Write = [&](auto find_containing_buffer) { + // Given `find_containing_buffer` which looks up the index of a buffer containing + // a BinaryViewType::c_type, write an equivalent buffer of index/offset string views. 
+ static const Buffer kEmptyBuffer{""}; + const Buffer* buffer_containing_previous_view = &kEmptyBuffer; + uint32_t buffer_index; + + auto* raw_ptr = raw.GetValues(1); + + bool all_valid = true; + VisitNullBitmapInline( + raw.buffers[0].data, raw.offset, raw.length, raw.null_count, + [&] { + if (raw_ptr->is_inline()) { + *io++ = *raw_ptr++; + return; + } + + // Copied to a local variable, so even if io == raw_ptr + // we can modify safely. + auto s = *raw_ptr++; + + // Fast path: for most string view arrays, we'll have runs + // of views into the same buffer. + if (ARROW_PREDICT_FALSE(!IsInBuffer(*buffer_containing_previous_view, s))) { + auto found = find_containing_buffer(s); + if (ARROW_PREDICT_FALSE(!found)) { + all_valid = false; + return; + } + // Assume that we're at the start of a run of views into + // char_buffers[buffer_index]; adjust the fast path's pointer accordingly + buffer_index = *found; + buffer_containing_previous_view = char_buffers[buffer_index].get(); + } + + auto offset = s.raw.data - char_buffers[buffer_index]->data(); + io++->io = { + s.raw.size, + s.raw.prefix, + buffer_index, + static_cast(offset), + }; + }, + [&] { + ++raw_ptr; + *io++ = {}; + }); + + if (!all_valid) { + return Status::IndexError( + "A header pointed outside the provided character buffers"); + } + return Status::OK(); + }; + + auto LinearSearch = [&](BinaryViewType::c_type s) -> std::optional { + uint32_t buffer_index = 0; + for (const auto& char_buffer : char_buffers) { + if (IsInBuffer(*char_buffer, s)) return buffer_index; + ++buffer_index; + } + return {}; + }; + + if (char_buffers.size() <= 32) { + // If there are few buffers to search through, sorting/binary search is not + // worthwhile. TODO(bkietz) benchmark this and get a less magic number here. 
+ return Write(LinearSearch); + } + + auto sort_indices = ArgSort( + char_buffers, [](const auto& l, const auto& r) { return l->data() < r->data(); }); + + auto first_overlapping = std::adjacent_find( + sort_indices.begin(), sort_indices.end(), [&](uint32_t before, uint32_t after) { + return char_buffers[before]->data() + char_buffers[before]->size() <= + char_buffers[after]->data(); + }); + if (ARROW_PREDICT_FALSE(first_overlapping != sort_indices.end())) { + // Using a binary search with overlapping buffers would not *uniquely* identify + // a potentially-containing buffer. Moreover this should be a fairly rare case + // so optimizing for it seems premature. + return Write(LinearSearch); + } + + auto BinarySearch = [&](BinaryViewType::c_type s) -> std::optional { + // Find the first buffer whose data starts after the data in view- + // only buffers *before* this could contain view. Since we've additionally + // checked that the buffers do not overlap, only the buffer *immediately before* + // this could contain view. 
+ auto one_past_potential_super = + std::upper_bound(sort_indices.begin(), sort_indices.end(), s, + [&](const BinaryViewType::c_type& s, int64_t i) { + return IsInBuffer(*char_buffers[i], s); + }); + + if (ARROW_PREDICT_FALSE(one_past_potential_super == sort_indices.begin())) { + return {}; + } + + uint32_t buffer_index = *(one_past_potential_super - 1); + const Buffer& char_buffer = *char_buffers[buffer_index]; + if (ARROW_PREDICT_TRUE(IsInBuffer(char_buffer, s))) return buffer_index; + + return {}; + }; + + return Write(BinarySearch); +} + +Status ToRawPointerStringHeaders(const ArraySpan& io, + util::span> char_buffers, + BinaryViewType::c_type* raw) { + DCHECK_NE(char_buffers.size(), 0); + + uint32_t buffer_index = 0; + const uint8_t* buffer_data = char_buffers[0]->data(); + auto* io_ptr = io.GetValues(1); + + bool all_valid = true; + VisitNullBitmapInline( + io.buffers[0].data, io.offset, io.length, io.null_count, + [&] { + if (io_ptr->is_inline()) { + *raw++ = *io_ptr++; + return; + } + + // Copied to a local variable, so even if raw == io_ptr + // we can modify safely. + auto s = *io_ptr++; + + // Fast path: for most string view arrays, we'll have runs + // of views into the same buffer. 
+ if (ARROW_PREDICT_FALSE(s.io.buffer_index != buffer_index)) { + if (ARROW_PREDICT_FALSE(s.io.buffer_index >= char_buffers.size())) { + all_valid = false; + return; + } + // Assume that we're at the start of a run of views into + // char_buffers[buffer_index]; adjust the fast path's pointer accordingly + buffer_index = s.io.buffer_index; + buffer_data = char_buffers[buffer_index]->data(); + } + raw++->raw = { + s.io.size, + s.io.prefix, + buffer_data + s.io.offset, + }; + }, + [&] { + ++io_ptr; + *raw++ = {}; + }); + + if (!all_valid) { + return Status::IndexError("A header pointed outside the provided character buffers"); + } + return Status::OK(); +} +} // namespace + +Status SwapStringHeaderPointers(const ArraySpan& in, BinaryViewType::c_type* out) { + util::span char_buffers{ + reinterpret_cast*>(in.buffers[2].data), + static_cast(in.buffers[2].size / sizeof(std::shared_ptr))}; + + if (char_buffers.size() == 0) { + // If there are no character buffers, then all string views must be inline. + // In this case the buffer does not require swizzling between pointers and + // index/offsets. + auto* in_ptr = in.GetValues(1); + + bool all_inline = true; + VisitNullBitmapInline( + in.buffers[0].data, in.offset, in.length, in.null_count, + [&] { + all_inline = all_inline && in_ptr->is_inline(); + auto s = *in_ptr++; + *out++ = s; + }, + [&] { + ++in_ptr; + *out++ = {}; + }); + if (ARROW_PREDICT_FALSE(!all_inline)) { + return Status::IndexError( + "A header was not inline when no character buffers were provided"); + } + return Status::OK(); + } + + return checked_cast(in.type)->has_raw_pointers() + ? 
FromRawPointerStringHeaders(in, char_buffers, out) + : ToRawPointerStringHeaders(in, char_buffers, out); +} + +void StringHeadersFromStrings(const ArraySpan& strings, BinaryViewType::c_type* s) { + auto* buffer_data = strings.buffers[2].data_as(); + VisitArraySpanInline( + strings, + [&](std::string_view v) { + *s++ = util::ToIndexOffsetBinaryView(v, 0, v.data() - buffer_data); + }, + [&] { *s++ = BinaryViewType::c_type{}; }); +} + +void RawPointerStringHeadersFromStrings(const ArraySpan& strings, + BinaryViewType::c_type* s) { + VisitArraySpanInline( + strings, [&](std::string_view v) { *s++ = util::ToRawPointerBinaryView(v); }, + [&] { *s++ = BinaryViewType::c_type{}; }); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index 9f34af0525d96..ea701f63b90d5 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -86,5 +86,27 @@ Result> SwapEndianArrayData( ARROW_EXPORT std::vector RechunkArraysConsistently(const std::vector&); +/// Convert between index/offset and raw pointer StringHeaders. +/// +/// This function can be used to overwrite a buffer of StringHeader if desired, +/// IE it is supported for `in.buffers[1].data == out`. +/// +/// Note that calling this function is not necessary if all StringHeaders happen to be +/// Inline; this is usually efficiently detectable by checking for an absence of any +/// character buffers. +/// +/// Will raise IndexError if a header views memory outside the provided character buffers. 
+ARROW_EXPORT +Status SwapStringHeaderPointers(const ArraySpan& in, BinaryViewType::c_type* out); + +/// Fill a buffer of index/offset StringHeader from a dense string array +ARROW_EXPORT +void StringHeadersFromStrings(const ArraySpan& strings, BinaryViewType::c_type* io); + +/// Fill a buffer of raw pointer StringHeader from a dense string array +ARROW_EXPORT +void RawPointerStringHeadersFromStrings(const ArraySpan& strings, + BinaryViewType::c_type* raw); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 0f2bd4583574e..b097f3e1010cb 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -31,41 +31,43 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/util/sort.h" +#include "arrow/util/string.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { struct UTF8DataValidator { const ArrayData& data; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } - - template - enable_if_string Visit(const StringType&) { - util::InitializeUTF8(); + template + Status Visit(const T&) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + util::InitializeUTF8(); - int64_t i = 0; - return VisitArraySpanInline( - data, - [&](std::string_view v) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); + int64_t i = 0; + return VisitArraySpanInline( + data, + [&](std::string_view v) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); 
+ } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } else { + Unreachable("utf-8 validation of non string type"); + } } }; @@ -74,10 +76,7 @@ struct BoundsChecker { int64_t min_value; int64_t max_value; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } + Status Visit(const DataType&) { Unreachable("bounds checking of non integer type"); } template enable_if_integer Visit(const IntegerType&) { @@ -169,6 +168,14 @@ struct ValidateArrayImpl { return Status::OK(); } + Status Visit(const StringViewType& type) { + RETURN_NOT_OK(ValidateBinaryView(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); + } + return Status::OK(); + } + Status Visit(const Date64Type& type) { RETURN_NOT_OK(ValidateFixedWidthBuffers()); @@ -248,6 +255,8 @@ struct ValidateArrayImpl { Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } + Status Visit(const BinaryViewType& type) { return ValidateBinaryView(type); } + Status Visit(const ListType& type) { return ValidateListLike(type); } Status Visit(const LargeListType& type) { return ValidateListLike(type); } @@ -453,7 +462,14 @@ struct ValidateArrayImpl { return Status::Invalid("Array length is negative"); } - if (data.buffers.size() != layout.buffers.size()) { + if (layout.variadic_spec) { + if (data.buffers.size() < layout.buffers.size()) { + return Status::Invalid("Expected at least ", layout.buffers.size(), + " buffers in array " + "of type ", + type.ToString(), ", got ", data.buffers.size()); + } + } else if (data.buffers.size() != layout.buffers.size()) { return Status::Invalid("Expected ", layout.buffers.size(), " buffers in array " "of type ", @@ -469,7 +485,9 @@ struct ValidateArrayImpl { for (int i = 0; i < static_cast(data.buffers.size()); ++i) { const auto& buffer = data.buffers[i]; - const auto& spec = layout.buffers[i]; + const auto& spec = i < static_cast(layout.buffers.size()) + ? 
layout.buffers[i] + : *layout.variadic_spec; if (buffer == nullptr) { continue; @@ -595,6 +613,61 @@ struct ValidateArrayImpl { return Status::OK(); } + Status ValidateBinaryView(const BinaryViewType& type) { + int64_t headers_byte_size = data.buffers[1]->size(); + int64_t required_headers = data.length + data.offset; + if (static_cast(headers_byte_size / BinaryViewType::kSize) < + required_headers) { + return Status::Invalid("Header buffer size (bytes): ", headers_byte_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + + if (!full_validation) { + return Status::OK(); + } + + if (type.has_raw_pointers()) { + // TODO(bkietz) It may be preferable to extract the validation logic which is in + // the raw pointer - index/offset view array conversion functions and actually + // validate raw pointer view arrays, rather than giving up here + return Status::OK(); + } + + auto* s = data.GetValues(1); + for (int64_t i = 0; i < data.length; ++i, ++s) { + if (data.IsNull(i)) continue; + + if (s->is_inline()) continue; + + if (ARROW_PREDICT_FALSE(s->io.buffer_index >= data.buffers.size() - 2)) { + return Status::IndexError("String view at slot ", i, " references buffer ", + s->io.buffer_index, " but there are only ", + data.buffers.size() - 2, " character buffers"); + } + + size_t begin = s->io.offset, end = begin + s->size(); + const auto& buffer = data.buffers[s->io.buffer_index + 2]; + auto size = static_cast(buffer->size()); + if (ARROW_PREDICT_FALSE(end > size)) { + return Status::IndexError("String view at slot ", i, " references range ", begin, + "-", end, " of buffer ", s->io.buffer_index, + " but that buffer is only ", size, " bytes long"); + } + + auto* data = buffer->data() + begin; + if (ARROW_PREDICT_FALSE( + std::memcmp(data, s->inline_data(), BinaryViewType::kPrefixSize) != 0)) { + return Status::Invalid("String view at slot ", i, " has inlined prefix 0x", + HexEncode(s->inline_data(), BinaryViewType::kPrefixSize), + " 
but the out-of-line character data begins with 0x", + HexEncode(data, BinaryViewType::kPrefixSize)); + } + } + + return Status::OK(); + } + template Status ValidateListLike(const ListType& type) { const ArrayData& values = *data.child_data[0]; @@ -796,7 +869,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING); + DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || + data.type->id() == Type::LARGE_STRING); UTF8DataValidator validator{data}; return VisitTypeInline(*data.type, &validator); } @@ -804,5 +878,4 @@ Status ValidateUTF8(const ArrayData& data) { ARROW_EXPORT Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index e7eea64043ba8..a84c98b6b2491 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -143,7 +143,10 @@ class ARROW_EXPORT BufferBuilder { memcpy(data_ + size_, data, static_cast(length)); size_ += length; } - void UnsafeAppend(std::string_view v) { UnsafeAppend(v.data(), v.size()); } + + void UnsafeAppend(std::string_view v) { + UnsafeAppend(v.data(), static_cast(v.size())); + } void UnsafeAppend(const int64_t num_copies, uint8_t value) { memset(data_ + size_, value, static_cast(num_copies)); @@ -268,7 +271,7 @@ class TypedBufferBuilder< template void UnsafeAppend(Iter values_begin, Iter values_end) { - int64_t num_elements = static_cast(std::distance(values_begin, values_end)); + auto num_elements = static_cast(std::distance(values_begin, values_end)); auto data = mutable_data() + length(); bytes_builder_.UnsafeAdvance(num_elements * sizeof(T)); std::copy(values_begin, values_end, data); diff --git a/cpp/src/arrow/builder.cc 
b/cpp/src/arrow/builder.cc index caddbf9db5578..a1bc398cca4b8 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -148,6 +148,8 @@ struct DictionaryBuilderCase { Status Visit(const StringType&) { return CreateFor(); } Status Visit(const LargeBinaryType&) { return CreateFor(); } Status Visit(const LargeStringType&) { return CreateFor(); } + Status Visit(const BinaryViewType&) { return CreateFor(); } + Status Visit(const StringViewType&) { return CreateFor(); } Status Visit(const FixedSizeBinaryType&) { return CreateFor(); } Status Visit(const Decimal128Type&) { return CreateFor(); } Status Visit(const Decimal256Type&) { return CreateFor(); } @@ -162,6 +164,11 @@ struct DictionaryBuilderCase { template Status CreateFor() { + if constexpr (is_binary_view_like_type::value) { + if (checked_cast(*value_type).has_raw_pointers()) { + return NotImplemented(*value_type); + } + } using AdaptiveBuilderType = DictionaryBuilder; if (dictionary != nullptr) { out->reset(new AdaptiveBuilderType(dictionary, pool)); @@ -190,7 +197,12 @@ struct DictionaryBuilderCase { struct MakeBuilderImpl { template - enable_if_not_nested Visit(const T&) { + enable_if_not_nested Visit(const T& t) { + if constexpr (is_binary_view_like_type::value) { + if (t.has_raw_pointers()) { + return NotImplemented(); + } + } out.reset(new typename TypeTraits::BuilderType(type, pool)); return Status::OK(); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index df41cd22c9e06..edac06c4925e2 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -38,6 +38,7 @@ #include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" @@ -261,6 +262,36 @@ class RangeDataEqualsImpl { // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } + // Also matches StringViewType + 
Status Visit(const BinaryViewType& type) { + auto* left_values = left_.GetValues(1) + left_start_idx_; + auto* right_values = right_.GetValues(1) + right_start_idx_; + if (type.has_raw_pointers()) { + VisitValidRuns([&](int64_t i, int64_t length) { + for (auto end_i = i + length; i < end_i; ++i) { + if (!util::EqualRawPointerBinaryView(left_values[i], right_values[i])) { + return false; + } + } + return true; + }); + return Status::OK(); + } + + auto* left_buffers = left_.buffers.data() + 2; + auto* right_buffers = right_.buffers.data() + 2; + VisitValidRuns([&](int64_t i, int64_t length) { + for (auto end_i = i + length; i < end_i; ++i) { + if (!util::EqualIndexOffsetBinaryView(left_values[i], right_values[i], + left_buffers, right_buffers)) { + return false; + } + } + return true; + }); + return Status::OK(); + } + // Also matches LargeStringType Status Visit(const LargeBinaryType& type) { return CompareBinary(type); } @@ -632,6 +663,12 @@ class TypeEqualsVisitor { return Status::OK(); } + Status Visit(const BinaryViewType& left) { + const auto& right = checked_cast(right_); + result_ = left.has_raw_pointers() == right.has_raw_pointers(); + return Status::OK(); + } + template enable_if_interval Visit(const T& left) { const auto& right = checked_cast(right_); @@ -802,8 +839,7 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } template - typename std::enable_if::value, Status>::type - Visit(const T& left) { + enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index a7bb2d88c291b..d75305b3e726d 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -31,6 +31,7 @@ #include "arrow/compute/kernels/common_internal.h" #include 
"arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -262,7 +263,7 @@ class HashKernel : public KernelState { // Base class for all "regular" hash kernel implementations // (NullType has a separate implementation) -template class RegularHashKernel : public HashKernel { public: @@ -502,39 +503,13 @@ class DictionaryHashKernel : public HashKernel { }; // ---------------------------------------------------------------------- - -template -struct HashKernelTraits {}; - -template -struct HashKernelTraits> { - using HashKernel = NullHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -Result> HashInitImpl(KernelContext* ctx, - const KernelInitArgs& args) { - using HashKernelType = typename HashKernelTraits::HashKernel; - auto result = std::make_unique(args.inputs[0].GetSharedPtr(), - args.options, ctx->memory_pool()); - RETURN_NOT_OK(result->Reset()); - return std::move(result); -} - -template +template Result> HashInit(KernelContext* ctx, const KernelInitArgs& args) { - return HashInitImpl(ctx, args); + auto result = std::make_unique(args.inputs[0].GetSharedPtr(), args.options, + ctx->memory_pool()); + RETURN_NOT_OK(result->Reset()); + return std::move(result); } template @@ -543,22 +518,22 @@ KernelInit GetHashInit(Type::type type_id) { // representation switch (type_id) { case Type::NA: - return HashInit; + return HashInit>; case Type::BOOL: - return HashInit; + return HashInit>; case Type::INT8: case Type::UINT8: - return HashInit; + return HashInit>; case Type::INT16: case Type::UINT16: - return HashInit; + return HashInit>; case Type::INT32: case Type::UINT32: case Type::FLOAT: case Type::DATE32: case Type::TIME32: case Type::INTERVAL_MONTHS: - return HashInit; + return HashInit>; case Type::INT64: case Type::UINT64: case Type::DOUBLE: @@ -567,22 +542,24 @@ KernelInit 
GetHashInit(Type::type type_id) { case Type::TIMESTAMP: case Type::DURATION: case Type::INTERVAL_DAY_TIME: - return HashInit; + return HashInit>; case Type::BINARY: case Type::STRING: - return HashInit; + return HashInit>; case Type::LARGE_BINARY: case Type::LARGE_STRING: - return HashInit; + return HashInit>; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return HashInit>; case Type::FIXED_SIZE_BINARY: case Type::DECIMAL128: case Type::DECIMAL256: - return HashInit; + return HashInit>; case Type::INTERVAL_MONTH_DAY_NANO: - return HashInit; + return HashInit>; default: - DCHECK(false); - return nullptr; + Unreachable("non hashable type"); } } @@ -592,31 +569,11 @@ template Result> DictionaryHashInit(KernelContext* ctx, const KernelInitArgs& args) { const auto& dict_type = checked_cast(*args.inputs[0].type); - Result> indices_hasher; - switch (dict_type.index_type()->id()) { - case Type::INT8: - case Type::UINT8: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT16: - case Type::UINT16: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT32: - case Type::UINT32: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT64: - case Type::UINT64: - indices_hasher = HashInitImpl(ctx, args); - break; - default: - DCHECK(false) << "Unsupported dictionary index type"; - break; - } - RETURN_NOT_OK(indices_hasher); - return std::make_unique(std::move(indices_hasher.ValueOrDie()), - dict_type.value_type()); + ARROW_ASSIGN_OR_RAISE(auto indices_hasher, + GetHashInit(dict_type.index_type()->id())(ctx, args)); + return std::make_unique( + checked_pointer_cast(std::move(indices_hasher)), + dict_type.value_type()); } Status HashExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 0df8425609ff1..ef53234436efa 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ 
b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -727,6 +727,15 @@ struct ScalarToProtoImpl { s); } + Status Visit(const StringViewScalar& s) { + return FromBuffer([](Lit* lit, std::string&& s) { lit->set_string(std::move(s)); }, + s); + } + Status Visit(const BinaryViewScalar& s) { + return FromBuffer([](Lit* lit, std::string&& s) { lit->set_binary(std::move(s)); }, + s); + } + Status Visit(const FixedSizeBinaryScalar& s) { return FromBuffer( [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 1f9141f36ba6b..93d8642eb80b2 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -263,6 +263,13 @@ struct DataTypeToProtoImpl { return SetWith(&substrait::Type::set_allocated_binary); } + Status Visit(const StringViewType& t) { + return SetWith(&::substrait::Type::set_allocated_string); + } + Status Visit(const BinaryViewType& t) { + return SetWith(&::substrait::Type::set_allocated_binary); + } + Status Visit(const FixedSizeBinaryType& t) { SetWithThen(&substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); return Status::OK(); diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index ed7be4b502985..a0e4b3691d9d0 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -48,6 +48,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/range.h" +#include "arrow/util/span.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" #include "arrow/visit_array_inline.h" @@ -106,6 +107,11 @@ std::string GetTimeUnitName(TimeUnit::type unit) { return "UNKNOWN"; } +std::string_view GetStringView(const rj::Value& str) { + DCHECK(str.IsString()); + return {str.GetString(), str.GetStringLength()}; +} 
+ class SchemaWriter { public: explicit SchemaWriter(const Schema& schema, const DictionaryFieldMapper& mapper, @@ -226,8 +232,9 @@ class SchemaWriter { template enable_if_t::value || is_primitive_ctype::value || - is_base_binary_type::value || is_var_length_list_type::value || - is_struct_type::value || is_run_end_encoded_type::value> + is_base_binary_type::value || is_binary_view_like_type::value || + is_var_length_list_type::value || is_struct_type::value || + is_run_end_encoded_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -382,6 +389,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } + Status Visit(const StringViewType& type) { return WritePrimitive("utf8view", type); } + Status Visit(const BinaryViewType& type) { return WritePrimitive("binaryview", type); } Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { @@ -528,22 +537,19 @@ class ArrayWriter { } } - // Binary, encode to hexadecimal. 
- template - enable_if_binary_like WriteDataValues( - const ArrayType& arr) { - for (int64_t i = 0; i < arr.length(); ++i) { - writer_->String(HexEncode(arr.GetView(i))); - } - } - - // UTF8 string, write as is - template - enable_if_string_like WriteDataValues( - const ArrayType& arr) { + template + std::enable_if_t::value || + is_fixed_size_binary_type::value> + WriteDataValues(const ArrayType& arr) { for (int64_t i = 0; i < arr.length(); ++i) { - auto view = arr.GetView(i); - writer_->String(view.data(), static_cast(view.size())); + if constexpr (Type::is_utf8) { + // UTF8 string, write as is + auto view = arr.GetView(i); + writer_->String(view.data(), static_cast(view.size())); + } else { + // Binary, encode to hexadecimal. + writer_->String(HexEncode(arr.GetView(i))); + } } } @@ -642,6 +648,50 @@ class ArrayWriter { writer_->EndArray(); } + template + void WriteBinaryViewField(const ArrayType& array) { + writer_->Key(kData); + writer_->StartArray(); + for (int64_t i = 0; i < array.length(); ++i) { + auto s = array.raw_values()[i]; + writer_->StartObject(); + writer_->Key("SIZE"); + writer_->Int64(s.size()); + if (s.is_inline()) { + writer_->Key("INLINED"); + if constexpr (ArrayType::TypeClass::is_utf8) { + writer_->String(reinterpret_cast(s.inline_data()), s.size()); + } else { + writer_->String(HexEncode(s.inline_data(), s.size())); + } + } else { + // Prefix is always 4 bytes so it may not be utf-8 even if the whole + // string view is + writer_->Key("PREFIX"); + writer_->String(HexEncode(s.inline_data(), BinaryViewType::kPrefixSize)); + writer_->Key("BUFFER_INDEX"); + writer_->Int64(s.io.buffer_index); + writer_->Key("OFFSET"); + writer_->Int64(s.io.offset); + } + writer_->EndObject(); + } + writer_->EndArray(); + } + + void WriteVariadicBuffersField(const BinaryViewArray& arr) { + writer_->Key("VARIADIC_BUFFERS"); + writer_->StartArray(); + const auto& buffers = arr.data()->buffers; + for (size_t i = 2; i < buffers.size(); ++i) { + // Encode the character 
buffers into hexadecimal strings. + // Even for arrays which contain utf-8, portions of the buffer not + // referenced by any view may be invalid. + writer_->String(buffers[i]->ToHexString()); + } + writer_->EndArray(); + } + void WriteValidityField(const Array& arr) { writer_->Key("VALIDITY"); writer_->StartArray(); @@ -682,8 +732,10 @@ class ArrayWriter { } template - enable_if_t::value, Status> Visit( - const ArrayType& array) { + enable_if_t::value && + !is_binary_view_like_type::value, + Status> + Visit(const ArrayType& array) { WriteValidityField(array); WriteDataField(array); SetNoChildren(); @@ -700,6 +752,21 @@ class ArrayWriter { return Status::OK(); } + template + enable_if_binary_view_like Visit( + const ArrayType& array) { + if (array.has_raw_pointers()) { + return Status::NotImplemented("serialization of ", array.type()->ToString()); + } + + WriteValidityField(array); + WriteBinaryViewField(array); + WriteVariadicBuffersField(array); + + SetNoChildren(); + return Status::OK(); + } + Status Visit(const DictionaryArray& array) { return VisitArrayValues(*array.indices()); } @@ -1033,6 +1100,10 @@ Result> GetType(const RjObject& json_type, return utf8(); } else if (type_name == "binary") { return binary(); + } else if (type_name == "utf8view") { + return utf8_view(); + } else if (type_name == "binaryview") { + return binary_view(); } else if (type_name == "largeutf8") { return large_utf8(); } else if (type_name == "largebinary") { @@ -1332,6 +1403,97 @@ class ArrayReader { return FinishBuilder(&builder); } + template + enable_if_binary_view_like Visit(const ViewType& type) { + ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_)); + ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs, + GetMemberArray(obj_, "VARIADIC_BUFFERS")); + + using internal::Zip; + using util::span; + + BufferVector buffers; + buffers.resize(json_variadic_bufs.Size() + 2); + for (auto [json_buf, buf] : Zip(json_variadic_bufs, span{buffers}.subspan(2))) { + auto 
hex_string = GetStringView(json_buf); + ARROW_ASSIGN_OR_RAISE( + buf, AllocateBuffer(static_cast(hex_string.size()) / 2, pool_)); + RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data())); + } + + TypedBufferBuilder validity_builder{pool_}; + RETURN_NOT_OK(validity_builder.Resize(length_)); + for (bool is_valid : is_valid_) { + validity_builder.UnsafeAppend(is_valid); + } + ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish()); + + ARROW_ASSIGN_OR_RAISE( + buffers[1], AllocateBuffer(length_ * sizeof(BinaryViewType::c_type), pool_)); + + span headers{buffers[1]->mutable_data_as(), + static_cast(length_)}; + + int64_t null_count = 0; + for (auto [json_view, s, is_valid] : Zip(json_views, headers, is_valid_)) { + if (!is_valid) { + s = {}; + ++null_count; + continue; + } + + DCHECK(json_view.IsObject()); + const auto& json_view_obj = json_view.GetObject(); + + auto json_size = json_view_obj.FindMember("SIZE"); + RETURN_NOT_INT("SIZE", json_size, json_view_obj); + DCHECK_GE(json_size->value.GetInt64(), 0); + auto size = static_cast(json_size->value.GetInt64()); + + if (size <= BinaryViewType::kInlineSize) { + auto json_inlined = json_view_obj.FindMember("INLINED"); + RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj); + s.inlined = {size, {}}; + + if constexpr (ViewType::is_utf8) { + DCHECK_LE(json_inlined->value.GetStringLength(), BinaryViewType::kInlineSize); + memcpy(&s.inlined.data, json_inlined->value.GetString(), size); + } else { + DCHECK_LE(json_inlined->value.GetStringLength(), + BinaryViewType::kInlineSize * 2); + RETURN_NOT_OK( + ParseHexValues(GetStringView(json_inlined->value), s.inlined.data.data())); + } + continue; + } + + auto json_prefix = json_view_obj.FindMember("PREFIX"); + auto json_buffer_index = json_view_obj.FindMember("BUFFER_INDEX"); + auto json_offset = json_view_obj.FindMember("OFFSET"); + RETURN_NOT_STRING("PREFIX", json_prefix, json_view_obj); + RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj); + 
RETURN_NOT_INT("OFFSET", json_offset, json_view_obj); + + s.io = { + size, + {}, + static_cast(json_buffer_index->value.GetInt64()), + static_cast(json_offset->value.GetInt64()), + }; + + DCHECK_EQ(json_prefix->value.GetStringLength(), BinaryViewType::kPrefixSize * 2); + RETURN_NOT_OK( + ParseHexValues(GetStringView(json_prefix->value), s.io.prefix.data())); + + DCHECK_LE(s.io.buffer_index, buffers.size() - 2); + DCHECK_LE(static_cast(s.io.offset) + s.size(), + buffers[s.io.buffer_index + 2]->size()); + } + + data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count); + return Status::OK(); + } + Status Visit(const DayTimeIntervalType& type) { DayTimeIntervalBuilder builder(pool_); diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b6d3a3d7d8cbb..1ef076fac40e2 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -536,8 +536,8 @@ struct ArrayWriterV1 { is_nested_type::value || is_null_type::value || is_decimal_type::value || std::is_same::value || is_duration_type::value || is_interval_type::value || is_fixed_size_binary_type::value || - std::is_same::value || std::is_same::value || - std::is_same::value, + is_binary_view_like_type::value || std::is_same::value || + std::is_same::value || std::is_same::value, Status>::type Visit(const T& type) { return Status::NotImplemented(type.ToString()); diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index e1d4282cb2635..0b6ae4f620647 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -264,7 +264,8 @@ TEST_P(TestFeather, TimeTypes) { TEST_P(TestFeather, VLenPrimitiveRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckRoundtrip(batch); } @@ -306,7 +307,8 @@ TEST_P(TestFeather, SliceFloatRoundTrip) { TEST_P(TestFeather, 
SliceStringsRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckSlices(batch); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index eea0c9730283e..4d2d803f3f65e 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -847,6 +847,8 @@ Status GetDictConverter(const std::shared_ptr& type, PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) + PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType) + PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, FixedSizeBinaryType) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) @@ -905,6 +907,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 6eee5955242aa..b67c26999945b 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -271,7 +271,13 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestHalfFloat, TestIntegers, HalfFloatType); template class TestStrings : public 
::testing::Test { public: - std::shared_ptr type() { return TypeTraits::type_singleton(); } + std::shared_ptr type() const { + if constexpr (is_binary_view_like_type::value) { + return T::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } + } }; TYPED_TEST_SUITE_P(TestStrings); @@ -327,6 +333,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestString, TestStrings, StringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestBinary, TestStrings, BinaryType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeString, TestStrings, LargeStringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeBinary, TestStrings, LargeBinaryType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestStringView, TestStrings, StringViewType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryView, TestStrings, BinaryViewType); TEST(TestNull, Basics) { std::shared_ptr type = null(); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 1394516ecd5ce..23accc2390f85 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -258,6 +258,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeBinary: *out = large_binary(); return Status::OK(); + case flatbuf::Type::BinaryView: + *out = binary_view(); + return Status::OK(); case flatbuf::Type::FixedSizeBinary: { auto fw_binary = static_cast(type_data); return FixedSizeBinaryType::Make(fw_binary->byteWidth()).Value(out); @@ -268,6 +271,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeUtf8: *out = large_utf8(); return Status::OK(); + case flatbuf::Type::Utf8View: + *out = utf8_view(); + return Status::OK(); case flatbuf::Type::Bool: *out = boolean(); return Status::OK(); @@ -534,6 +540,26 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + static Status CheckForRawPointers(const BinaryViewType& type) { + if (type.has_raw_pointers()) { + return Status::NotImplemented( 
+ type.ToString(), " cannot be serialized; convert to index/offset format first"); + } + return Status::OK(); + } + + Status Visit(const BinaryViewType& type) { + fb_type_ = flatbuf::Type::BinaryView; + type_offset_ = flatbuf::CreateBinaryView(fbb_).Union(); + return CheckForRawPointers(type); + } + + Status Visit(const StringViewType& type) { + fb_type_ = flatbuf::Type::Utf8View; + type_offset_ = flatbuf::CreateUtf8View(fbb_).Union(); + return CheckForRawPointers(type); + } + Status Visit(const LargeBinaryType& type) { fb_type_ = flatbuf::Type::LargeBinary; type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); @@ -820,7 +846,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, FieldPosition field_pos, dictionary_id = encoding->id(); } - // 4. Is it an extension type? + // 4. Is it an extension or view type? if (metadata != nullptr) { // Look for extension metadata in custom_metadata field int name_index = metadata->FindKey(kExtensionTypeKeyName); @@ -967,6 +993,7 @@ static Status GetBodyCompression(FBB& fbb, const IpcWriteOptions& options, static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers, + const std::vector& variadic_counts, const IpcWriteOptions& options, RecordBatchOffset* offset) { FieldNodeVector fb_nodes; RETURN_NOT_OK(WriteFieldNodes(fbb, nodes, &fb_nodes)); @@ -977,7 +1004,10 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, BodyCompressionOffset fb_compression; RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression)); - *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression); + auto fb_variadic_counts = fbb.CreateVector(variadic_counts); + + *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression, + fb_variadic_counts); return Status::OK(); } @@ -1224,11 +1254,12 @@ Status WriteRecordBatchMessage( int64_t length, int64_t body_length, const std::shared_ptr& 
custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, variadic_counts, + options, &record_batch)); return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(), body_length, options.metadata_version, custom_metadata, options.memory_pool) @@ -1285,11 +1316,12 @@ Status WriteDictionaryMessage( int64_t id, bool is_delta, int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, variadic_counts, + options, &record_batch)); auto dictionary_batch = flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union(); return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch, diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index abbed5b2dace0..631a336f75a9a 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -201,7 +201,8 @@ Status WriteRecordBatchMessage( const int64_t length, const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& 
options, + std::shared_ptr* out); ARROW_EXPORT Result> WriteTensorMessage(const Tensor& tensor, @@ -225,7 +226,8 @@ Status WriteDictionaryMessage( const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); static inline Result> WriteFlatbufferBuilder( flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 69b827b8fe78d..7cd28e9e9e58d 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -159,7 +159,7 @@ TEST_P(TestMessage, SerializeCustomMetadata) { ASSERT_OK(internal::WriteRecordBatchMessage( /*length=*/0, /*body_length=*/0, metadata, /*nodes=*/{}, - /*buffers=*/{}, options_, &serialized)); + /*buffers=*/{}, /*variadic_counts=*/{}, options_, &serialized)); ASSERT_OK_AND_ASSIGN(std::unique_ptr message, Message::Open(serialized, /*body=*/nullptr)); @@ -240,23 +240,33 @@ class TestSchemaMetadata : public ::testing::Test { } }; -const std::shared_ptr INT32 = std::make_shared(); - TEST_F(TestSchemaMetadata, PrimitiveFields) { - auto f0 = field("f0", std::make_shared()); - auto f1 = field("f1", std::make_shared(), false); - auto f2 = field("f2", std::make_shared()); - auto f3 = field("f3", std::make_shared()); - auto f4 = field("f4", std::make_shared()); - auto f5 = field("f5", std::make_shared()); - auto f6 = field("f6", std::make_shared()); - auto f7 = field("f7", std::make_shared()); - auto f8 = field("f8", std::make_shared()); - auto f9 = field("f9", std::make_shared(), false); - auto f10 = field("f10", std::make_shared()); - - Schema schema({f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", int8()), + field("f1", int16(), false), + 
field("f2", int32()), + field("f3", int64()), + field("f4", uint8()), + field("f5", uint16()), + field("f6", uint32()), + field("f7", uint64()), + field("f8", float32()), + field("f9", float64(), false), + field("f10", boolean()), + })); +} + +TEST_F(TestSchemaMetadata, BinaryFields) { + CheckSchemaRoundtrip(Schema({ + field("f0", utf8()), + field("f1", binary()), + field("f2", large_utf8()), + field("f3", large_binary()), + field("f4", utf8_view()), + field("f5", binary_view()), + field("f6", fixed_size_binary(3)), + field("f7", fixed_size_binary(33)), + })); } TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { @@ -269,15 +279,14 @@ TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { } TEST_F(TestSchemaMetadata, NestedFields) { - auto type = list(int32()); - auto f0 = field("f0", type); - - std::shared_ptr type2( - new StructType({field("k1", INT32), field("k2", INT32), field("k3", INT32)})); - auto f1 = field("f1", type2); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", list(int32())), + field("f1", struct_({ + field("k1", int32()), + field("k2", int32()), + field("k3", int32()), + })), + })); } // Verify that nullable=false is well-preserved for child fields of map type. 
@@ -305,19 +314,15 @@ TEST_F(TestSchemaMetadata, NestedFieldsWithKeyValueMetadata) { TEST_F(TestSchemaMetadata, DictionaryFields) { { - auto dict_type = dictionary(int8(), int32(), true /* ordered */); - auto f0 = field("f0", dict_type); - auto f1 = field("f1", list(dict_type)); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + auto dict_type = dictionary(int8(), int32(), /*ordered=*/true); + CheckSchemaRoundtrip(Schema({ + field("f0", dict_type), + field("f1", list(dict_type)), + })); } { auto dict_type = dictionary(int8(), list(int32())); - auto f0 = field("f0", dict_type); - - Schema schema({f0}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } } @@ -325,9 +330,7 @@ TEST_F(TestSchemaMetadata, NestedDictionaryFields) { { auto inner_dict_type = dictionary(int8(), int32(), /*ordered=*/true); auto dict_type = dictionary(int16(), list(inner_dict_type)); - - Schema schema({field("f0", dict_type)}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } { auto dict_type1 = dictionary(int8(), utf8(), /*ordered=*/true); @@ -2891,21 +2894,21 @@ void GetReadRecordBatchReadRanges( // 1) read magic and footer length IO // 2) read footer IO // 3) read record batch metadata IO - ASSERT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); + EXPECT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); const int32_t magic_size = static_cast(strlen(ipc::internal::kArrowMagicBytes)); // read magic and footer length IO auto file_end_size = magic_size + sizeof(int32_t); auto footer_length_offset = buffer->size() - file_end_size; auto footer_length = bit_util::FromLittleEndian( util::SafeLoadAs(buffer->data() + footer_length_offset)); - ASSERT_EQ(read_ranges[0].length, file_end_size); + EXPECT_EQ(read_ranges[0].length, file_end_size); // read footer IO - ASSERT_EQ(read_ranges[1].length, footer_length); + EXPECT_EQ(read_ranges[1].length, footer_length); // read record batch 
metadata. The exact size is tricky to determine but it doesn't // matter for this test and it should be smaller than the footer. - ASSERT_LT(read_ranges[2].length, footer_length); + EXPECT_LE(read_ranges[2].length, footer_length); for (uint32_t i = 0; i < expected_body_read_lengths.size(); i++) { - ASSERT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); + EXPECT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); } } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 0def0e036e3c1..df1d0e3fbc1d4 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -59,6 +59,7 @@ #include "arrow/util/thread_pool.h" #include "arrow/util/ubsan.h" #include "arrow/util/vector.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" #include "generated/File_generated.h" // IWYU pragma: export @@ -242,6 +243,15 @@ class ArrayLoader { } } + Result GetVariadicCount(int i) { + auto* variadic_counts = metadata_->variadicBufferCounts(); + CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicBufferCounts"); + if (i >= static_cast(variadic_counts->size())) { + return Status::IOError("variadic_count_index out of range."); + } + return static_cast(variadic_counts->Get(i)); + } + Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); @@ -288,7 +298,6 @@ class ArrayLoader { return Status::OK(); } - template Status LoadBinary(Type::type type_id) { out_->buffers.resize(3); @@ -344,7 +353,22 @@ class ArrayLoader { template enable_if_base_binary Visit(const T& type) { - return LoadBinary(type.id()); + return LoadBinary(type.id()); + } + + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + + ARROW_ASSIGN_OR_RAISE(auto character_buffer_count, + GetVariadicCount(variadic_count_index_++)); + 
out_->buffers.resize(character_buffer_count + 2); + for (size_t i = 0; i < character_buffer_count; ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + } + return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { @@ -439,6 +463,7 @@ class ArrayLoader { int buffer_index_ = 0; int field_index_ = 0; bool skip_io_ = false; + int variadic_count_index_ = 0; BatchDataReadRequest read_request_; const Field* field_ = nullptr; @@ -569,10 +594,9 @@ Result> LoadRecordBatchSubset( // swap endian in a set of ArrayData if necessary (swap_endian == true) if (context.swap_endian) { - for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { - ARROW_ASSIGN_OR_RAISE(filtered_columns[i], - arrow::internal::SwapEndianArrayData( - filtered_columns[i], context.options.memory_pool)); + for (auto& filtered_column : filtered_columns) { + ARROW_ASSIGN_OR_RAISE(filtered_column, + arrow::internal::SwapEndianArrayData(filtered_column)); } } return RecordBatch::Make(std::move(filtered_schema), metadata->length(), diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 53721c0b20fbc..6faaf96b332d4 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -351,39 +351,32 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls return builder.Finish(out); } -Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls) { +Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls, + bool with_view_types) { const int64_t length = 500; - auto f0 = field("strings", utf8()); - auto f1 = field("binaries", binary()); - auto f2 = field("large_strings", large_utf8()); - auto f3 = field("large_binaries", large_binary()); - auto schema = ::arrow::schema({f0, f1, f2, f3}); - - std::shared_ptr a0, a1, a2, a3; - MemoryPool* pool = default_memory_pool(); - // Quirk with RETURN_NOT_OK macro and templated functions - { - auto s = - 
MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a0); - RETURN_NOT_OK(s); + ArrayVector arrays; + FieldVector fields; + + auto AppendColumn = [&](auto& MakeArray) { + arrays.emplace_back(); + RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); + + const auto& type = arrays.back()->type(); + fields.push_back(field(type->ToString(), type)); + return Status::OK(); + }; + + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + if (with_view_types) { + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); } - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a1); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a2); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a3); - RETURN_NOT_OK(s); - } - *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3}); + + *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 5e0c65556c630..fc0c8ddbea319 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -96,7 +96,7 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatch(std::shared_ptr* out, - bool with_nulls = true); + bool with_nulls = true, bool with_view_types = true); ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 1d230601566a0..beb4a4b09e7f8 100644 --- a/cpp/src/arrow/ipc/writer.cc 
+++ b/cpp/src/arrow/ipc/writer.cc @@ -52,10 +52,12 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/endian.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" #include "arrow/visit_array_inline.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -174,7 +176,8 @@ class RecordBatchSerializer { // Override this for writing dictionary metadata virtual Status SerializeMetadata(int64_t num_rows) { return WriteRecordBatchMessage(num_rows, out_->body_length, custom_metadata_, - field_nodes_, buffer_meta_, options_, &out_->metadata); + field_nodes_, buffer_meta_, variadic_counts_, options_, + &out_->metadata); } bool ShouldCompress(int64_t uncompressed_size, int64_t compressed_size) const { @@ -296,6 +299,8 @@ class RecordBatchSerializer { offset += size + padding; } + variadic_counts_ = out_->variadic_counts; + out_->body_length = offset - buffer_start_offset_; DCHECK(bit_util::IsMultipleOf8(out_->body_length)); @@ -403,6 +408,18 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const BinaryViewArray& array) { + auto headers = SliceBuffer(array.values(), array.offset() * BinaryViewType::kSize, + array.length() * BinaryViewType::kSize); + out_->body_buffers.emplace_back(std::move(headers)); + + out_->variadic_counts.emplace_back(array.data()->buffers.size() - 2); + for (size_t i = 2; i < array.data()->buffers.size(); ++i) { + out_->body_buffers.emplace_back(array.data()->buffers[i]); + } + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& array) { using offset_type = typename T::offset_type; @@ -590,6 +607,7 @@ class RecordBatchSerializer { std::vector field_nodes_; std::vector buffer_meta_; + std::vector variadic_counts_; const IpcWriteOptions& options_; int64_t max_recursion_depth_; @@ -606,8 +624,8 @@ class DictionarySerializer : 
public RecordBatchSerializer { Status SerializeMetadata(int64_t num_rows) override { return WriteDictionaryMessage(dictionary_id_, is_delta_, num_rows, out_->body_length, - custom_metadata_, field_nodes_, buffer_meta_, options_, - &out_->metadata); + custom_metadata_, field_nodes_, buffer_meta_, + variadic_counts_, options_, &out_->metadata); } Status Assemble(const std::shared_ptr& dictionary) { diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 9e18a213ba3f2..0b62c011d8892 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -57,6 +57,7 @@ struct IpcPayload { MessageType type = MessageType::NONE; std::shared_ptr metadata; std::vector> body_buffers; + std::vector variadic_counts; int64_t body_length = 0; // serialized body length (padded, maybe compressed) int64_t raw_body_length = 0; // initial uncompressed body length }; diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index 04ebe4714ceec..c393b77acf334 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -304,6 +304,8 @@ Status MakeConverter(const std::shared_ptr& out_type, MemoryPool* pool CONVERTER_CASE(Type::STRING, BinaryConverter); CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter); CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter); + CONVERTER_CASE(Type::BINARY_VIEW, BinaryConverter); + CONVERTER_CASE(Type::STRING_VIEW, BinaryConverter); CONVERTER_CASE(Type::DECIMAL128, DecimalConverter); CONVERTER_CASE(Type::DECIMAL256, DecimalConverter); default: diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 0f7b3466fdbc9..f7ab6fd10275f 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -110,8 +110,7 @@ struct GenerateImpl { return OK(writer.Double(val)); } - template - enable_if_base_binary Visit(const T&) { + Status GenerateAscii(const DataType&) { auto size = std::poisson_distribution<>{4}(e); std::uniform_int_distribution 
gen_char(32, 126); // FIXME generate UTF8 std::string s(size, '\0'); @@ -119,6 +118,13 @@ struct GenerateImpl { return OK(writer.String(s.c_str())); } + template + enable_if_base_binary Visit(const T& t) { + return GenerateAscii(t); + } + + Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + template enable_if_list_like Visit(const T& t) { auto size = std::poisson_distribution<>{4}(e); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index b2ad1ad519bb2..167e272705268 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -263,6 +263,12 @@ struct ScalarValidateImpl { Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } + Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); } + + Status Visit(const StringViewScalar& s) { return ValidateStringScalar(s); } + + Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); } + Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); } template @@ -548,17 +554,8 @@ Status Scalar::ValidateFull() const { return ScalarValidateImpl(/*full_validation=*/true).Validate(*this); } -BinaryScalar::BinaryScalar(std::string s) - : BinaryScalar(Buffer::FromString(std::move(s))) {} - -StringScalar::StringScalar(std::string s) - : StringScalar(Buffer::FromString(std::move(s))) {} - -LargeBinaryScalar::LargeBinaryScalar(std::string s) - : LargeBinaryScalar(Buffer::FromString(std::move(s))) {} - -LargeStringScalar::LargeStringScalar(std::string s) - : LargeStringScalar(Buffer::FromString(std::move(s))) {} +BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type, diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 1d1ce4aa72948..5175b0128524c 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -263,24 +263,21 @@ 
struct ARROW_EXPORT BaseBinaryScalar return value ? std::string_view(*value) : std::string_view(); } - protected: BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} friend ArraySpan; + BaseBinaryScalar(std::string s, std::shared_ptr type); }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; - BinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s); + explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} }; @@ -292,11 +289,39 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { explicit StringScalar(std::shared_ptr value) : StringScalar(std::move(value), utf8()) {} - explicit StringScalar(std::string s); + explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {} StringScalar() : StringScalar(utf8()) {} }; +struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { + using BaseBinaryScalar::BaseBinaryScalar; + using TypeClass = BinaryViewType; + + explicit BinaryViewScalar(std::shared_ptr value) + : BinaryViewScalar(std::move(value), binary_view()) {} + + explicit BinaryViewScalar(std::string s) + : BaseBinaryScalar(std::move(s), binary_view()) {} + + BinaryViewScalar() : BinaryViewScalar(binary_view()) {} + + std::string_view view() const override { return std::string_view(*this->value); } +}; + +struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using BinaryViewScalar::BinaryViewScalar; + using TypeClass = StringViewType; + + explicit StringViewScalar(std::shared_ptr value) + : StringViewScalar(std::move(value), utf8_view()) {} + + explicit StringViewScalar(std::string s) + : 
BinaryViewScalar(std::move(s), utf8_view()) {} + + StringViewScalar() : StringViewScalar(utf8_view()) {} +}; + struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; @@ -307,7 +332,8 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} - explicit LargeBinaryScalar(std::string s); + explicit LargeBinaryScalar(std::string s) + : BaseBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} }; @@ -319,7 +345,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { explicit LargeStringScalar(std::shared_ptr value) : LargeStringScalar(std::move(value), large_utf8()) {} - explicit LargeStringScalar(std::string s); + explicit LargeStringScalar(std::string s) + : LargeBinaryScalar(std::move(s), large_utf8()) {} LargeStringScalar() : LargeStringScalar(large_utf8()) {} }; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index bb462af86a5f2..641aae5a5e2e4 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -176,10 +176,17 @@ using DecimalArrowTypes = ::testing::Types; using BaseBinaryArrowTypes = ::testing::Types; +using BaseBinaryOrBinaryViewLikeArrowTypes = + ::testing::Types; + using BinaryArrowTypes = ::testing::Types; using StringArrowTypes = ::testing::Types; +using StringOrStringViewArrowTypes = + ::testing::Types; + using ListArrowTypes = ::testing::Types; using UnionArrowTypes = ::testing::Types; diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b74c41f75e452..aeb056272a0c1 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -363,13 +363,12 @@ std::shared_ptr RandomArrayGenerator::Decimal256(std::shared_ptr +template static std::shared_ptr 
GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { - using offset_type = typename TypeClass::offset_type; using BuilderType = typename TypeTraits::BuilderType; using OffsetArrowType = typename CTypeTraits::ArrowType; using OffsetArrayType = typename TypeTraits::ArrayType; @@ -387,7 +386,7 @@ static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int /*null_probability=*/0); std::vector str_buffer(max_length); - BuilderType builder(memory_pool, alignment); + BuilderType builder{memory_pool, alignment}; for (int64_t i = 0; i < size; ++i) { if (lengths->IsValid(i)) { @@ -430,6 +429,15 @@ std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( return *strings->View(binary()); } +std::shared_ptr RandomArrayGenerator::StringView(int64_t size, int32_t min_length, + int32_t max_length, + double null_probability, + int64_t alignment, + MemoryPool* memory_pool) { + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, alignment, memory_pool); +} + std::shared_ptr RandomArrayGenerator::StringWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { @@ -843,6 +851,23 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t ->View(field.type()); } + case Type::type::STRING_VIEW: + case Type::type::BINARY_VIEW: { + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + + auto out = StringView(length, min_length, max_length, null_probability, alignment); + + if (internal::checked_cast(*field.type()) + .has_raw_pointers()) { + ABORT_NOT_OK(internal::SwapStringHeaderPointers( + *out->data(), out->data()->buffers[1]->mutable_data_as())); + } + return out->View(field.type()).ValueOrDie(); + } + 
case Type::type::DECIMAL128: return Decimal128(field.type(), length, null_probability, alignment, memory_pool); diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index de9ea6d05648d..fa555b5f8308b 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -367,6 +367,22 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random StringViewArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] null_probability the probability of a value being null + /// + /// \return a generated Array + std::shared_ptr StringView(int64_t size, int32_t min_length, int32_t max_length, + double null_probability = 0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random LargeStringArray /// /// \param[in] size the size of the array to generate diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index f269818e83a3d..951b654e56f73 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -160,6 +160,7 @@ auto values = ::testing::Values( field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()), field("int64", int64()), field("float16", float16()), field("float32", float32()), field("float64", float64()), field("string", utf8()), field("binary", binary()), + field("string_view", utf8_view()), field("binary_view", binary_view()), field("fixed_size_binary", fixed_size_binary(8)), field("decimal128", decimal128(8, 3)), field("decimal128", 
decimal128(29, -5)), field("decimal256", decimal256(16, 4)), field("decimal256", decimal256(57, -6)), diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 3d294a3fa8642..41da5655a7880 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -60,10 +60,14 @@ constexpr Type::type FixedSizeListType::type_id; constexpr Type::type BinaryType::type_id; +constexpr Type::type BinaryViewType::type_id; + constexpr Type::type LargeBinaryType::type_id; constexpr Type::type StringType::type_id; +constexpr Type::type StringViewType::type_id; + constexpr Type::type LargeStringType::type_id; constexpr Type::type FixedSizeBinaryType::type_id; @@ -126,6 +130,8 @@ std::vector AllTypeIds() { Type::BINARY, Type::LARGE_STRING, Type::LARGE_BINARY, + Type::STRING_VIEW, + Type::BINARY_VIEW, Type::FIXED_SIZE_BINARY, Type::STRUCT, Type::LIST, @@ -190,7 +196,9 @@ std::string ToString(Type::type id) { TO_STRING_CASE(INTERVAL_MONTHS) TO_STRING_CASE(DURATION) TO_STRING_CASE(STRING) + TO_STRING_CASE(STRING_VIEW) TO_STRING_CASE(BINARY) + TO_STRING_CASE(BINARY_VIEW) TO_STRING_CASE(LARGE_STRING) TO_STRING_CASE(LARGE_BINARY) TO_STRING_CASE(FIXED_SIZE_BINARY) @@ -243,8 +251,12 @@ struct PhysicalTypeVisitor { } template - Status Visit(const Type&) { - result = TypeTraits::type_singleton(); + Status Visit(const Type& type) { + if constexpr (std::is_base_of_v) { + result = binary_view(type.has_raw_pointers()); + } else { + result = TypeTraits::type_singleton(); + } return Status::OK(); } }; @@ -604,10 +616,18 @@ std::string FixedSizeListType::ToString() const { std::string BinaryType::ToString() const { return "binary"; } +std::string BinaryViewType::ToString() const { + return raw_pointers_ ? "binary_view[RAW POINTERS]" : "binary_view"; +} + std::string LargeBinaryType::ToString() const { return "large_binary"; } std::string StringType::ToString() const { return "string"; } +std::string StringViewType::ToString() const { + return raw_pointers_ ? 
"string_view[RAW POINTERS]" : "string_view"; +} + std::string LargeStringType::ToString() const { return "large_string"; } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -2362,8 +2382,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat) PARAMETER_LESS_FINGERPRINT(Float) PARAMETER_LESS_FINGERPRINT(Double) PARAMETER_LESS_FINGERPRINT(Binary) +PARAMETER_LESS_FINGERPRINT(BinaryView) PARAMETER_LESS_FINGERPRINT(LargeBinary) PARAMETER_LESS_FINGERPRINT(String) +PARAMETER_LESS_FINGERPRINT(StringView) PARAMETER_LESS_FINGERPRINT(LargeString) PARAMETER_LESS_FINGERPRINT(Date32) PARAMETER_LESS_FINGERPRINT(Date64) @@ -2575,6 +2597,18 @@ TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) +const std::shared_ptr& utf8_view(bool has_raw_pointers) { + static std::shared_ptr io = std::make_shared(); + static std::shared_ptr raw = std::make_shared(true); + return has_raw_pointers ? raw : io; +} + +const std::shared_ptr& binary_view(bool has_raw_pointers) { + static std::shared_ptr io = std::make_shared(); + static std::shared_ptr raw = std::make_shared(true); + return has_raw_pointers ? raw : io; +} + std::shared_ptr fixed_size_binary(int32_t byte_width) { return std::make_shared(byte_width); } @@ -2829,7 +2863,7 @@ void InitStaticData() { // * Time32 // * Time64 // * Timestamp - g_primitive_types = {null(), boolean(), date32(), date64()}; + g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), utf8_view()}; Extend(g_numeric_types, &g_primitive_types); Extend(g_base_binary_types, &g_primitive_types); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 718540d449226..67459fba1dc2c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -113,8 +114,14 @@ struct ARROW_EXPORT DataTypeLayout { std::vector buffers; /// Whether this type expects an associated dictionary array. 
bool has_dictionary = false; + /// If this is provided, the number of buffers expected is only lower-bounded by + /// buffers.size(). Buffers beyond this lower bound are expected to conform to + /// variadic_spec. + std::optional variadic_spec; - explicit DataTypeLayout(std::vector v) : buffers(std::move(v)) {} + explicit DataTypeLayout(std::vector buffers, + std::optional variadic_spec = {}) + : buffers(std::move(buffers)), variadic_spec(variadic_spec) {} }; /// \brief Base class for all data types @@ -710,6 +717,84 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType { explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} }; +/// \brief Concrete type class for variable-size binary view data +class ARROW_EXPORT BinaryViewType : public DataType { + public: + static constexpr Type::type type_id = Type::BINARY_VIEW; + static constexpr bool is_utf8 = false; + using PhysicalType = BinaryViewType; + + static constexpr size_t kSize = 16; + static constexpr size_t kInlineSize = 12; + static constexpr size_t kPrefixSize = 4; + union alignas(int64_t) c_type { + struct { + uint32_t size; + std::array data; + } inlined; + + struct { + uint32_t size; + std::array prefix; + uint32_t buffer_index; + uint32_t offset; + } io; + + struct { + uint32_t size; + std::array prefix; + const uint8_t* data; + } raw; + + /// The number of characters viewed. + uint32_t size() const { + // Size is in the common initial subsequence of the members of the union, + // so accessing `inlined.size` is legal even if another member is active. + return inlined.size; + } + + /// True if the view's data is entirely stored inline. + bool is_inline() const { return size() <= kInlineSize; } + + /// Return a pointer to the inline data of a view. + /// + /// For inline views, this points to the entire data of the view. + /// For other views, this points to the 4 byte prefix. 
+ const uint8_t* inline_data() const& { + // `io` and `raw` have `prefix` in their common initial subsequence so it doesn't + // matter which we return here. Since they have the same address as `inlined.data`, + // the branch will be trivially optimized out. + return is_inline() ? inlined.data.data() : io.prefix.data(); + } + const uint8_t* inline_data() && = delete; + }; + static_assert(sizeof(c_type) == kSize); + + static constexpr const char* type_name() { return "binary_view"; } + + explicit BinaryViewType(bool has_raw_pointers = false) + : BinaryViewType(Type::BINARY_VIEW, has_raw_pointers) {} + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(kSize)}, + DataTypeLayout::VariableWidth()); + } + + std::string ToString() const override; + std::string name() const override { return "binary_view"; } + + bool has_raw_pointers() const { return raw_pointers_; } + + protected: + std::string ComputeFingerprint() const override; + + // Allow subclasses like StringType to change the logical type. 
+ explicit BinaryViewType(Type::type logical_type, bool has_raw_pointers) + : DataType(logical_type), raw_pointers_(has_raw_pointers) {} + + bool raw_pointers_ = false; +}; + /// \brief Concrete type class for large variable-size binary data class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { public: @@ -756,6 +841,25 @@ class ARROW_EXPORT StringType : public BinaryType { std::string ComputeFingerprint() const override; }; +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringViewType : public BinaryViewType { + public: + static constexpr Type::type type_id = Type::STRING_VIEW; + static constexpr bool is_utf8 = true; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "utf8_view"; } + + explicit StringViewType(bool has_raw_pointers = false) + : BinaryViewType(Type::STRING_VIEW, has_raw_pointers) {} + + std::string ToString() const override; + std::string name() const override { return "utf8_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for large variable-size string data, utf8-encoded class ARROW_EXPORT LargeStringType : public LargeBinaryType { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 499684719feab..88ecc45aaaa52 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -110,6 +110,11 @@ class BinaryArray; class BinaryBuilder; struct BinaryScalar; +class BinaryViewType; +class BinaryViewArray; +class BinaryViewBuilder; +struct BinaryViewScalar; + class LargeBinaryType; class LargeBinaryArray; class LargeBinaryBuilder; @@ -125,6 +130,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StringViewType; +class StringViewArray; +class StringViewBuilder; +struct StringViewScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -415,6 +425,13 @@ struct Type { /// Run-end encoded data. 
RUN_END_ENCODED = 38, + /// String (UTF8) view type with 4-byte prefix and inline small string + /// optimization + STRING_VIEW, + + /// Bytes view type with 4-byte prefix and inline small string optimization + BINARY_VIEW, + // Leave this at the end MAX_ID }; @@ -456,10 +473,14 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); +/// \brief Return a StringViewType instance +ARROW_EXPORT const std::shared_ptr& utf8_view(bool has_raw_pointers = false); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance ARROW_EXPORT const std::shared_ptr& binary(); +/// \brief Return a BinaryViewType instance +ARROW_EXPORT const std::shared_ptr& binary_view(bool has_raw_pointers = false); /// \brief Return a LargeBinaryType instance ARROW_EXPORT const std::shared_ptr& large_binary(); /// \brief Return a Date32Type instance diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index c55b33b4151e4..b4a624bb6d172 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1052,9 +1052,21 @@ TEST(TestBinaryType, ToString) { TEST(TestStringType, ToString) { StringType str; ASSERT_EQ(str.id(), Type::STRING); + ASSERT_EQ(str.name(), std::string("utf8")); + ASSERT_EQ(str.type_name(), std::string("utf8")); ASSERT_EQ(str.ToString(), std::string("string")); } +TEST(TestBinaryViewType, ToString) { + BinaryViewType t1; + BinaryViewType e1; + StringViewType t2; + AssertTypeEqual(t1, e1); + AssertTypeNotEqual(t1, t2); + ASSERT_EQ(t1.id(), Type::BINARY_VIEW); + ASSERT_EQ(t1.ToString(), std::string("binary_view")); +} + TEST(TestLargeBinaryTypes, ToString) { BinaryType bt1; LargeBinaryType t1; diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index ac16afe4b8cd8..de328f322ad5f 100644 --- a/cpp/src/arrow/type_traits.cc +++ 
b/cpp/src/arrow/type_traits.cc @@ -88,6 +88,8 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::DURATION: case Type::INTERVAL_MONTH_DAY_NANO: // Stored as two 32-bit integers and a 64-bit // integer + case Type::STRING_VIEW: + case Type::BINARY_VIEW: return 8; case Type::DICTIONARY: case Type::EXTENSION: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index fdceca00a3f39..9ff4a858ec997 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -341,6 +341,15 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return binary(); } }; +template <> +struct TypeTraits { + using ArrayType = BinaryViewArray; + using BuilderType = BinaryViewBuilder; + using ScalarType = BinaryViewScalar; + using CType = BinaryViewType::c_type; + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = LargeBinaryArray; @@ -371,6 +380,15 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return utf8(); } }; +template <> +struct TypeTraits { + using ArrayType = StringViewArray; + using BuilderType = StringViewBuilder; + using ScalarType = StringViewScalar; + using CType = BinaryViewType::c_type; + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = LargeStringArray; @@ -399,6 +417,11 @@ struct CTypeTraits : public TypeTraits { using ArrowType = StringType; }; +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = BinaryViewType; +}; + template <> struct CTypeTraits : public CTypeTraits {}; @@ -614,9 +637,28 @@ using is_string_type = template using enable_if_string = enable_if_t::value, R>; +template +using is_binary_view_like_type = std::is_base_of; + +template +using is_binary_view_type = std::is_same; + +template +using is_string_view_type = std::is_same; + +template +using enable_if_binary_view_like = enable_if_t::value, R>; + +template +using 
enable_if_binary_view = enable_if_t::value, R>; + +template +using enable_if_string_view = enable_if_t::value, R>; + template using is_string_like_type = - std::integral_constant::value && T::is_utf8>; + std::integral_constant::value && T::is_utf8) || + is_string_view_type::value>; template using enable_if_string_like = enable_if_t::value, R>; @@ -639,10 +681,9 @@ template using enable_if_fixed_width_type = enable_if_t::value, R>; template -using is_binary_like_type = - std::integral_constant::value && - !is_string_like_type::value) || - is_fixed_size_binary_type::value>; +using is_binary_like_type = std::integral_constant< + bool, (is_base_binary_type::value && !is_string_like_type::value) || + is_binary_view_type::value || is_fixed_size_binary_type::value>; template using enable_if_binary_like = enable_if_t::value, R>; @@ -801,8 +842,10 @@ using enable_if_has_c_type = enable_if_t::value, R>; template using has_string_view = std::integral_constant::value || + std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || std::is_same::value>; diff --git a/cpp/src/arrow/util/binary_view_util.h b/cpp/src/arrow/util/binary_view_util.h new file mode 100644 index 0000000000000..1e14eb11fad1f --- /dev/null +++ b/cpp/src/arrow/util/binary_view_util.h @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/assume_aligned.h" +#include "arrow/type.h" + +namespace arrow::util { + +inline BinaryViewType::c_type ToInlineBinaryView(const void* data, uint32_t size) { + // Small string: inlined. Bytes beyond size are zeroed + BinaryViewType::c_type out; + out.inlined = {size, {}}; + memcpy(&out.inlined.data, data, size); + return out; +} + +inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) { + return ToInlineBinaryView(v.data(), static_cast(v.size())); +} + +inline BinaryViewType::c_type ToRawPointerBinaryView(const void* data, uint32_t size) { + if (size <= BinaryViewType::kInlineSize) { + return ToInlineBinaryView(data, size); + } + + // Large string: store pointer. + BinaryViewType::c_type out; + out.raw = {size, {}, static_cast(data)}; + memcpy(&out.raw.prefix, data, sizeof(out.raw.prefix)); + return out; +} + +inline BinaryViewType::c_type ToRawPointerBinaryView(std::string_view v) { + return ToRawPointerBinaryView(v.data(), static_cast(v.size())); +} + +inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, uint32_t size, + uint32_t buffer_index, + uint32_t offset) { + if (size <= BinaryViewType::kInlineSize) { + return ToInlineBinaryView(data, size); + } + + // Large string: store index/offset. 
+ BinaryViewType::c_type out; + out.io = {size, {}, buffer_index, offset}; + memcpy(&out.raw.prefix, data, sizeof(out.raw.prefix)); + return out; +} + +inline BinaryViewType::c_type ToIndexOffsetBinaryView(std::string_view v, + uint32_t buffer_index, + uint32_t offset) { + return ToIndexOffsetBinaryView(v.data(), static_cast(v.size()), buffer_index, + offset); +} + +inline std::string_view FromRawPointerBinaryView(const BinaryViewType::c_type& v) { + auto* data = v.is_inline() ? v.inlined.data.data() : v.raw.data; + return {reinterpret_cast(data), v.size()}; +} +inline std::string_view FromRawPointerBinaryView(BinaryViewType::c_type&&) = delete; + +template +std::string_view FromIndexOffsetBinaryView(const BinaryViewType::c_type& v, + const BufferPtr* data_buffers) { + auto* data = v.is_inline() ? v.inlined.data.data() + : data_buffers[v.io.buffer_index]->data() + v.io.offset; + return {reinterpret_cast(data), v.size()}; +} +template +std::string_view FromIndexOffsetBinaryView(BinaryViewType::c_type&&, + const BufferPtr*) = delete; + +inline int CompareLongRawPointerBinaryViewSkipPrefixImpl(uint32_t size, + BinaryViewType::c_type l, + BinaryViewType::c_type r) { + return memcmp(l.raw.data + BinaryViewType::kPrefixSize, // + r.raw.data + BinaryViewType::kPrefixSize, // + size - BinaryViewType::kPrefixSize); +} + +template +inline int CompareLongIndexOffsetBinaryViewSkipPrefixImpl(uint32_t size, + BinaryViewType::c_type l, + BinaryViewType::c_type r, + const BufferPtr* l_buffers, + const BufferPtr* r_buffers) { + return memcmp( + l_buffers[l.io.buffer_index]->data() + l.io.offset + BinaryViewType::kPrefixSize, + r_buffers[r.io.buffer_index]->data() + r.io.offset + BinaryViewType::kPrefixSize, + size - BinaryViewType::kPrefixSize); +} + +template +int CompareBinaryViewImpl(BinaryViewType::c_type l, BinaryViewType::c_type r, + const BufferPtr*... 
buffers) { + int32_t l_prefix, r_prefix; + memcpy(&l_prefix, AssumeAlignedAs(l.inline_data()), sizeof(l_prefix)); + memcpy(&r_prefix, AssumeAlignedAs(r.inline_data()), sizeof(r_prefix)); + + if (l_prefix != r_prefix) { + // The result is decided on prefix. The shorter will be less + // because the prefix is padded with zeros. + // TODO(bkietz): ByteSwap(l_prefix) - ByteSwap(r_prefix) should be + // equivalent and faster than memcmp + return memcmp(l.inline_data(), r.inline_data(), BinaryViewType::kPrefixSize); + } + + uint32_t size = std::min(l.size(), r.size()); + if (size <= BinaryViewType::kPrefixSize) { + // One string is entirely contained in the prefix. + return l.size() - r.size(); + } + + if (size <= BinaryViewType::kInlineSize && l.is_inline() && r.is_inline()) { + // TODO(bkietz): The same word subtraction trick should be usable here + int result = memcmp(l.inline_data() + BinaryViewType::kPrefixSize, + r.inline_data() + BinaryViewType::kPrefixSize, + size - BinaryViewType::kPrefixSize); + return result != 0 ? result : l.size() - r.size(); + } + + int result; + if constexpr (sizeof...(buffers) == 0) { + result = CompareLongRawPointerBinaryViewSkipPrefixImpl(size, l, r); + } else { + static_assert(sizeof...(buffers) == 2); + result = CompareLongIndexOffsetBinaryViewSkipPrefixImpl(size, l, r, buffers...); + } + return result != 0 ? result : l.size() - r.size(); +} + +template +bool EqualBinaryViewImpl(BinaryViewType::c_type l, BinaryViewType::c_type r, + const BufferPtr*... buffers) { + int64_t l_size_and_prefix, r_size_and_prefix; + memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix)); + memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix)); + + if (l_size_and_prefix != r_size_and_prefix) return false; + + if (l.is_inline()) { + // The inline part is zeroed at construction, so we can compare + // a word at a time if data extends past 'prefix_'. 
+ int64_t l_inlined, r_inlined; + memcpy(&l_inlined, + AssumeAlignedAs(l.inline_data() + BinaryViewType::kPrefixSize), + sizeof(l_size_and_prefix)); + memcpy(&r_inlined, + AssumeAlignedAs(r.inline_data() + BinaryViewType::kPrefixSize), + sizeof(r_size_and_prefix)); + return l_inlined == r_inlined; + } + + // Sizes are equal and this is not inline, therefore both are out + // of line and have kPrefixSize first in common. + if constexpr (sizeof...(buffers) == 0) { + return CompareLongRawPointerBinaryViewSkipPrefixImpl(l.size(), l, r) == 0; + } else { + static_assert(sizeof...(buffers) == 2); + return CompareLongIndexOffsetBinaryViewSkipPrefixImpl(l.size(), l, r, buffers...) == + 0; + } +} + +inline int CompareRawPointerBinaryView(BinaryViewType::c_type l, + BinaryViewType::c_type r) { + return CompareBinaryViewImpl(l, r); +} + +template +int CompareIndexOffsetBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, + const BufferPtr* l_buffers, const BufferPtr* r_buffers) { + return CompareBinaryViewImpl(l, r, l_buffers, r_buffers); +} + +inline bool EqualRawPointerBinaryView(BinaryViewType::c_type l, + BinaryViewType::c_type r) { + return EqualBinaryViewImpl(l, r); +} + +template +bool EqualIndexOffsetBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, + const BufferPtr* l_buffers, const BufferPtr* r_buffers) { + return EqualBinaryViewImpl(l, r, l_buffers, r_buffers); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/sort.h b/cpp/src/arrow/util/sort.h index cdffe0b2317e5..6b01998ab59fc 100644 --- a/cpp/src/arrow/util/sort.h +++ b/cpp/src/arrow/util/sort.h @@ -24,18 +24,24 @@ #include #include -namespace arrow { -namespace internal { +#include "arrow/util/span.h" -template > -std::vector ArgSort(const std::vector& values, Cmp&& cmp = {}) { - std::vector indices(values.size()); +namespace arrow::internal { + +template > +std::vector ArgSort(arrow::util::span values, Cmp&& cmp = {}) { + std::vector indices(values.size()); 
std::iota(indices.begin(), indices.end(), 0); std::sort(indices.begin(), indices.end(), - [&](int64_t i, int64_t j) -> bool { return cmp(values[i], values[j]); }); + [&](I i, I j) -> bool { return cmp(values[i], values[j]); }); return indices; } +template +std::vector ArgSort(const Range& values, Cmp&&... cmp) { + return ArgSort(arrow::util::span{values}, std::forward(cmp)...); +} + template size_t Permute(const std::vector& indices, std::vector* values) { if (indices.size() <= 1) { @@ -74,5 +80,4 @@ size_t Permute(const std::vector& indices, std::vector* values) { return cycle_count; } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc index 2055b4f47ea22..07dfc2ce79f28 100644 --- a/cpp/src/arrow/util/string.cc +++ b/cpp/src/arrow/util/string.cc @@ -90,6 +90,16 @@ Status ParseHexValue(const char* data, uint8_t* out) { return Status::OK(); } +Status ParseHexValues(std::string_view hex_string, uint8_t* out) { + if (hex_string.size() % 2 != 0) { + return Status::Invalid("Expected base16 hex string"); + } + for (size_t j = 0; j < hex_string.size() / 2; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_string.data() + j * 2, out + j)); + } + return Status::OK(); +} + namespace internal { std::vector SplitString(std::string_view v, char delimiter, diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index d9777efc56a8c..d7e377773f62f 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -46,7 +46,9 @@ ARROW_EXPORT std::string HexEncode(std::string_view str); ARROW_EXPORT std::string Escape(std::string_view str); -ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out); +ARROW_EXPORT Status ParseHexValue(const char* hex_pair, uint8_t* out); + +ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out); namespace internal { diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 
6a9b32d73a635..b5c83db8b0a79 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -23,6 +23,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" @@ -144,6 +145,58 @@ struct ArraySpanInlineVisitor> { } }; +// BinaryView, StringView... +template +struct ArraySpanInlineVisitor> { + using c_type = std::string_view; + + static const std::shared_ptr* GetCharBuffers(const ArraySpan& arr) { + return reinterpret_cast*>(arr.buffers[2].data); + } + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return Status::OK(); + } + auto* s = arr.GetValues(1); + if (checked_cast(arr.type)->has_raw_pointers()) { + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { return valid_func(util::FromRawPointerBinaryView(s[index])); }, + [&]() { return null_func(); }); + } else { + auto* char_buffers = GetCharBuffers(arr); + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { return valid_func(util::FromIndexOffsetBinaryView(s[index], char_buffers)); }, + [&]() { return null_func(); }); + } + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return; + } + auto* s = arr.GetValues(1); + if (checked_cast(arr.type)->has_raw_pointers()) { + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { valid_func(util::FromRawPointerBinaryView(s[index])); }, + std::forward(null_func)); + } else { + auto* char_buffers = GetCharBuffers(arr); + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { valid_func(util::FromIndexOffsetBinaryView(s[index], char_buffers)); 
}, + std::forward(null_func)); + } + } +}; + // FixedSizeBinary, Decimal128 template struct ArraySpanInlineVisitor> { @@ -273,9 +326,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index ed3d5bc2c68d7..e057f6b12fb1b 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array) ARRAY_VISITOR_DEFAULT(HalfFloatArray) ARRAY_VISITOR_DEFAULT(FloatArray) ARRAY_VISITOR_DEFAULT(DoubleArray) -ARRAY_VISITOR_DEFAULT(BinaryArray) ARRAY_VISITOR_DEFAULT(StringArray) +ARRAY_VISITOR_DEFAULT(StringViewArray) +ARRAY_VISITOR_DEFAULT(BinaryArray) +ARRAY_VISITOR_DEFAULT(BinaryViewArray) ARRAY_VISITOR_DEFAULT(LargeBinaryArray) ARRAY_VISITOR_DEFAULT(LargeStringArray) ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) @@ -96,7 +98,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType) TYPE_VISITOR_DEFAULT(FloatType) TYPE_VISITOR_DEFAULT(DoubleType) TYPE_VISITOR_DEFAULT(StringType) +TYPE_VISITOR_DEFAULT(StringViewType) TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(BinaryViewType) TYPE_VISITOR_DEFAULT(LargeStringType) TYPE_VISITOR_DEFAULT(LargeBinaryType) TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) @@ -147,7 +151,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar) SCALAR_VISITOR_DEFAULT(FloatScalar) SCALAR_VISITOR_DEFAULT(DoubleScalar) SCALAR_VISITOR_DEFAULT(StringScalar) +SCALAR_VISITOR_DEFAULT(StringViewScalar) SCALAR_VISITOR_DEFAULT(BinaryScalar) 
+SCALAR_VISITOR_DEFAULT(BinaryViewScalar) SCALAR_VISITOR_DEFAULT(LargeStringScalar) SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index b22d4d3c567e1..650b0e7ee0a30 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const FloatArray& array); virtual Status Visit(const DoubleArray& array); virtual Status Visit(const StringArray& array); + virtual Status Visit(const StringViewArray& array); virtual Status Visit(const BinaryArray& array); + virtual Status Visit(const BinaryViewArray& array); virtual Status Visit(const LargeStringArray& array); virtual Status Visit(const LargeBinaryArray& array); virtual Status Visit(const FixedSizeBinaryArray& array); @@ -94,7 +96,9 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const FloatType& type); virtual Status Visit(const DoubleType& type); virtual Status Visit(const StringType& type); + virtual Status Visit(const StringViewType& type); virtual Status Visit(const BinaryType& type); + virtual Status Visit(const BinaryViewType& type); virtual Status Visit(const LargeStringType& type); virtual Status Visit(const LargeBinaryType& type); virtual Status Visit(const FixedSizeBinaryType& type); @@ -143,7 +147,9 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const FloatScalar& scalar); virtual Status Visit(const DoubleScalar& scalar); virtual Status Visit(const StringScalar& scalar); + virtual Status Visit(const StringViewScalar& scalar); virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const BinaryViewScalar& scalar); virtual Status Visit(const LargeStringScalar& scalar); virtual Status Visit(const LargeBinaryScalar& scalar); virtual Status Visit(const FixedSizeBinaryScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 8f6b176ba8fea..4b57abe53ff14 
100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -40,7 +40,9 @@ namespace arrow { ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ + ACTION(StringView); \ ACTION(Binary); \ + ACTION(BinaryView); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ diff --git a/cpp/src/generated/File_generated.h b/cpp/src/generated/File_generated.h index 5b219f1eb0ed1..06953c4a04044 100644 --- a/cpp/src/generated/File_generated.h +++ b/cpp/src/generated/File_generated.h @@ -26,18 +26,15 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Block FLATBUFFERS_FINAL_CLASS { int64_t bodyLength_; public: - Block() - : offset_(0), - metaDataLength_(0), - padding0__(0), - bodyLength_(0) { - (void)padding0__; + Block() { + memset(static_cast(this), 0, sizeof(Block)); } Block(int64_t _offset, int32_t _metaDataLength, int64_t _bodyLength) : offset_(flatbuffers::EndianScalar(_offset)), metaDataLength_(flatbuffers::EndianScalar(_metaDataLength)), padding0__(0), bodyLength_(flatbuffers::EndianScalar(_bodyLength)) { + (void)padding0__; } /// Index to the start of the RecordBlock (note this is past the Message header) int64_t offset() const { @@ -122,6 +119,7 @@ struct FooterBuilder { : fbb_(_fbb) { start_ = fbb_.StartTable(); } + FooterBuilder &operator=(const FooterBuilder &); flatbuffers::Offset