diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index f27d95c4e8cd7..a23b94da08a56 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -182,7 +182,7 @@ jobs: if: github.event.comment.body == 'take' runs-on: ubuntu-latest steps: - - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 78b01b561f3cb..10b33c96d2129 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -53,7 +53,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -64,7 +64,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -75,7 +75,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: debug: true github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/issue_bot.yml b/.github/workflows/issue_bot.yml index 86d1858c8c596..02379a379ffce 100644 --- a/.github/workflows/issue_bot.yml +++ b/.github/workflows/issue_bot.yml @@ -33,7 +33,7 @@ jobs: if: github.event.issue.pull_request == null runs-on: ubuntu-latest steps: - - uses: 
actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | let split_body = context.payload.issue.body.split('### Component(s)'); diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 69adc184b7fe7..a29a859f850a0 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -48,7 +48,6 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: - ubuntu: name: AMD64 Ubuntu 22.04 Java JDK ${{ matrix.jdk }} Maven ${{ matrix.maven }} runs-on: ubuntu-latest diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 76b10b828ee49..455e874cd4082 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -48,7 +48,6 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: - docker: name: AMD64 manylinux2014 Java JNI runs-on: ubuntu-latest diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 596d3511a543d..27046d77c7799 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -40,7 +40,7 @@ jobs: - name: 'Download PR review payload' id: 'download' if: github.event_name == 'workflow_run' - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | const run_id = "${{ github.event.workflow_run.id }}"; diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index b5a38f01412d4..a5a012ad2c5c4 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -43,6 +43,12 @@ fi # Get more detailed context on crashes export PYTHONFAULTHANDLER=1 +# Due to how Go reads environment variables, we have to set them from the calling +# process, or they would get ignored. 
+# (see https://forum.golangbridge.org/t/are-godebug-and-other-env-vars-ignored-when-loading-a-go-dll-from-foreign-code/33694) +export GOMEMLIMIT=200MiB +export GODEBUG=gctrace=1,clobberfree=1 + # Rust can be enabled by exporting ARCHERY_INTEGRATION_WITH_RUST=1 time archery integration \ --run-c-data \ diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 66ea8d677d187..77dd1ccdafa09 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -77,20 +77,20 @@ mvn="${mvn} -T 2C" pushd ${source_dir} -${mvn} install - if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" == "ON" ]; then - ${mvn} -Pshade-flatbuffers install + mvn="${mvn} -Pshade-flatbuffers" fi if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then - ${mvn} -Darrow.c.jni.dist.dir=${java_jni_dist_dir} -Parrow-c-data install + mvn="${mvn} -Darrow.c.jni.dist.dir=${java_jni_dist_dir} -Parrow-c-data" fi if [ "${ARROW_JAVA_JNI}" = "ON" ]; then - ${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni install + mvn="${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni" fi +${mvn} install + if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 mkdir -p ${build_dir}/docs/java/reference diff --git a/ci/scripts/rust_build.sh b/ci/scripts/rust_build.sh index 2dfc0f1b1892d..5fc21d454b080 100755 --- a/ci/scripts/rust_build.sh +++ b/ci/scripts/rust_build.sh @@ -21,6 +21,7 @@ set -e arrow_dir=${1} source_dir=${1}/rust +build_dir=${2}/rust # This file is used to build the rust binaries needed for the archery # integration tests. 
Testing of the rust implementation in normal CI is handled @@ -54,7 +55,7 @@ rustup show pushd ${source_dir} # build only the integration testing binaries -cargo build -p arrow-integration-testing +cargo build -p arrow-integration-testing --target-dir ${build_dir} # Save disk space by removing large temporary build products rm -rf target/debug/deps diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 24e8eefad1523..fde30588a659c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -229,6 +229,7 @@ set(ARROW_SRCS util/hashing.cc util/int_util.cc util/io_util.cc + util/list_util.cc util/logging.cc util/key_value_metadata.cc util/memory.cc @@ -790,6 +791,7 @@ add_arrow_test(array_test array/array_binary_test.cc array/array_dict_test.cc array/array_list_test.cc + array/array_list_view_test.cc array/array_run_end_test.cc array/array_struct_test.cc array/array_union_test.cc diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index eab71de27b11a..b483ec420cc3c 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -95,7 +95,7 @@ struct ScalarFromArraySlotImpl { Status Visit(const MonthDayNanoIntervalArray& a) { return Finish(a.Value(index_)); } template - Status Visit(const BaseListArray& a) { + Status Visit(const VarLengthListLikeArray& a) { return Finish(a.value_slice(index_)); } diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index a3a2f99851b55..0b591d401804d 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -41,10 +41,11 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -using ListTypes = ::testing::Types; +using ListAndListViewTypes = + ::testing::Types; // ---------------------------------------------------------------------- -// List tests +// List and ListView tests template class TestListArray : public 
::testing::Test { @@ -57,7 +58,9 @@ class TestListArray : public ::testing::Test { using OffsetArrayType = typename TypeTraits::OffsetArrayType; using OffsetBuilderType = typename TypeTraits::OffsetBuilderType; - void SetUp() { + static constexpr bool kTypeClassIsListView = is_list_view_type::value; + + void SetUp() override { value_type_ = int16(); type_ = std::make_shared(value_type_); @@ -72,8 +75,10 @@ class TestListArray : public ::testing::Test { result_ = std::dynamic_pointer_cast(out); } - void ValidateBasicListArray(const ArrayType* result, const std::vector& values, - const std::vector& is_valid) { + private: + void DoValidateBasicListArray(const ArrayType* result, + const std::vector& values, + const std::vector& is_valid) { ASSERT_OK(result->ValidateFull()); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); @@ -108,6 +113,58 @@ class TestListArray : public ::testing::Test { result_->raw_value_offsets()[result->length()]); } + void DoValidateBasicListViewArray(const ArrayType* result, + const std::vector& values, + const std::vector& is_valid) { + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(1, result->null_count()); + ASSERT_EQ(0, result->values()->null_count()); + + ASSERT_EQ(3, result->length()); + std::vector ex_offsets = {0, 3, 3}; + std::vector ex_sizes = {3, 0}; + for (size_t i = 0; i < ex_sizes.size(); ++i) { + ASSERT_EQ(ex_offsets[i], result->value_offset(i)); + ASSERT_EQ(ex_sizes[i], result->value_length(i)); + } + ASSERT_EQ(ex_offsets[ex_sizes.size()], result->value_offset(ex_sizes.size())); + + for (int i = 0; i < result->length(); ++i) { + ASSERT_EQ(is_valid[i] == 0, result->IsNull(i)); + } + + ASSERT_EQ(7, result->values()->length()); + auto varr = std::dynamic_pointer_cast(result->values()); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i], varr->Value(i)); + } + + auto offsets = std::dynamic_pointer_cast(result->offsets()); + auto sizes = std::dynamic_pointer_cast(result->sizes()); + 
ASSERT_EQ(offsets->length(), result->length()); + ASSERT_EQ(offsets->null_count(), 0); + AssertTypeEqual(*offsets->type(), OffsetType()); + ASSERT_EQ(sizes->length(), result->length()); + ASSERT_EQ(sizes->null_count(), 0); + AssertTypeEqual(*sizes->type(), OffsetType()); + + for (int64_t i = 0; i < result->length(); ++i) { + ASSERT_EQ(offsets->Value(i), result_->raw_value_offsets()[i]); + ASSERT_EQ(sizes->Value(i), result_->raw_value_sizes()[i]); + } + } + + void ValidateBasicListArray(const ArrayType* result, const std::vector& values, + const std::vector& is_valid) { + if constexpr (kTypeClassIsListView) { + return DoValidateBasicListViewArray(result, values, is_valid); + } else { + return DoValidateBasicListArray(result, values, is_valid); + } + } + + public: void TestBasics() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector lengths = {3, 0, 4}; @@ -120,7 +177,7 @@ class TestListArray : public ::testing::Test { int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { - ASSERT_OK(builder_->Append(is_valid[i] > 0)); + ASSERT_OK(builder_->Append(is_valid[i] > 0, lengths[i])); for (int j = 0; j < lengths[i]; ++j) { ASSERT_OK(vb->Append(values[pos++])); } @@ -133,25 +190,29 @@ class TestListArray : public ::testing::Test { void TestEquality() { auto vb = checked_cast(builder_->value_builder()); - std::shared_ptr array, equal_array, unequal_array; + std::shared_ptr array, equal_array; std::vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; + std::vector equal_sizes = {1, 1, 3, 1, 1, 1, 2, 0}; std::vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; + + std::shared_ptr unequal_array; std::vector unequal_offsets = {0, 1, 4, 7}; + std::vector unequal_sizes = {1, 3, 3, 0}; std::vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; - // setup two equal arrays - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_sizes.data(), + equal_offsets.size())); 
ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); - ASSERT_OK(builder_->Finish(&array)); - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); - ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_sizes.data(), + equal_offsets.size())); + ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); ASSERT_OK(builder_->Finish(&equal_array)); - // now an unequal one - ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size())); - ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); + ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_sizes.data(), + unequal_offsets.size())); + ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); ASSERT_OK(builder_->Finish(&unequal_array)); // Test array equality @@ -197,16 +258,37 @@ class TestListArray : public ::testing::Test { EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset))); } - void TestFromArraysWithNullBitMap() { - std::shared_ptr offsets_w_nulls, offsets_wo_nulls, values; + private: + Result> FromArrays(const Array& offsets, const Array& sizes, + const Array& values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { + if constexpr (kTypeClassIsListView) { + return ArrayType::FromArrays(offsets, sizes, values, pool_, null_bitmap, + null_count); + } else { + return ArrayType::FromArrays(offsets, values, pool_, null_bitmap, null_count); + } + } + + void TestFromArraysWithNullBitmap() { + std::shared_ptr offsets_w_nulls, offsets_wo_nulls; + std::shared_ptr sizes_w_nulls, sizes_wo_nulls; + std::shared_ptr values; std::vector offsets = {0, 1, 1, 3, 4}; + std::vector sizes = {1, 0, 2, 1}; std::vector offsets_w_nulls_is_valid = {true, false, true, true, true}; + std::vector sizes_w_nulls_is_valid = {true, false, true, true}; ArrayFromVector(offsets_w_nulls_is_valid, 
offsets, &offsets_w_nulls); ArrayFromVector(offsets, &offsets_wo_nulls); + ArrayFromVector(sizes_w_nulls_is_valid, sizes, + &sizes_w_nulls); + ArrayFromVector(sizes, &sizes_wo_nulls); + auto type = std::make_shared(int32()); auto expected = std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], null, [0, null], [0]]")); @@ -214,29 +296,41 @@ class TestListArray : public ::testing::Test { // Offsets with nulls will match. ASSERT_OK_AND_ASSIGN(auto result, - ArrayType::FromArrays(*offsets_w_nulls, *values, pool_)); + FromArrays(*offsets_w_nulls, *sizes_wo_nulls, *values)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Offets without nulls, will replace null with empty list - ASSERT_OK_AND_ASSIGN(result, - ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_)); + ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], [], [0, null], [0]]"))); // Specify non-null offsets with null_bitmap - ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_, - expected->null_bitmap())); + ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values, + expected->null_bitmap())); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Cannot specify both null offsets with null_bitmap - ASSERT_RAISES(Invalid, ArrayType::FromArrays(*offsets_w_nulls, *values, pool_, - expected->null_bitmap())); + ASSERT_RAISES(Invalid, FromArrays(*offsets_w_nulls, *sizes_wo_nulls, *values, + expected->null_bitmap())); + + if constexpr (kTypeClassIsListView) { + // Sizes with nulls will match. 
+ ASSERT_OK_AND_ASSIGN(auto result, + FromArrays(*offsets_wo_nulls, *sizes_w_nulls, *values)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Cannot specify both null sizes with null_bitmap + ASSERT_RAISES(Invalid, FromArrays(*offsets_wo_nulls, *sizes_w_nulls, *values, + expected->null_bitmap())); + } } - void TestFromArraysWithSlicedOffsets() { + template + std::enable_if_t TestFromArraysWithSlicedOffsets() { std::vector offsets = {-1, -1, 0, 1, 2, 4}; std::shared_ptr offsets_wo_nulls; @@ -261,7 +355,8 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result, *expected->Slice(1, 2)); } - void TestFromArraysWithSlicedNullOffsets() { + template + std::enable_if_t TestFromArraysWithSlicedNullOffsets() { std::vector offsets = {-1, -1, 0, 1, 1, 3}; std::vector offsets_w_nulls_is_valid = {true, true, true, false, true, true}; @@ -288,7 +383,17 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result, *expected->Slice(1, 2)); } - void TestFromArrays() { + public: + void TestFromArraysNullHandling() { + this->TestFromArraysWithNullBitmap(); + if constexpr (!kTypeClassIsListView) { + this->TestFromArraysWithSlicedOffsets(); + this->TestFromArraysWithSlicedNullOffsets(); + } + } + + private: + void DoTestListFromArrays() { std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; std::vector offsets_is_valid3 = {true, false, true, true}; @@ -373,6 +478,87 @@ class TestListArray : public ::testing::Test { } } + template + std::enable_if_t DoTestListViewFromArrays() { + std::shared_ptr offsets1, offsets2; + std::shared_ptr sizes1, sizes2, sizes3, sizes4, sizes5; + std::shared_ptr values; + + std::vector sizes_is_valid3 = {true, false, true, true}; + std::vector sizes_is_valid4 = {true, true, false, true}; + std::vector sizes_is_valid5 = {true, true, false, false}; + + std::vector values_is_valid = {true, false, true, true, true, true}; + + std::vector offset1_values = {2, 0, 2}; + 
std::vector offset2_values = {2, 0, 6}; + std::vector size1_values = {0, 2, 4}; + std::vector size2_values = {4, 2, 0}; + + std::vector values_values = {0, 1, 2, 3, 4, 5}; + const int length = 3; + + ArrayFromVector(offset1_values, &offsets1); + ArrayFromVector(offset2_values, &offsets2); + + ArrayFromVector(size1_values, &sizes1); + ArrayFromVector(size2_values, &sizes2); + ArrayFromVector(sizes_is_valid3, size1_values, &sizes3); + ArrayFromVector(sizes_is_valid4, size2_values, &sizes4); + ArrayFromVector(sizes_is_valid5, size2_values, &sizes5); + + ArrayFromVector(values_is_valid, values_values, &values); + + auto list_type = std::make_shared(int8()); + + ASSERT_OK_AND_ASSIGN(auto list_view1, + ArrayType::FromArrays(*offsets1, *sizes1, *values, pool_)); + ASSERT_OK_AND_ASSIGN(auto list_view3, + ArrayType::FromArrays(*offsets1, *sizes3, *values, pool_)); + ASSERT_OK_AND_ASSIGN(auto list_view4, + ArrayType::FromArrays(*offsets2, *sizes4, *values, pool_)); + ASSERT_OK(list_view1->ValidateFull()); + ASSERT_OK(list_view3->ValidateFull()); + ASSERT_OK(list_view4->ValidateFull()); + + ArrayType expected1(list_type, length, offsets1->data()->buffers[1], + sizes1->data()->buffers[1], values, offsets1->data()->buffers[0], + 0); + AssertArraysEqual(expected1, *list_view1); + + // Use null bitmap from sizes3, but clean sizes from non-null version + ArrayType expected3(list_type, length, offsets1->data()->buffers[1], + sizes1->data()->buffers[1], values, sizes3->data()->buffers[0], + 1); + AssertArraysEqual(expected3, *list_view3); + + ArrayType expected4(list_type, length, offsets2->data()->buffers[1], + sizes2->data()->buffers[1], values, sizes4->data()->buffers[0], + 1); + AssertArraysEqual(expected4, *list_view4); + + // Test failure modes + + std::shared_ptr tmp; + + // Zero-length offsets (not a failure mode for ListViews) + ASSERT_OK(ArrayType::FromArrays(*offsets1->Slice(0, 0), *sizes1->Slice(0, 0), *values, + pool_)); + + // Offsets not the right type + 
ASSERT_RAISES(TypeError, + ArrayType::FromArrays(/*offsets=*/*values, *sizes1, *values, pool_)); + } + + public: + void TestFromArrays() { + if constexpr (kTypeClassIsListView) { + DoTestListViewFromArrays(); + } else { + DoTestListFromArrays(); + } + } + void TestAppendNull() { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); @@ -420,11 +606,13 @@ class TestListArray : public ::testing::Test { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector is_valid = {1, 0, 1}; std::vector offsets = {0, 3, 3}; + std::vector sizes = {3, 0, 1}; Int16Builder* vb = checked_cast(builder_->value_builder()); ASSERT_OK(vb->Reserve(values.size())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); for (int16_t value : values) { ASSERT_OK(vb->Append(value)); } @@ -434,16 +622,17 @@ class TestListArray : public ::testing::Test { void TestBulkAppendInvalid() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; - std::vector lengths = {3, 0, 4}; std::vector is_valid = {1, 0, 1}; - // Should be {0, 3, 3} given the is_valid array std::vector offsets = {0, 2, 4}; + std::vector sizes = {2, 2, 4}; Int16Builder* vb = checked_cast(builder_->value_builder()); ASSERT_OK(vb->Reserve(values.size())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); for (int16_t value : values) { ASSERT_OK(vb->Append(value)); } @@ -466,7 +655,12 @@ class TestListArray : public ::testing::Test { builder_.reset(checked_cast(tmp.release())); std::vector offsets = {1, 2, 4, 8}; - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + 
std::vector sizes = {1, 2, 4}; + if constexpr (kTypeClassIsListView) { + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), sizes.size())); + } else { + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + } std::shared_ptr list_array; ASSERT_OK(builder_->Finish(&list_array)); @@ -485,10 +679,16 @@ class TestListArray : public ::testing::Test { void TestFlattenSimple() { auto type = std::make_shared(int32()); auto list_array = std::dynamic_pointer_cast( - ArrayFromJSON(type, "[[1, 2], [3], [4], null, [5], [], [6]]")); + ArrayFromJSON(type, "[[], null, [1, 2], [3], [4], null, [5], [], [6]]")); ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); ASSERT_OK(flattened->ValidateFull()); EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"))); + + list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[], [], [1, 2], [3], [4], [], [5], [], [6]]")); + ASSERT_OK_AND_ASSIGN(flattened, list_array->Flatten()); + ASSERT_OK(flattened->ValidateFull()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"))); } void TestFlattenNulls() { @@ -500,6 +700,35 @@ class TestListArray : public ::testing::Test { AssertTypeEqual(*flattened->type(), *value_type_); } + void TestFlattenAllEmpty() { + auto type = std::make_shared(int32()); + auto list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[], [], [], [], [], [], []]")); + ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + ASSERT_OK(flattened->ValidateFull()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[]"))); + + if constexpr (kTypeClassIsListView) { + auto list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[1, 2], [3], null, [5, 6], [7, 8], [], [9]]")); + auto array_data = list_array->data(); + + auto offsets = array_data->buffers[1]->template mutable_data_as(); + auto sizes = array_data->buffers[2]->template mutable_data_as(); + + // Set all sizes to 0, except the one for the null entry + 
memset(sizes, 0, sizeof(offset_type) * array_data->length); + sizes[2] = 4; + // Make the offset of the null entry be non-zero and out of order + offsets[2] = 1; + + ASSERT_OK(list_array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[]"))) + << flattened->ToString(); + } + } + void TestFlattenSliced() { auto type = std::make_shared(int32()); auto list_array = std::dynamic_pointer_cast( @@ -520,7 +749,7 @@ class TestListArray : public ::testing::Test { std::dynamic_pointer_cast( ArrayFromJSON(type, "[[1, 2], [3], null, [5, 6], [7, 8], [], [9]]")) ->data(); - ASSERT_EQ(2, array_data->buffers.size()); + ASSERT_EQ(kTypeClassIsListView ? 3 : 2, array_data->buffers.size()); auto null_bitmap_buffer = array_data->buffers[0]; ASSERT_NE(nullptr, null_bitmap_buffer); bit_util::ClearBit(null_bitmap_buffer->mutable_data(), 1); @@ -534,20 +763,47 @@ class TestListArray : public ::testing::Test { << flattened->ToString(); } - Status ValidateOffsets(int64_t length, std::vector offsets, - const std::shared_ptr& values, int64_t offset = 0) { + Status ValidateOffsetsAndSizes(int64_t length, std::vector offsets, + std::vector sizes, + std::shared_ptr values, int64_t offset = 0) { auto type = std::make_shared(values->type()); - ArrayType arr(type, length, Buffer::Wrap(offsets), values, + auto offsets_buffer = Buffer::Wrap(offsets.data(), sizes.size()); + auto sizes_buffer = Buffer::Wrap(sizes); + ArrayType arr(type, length, std::move(offsets_buffer), std::move(sizes_buffer), + std::move(values), /*null_bitmap=*/nullptr, /*null_count=*/0, offset); return arr.ValidateFull(); } - void TestValidateOffsets() { + Status ValidateOffsets(int64_t length, std::vector offsets, + std::shared_ptr values, int64_t offset = 0) { + if constexpr (kTypeClassIsListView) { + std::vector sizes; + // Always reserve some space so Buffer::Wrap doesn't create a null buffer + // when length of the sizes buffer is 0. 
+ sizes.reserve( + std::max(static_cast(1), offsets.empty() ? 0 : offsets.size() - 1)); + for (size_t i = 1; i < offsets.size(); ++i) { + sizes.push_back(offsets[i] - offsets[i - 1]); + } + return ValidateOffsetsAndSizes(length, std::move(offsets), std::move(sizes), + std::move(values), offset); + } else { + auto type = std::make_shared(values->type()); + ArrayType arr(type, length, Buffer::Wrap(offsets), std::move(values), + /*null_bitmap=*/nullptr, /*null_count=*/0, offset); + return arr.ValidateFull(); + } + } + + void TestValidateDimensions() { auto empty_values = ArrayFromJSON(int16(), "[]"); auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, 5, 6, 7]"); - // An empty list array can have omitted or 0-length offsets - ASSERT_OK(ValidateOffsets(0, {}, empty_values)); + if constexpr (!kTypeClassIsListView) { + // An empty list array can have omitted or 0-length offsets + ASSERT_OK(ValidateOffsets(0, {}, empty_values)); + } ASSERT_OK(ValidateOffsets(0, {0}, empty_values)); ASSERT_OK(ValidateOffsets(1, {0, 7}, values)); @@ -564,13 +820,24 @@ class TestListArray : public ::testing::Test { // Offset out of bounds ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8}, values)); - ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 1)); + if constexpr (kTypeClassIsListView) { + ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 2)); + } else { + ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 1)); + } // Negative offset ASSERT_RAISES(Invalid, ValidateOffsets(1, {-1, 0}, values)); ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, -1}, values)); - ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, -1, -1}, values, 1)); // Offsets non-monotonic ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, 7, 4}, values)); + + if constexpr (kTypeClassIsListView) { + // Out of order offsets + ASSERT_OK(ValidateOffsetsAndSizes(2, {4, 1, 2}, {3, 6, 5}, values)); + + // Sizes out of bounds + ASSERT_RAISES(Invalid, ValidateOffsetsAndSizes(2, {4, 1, 2}, {3, 7, 5}, values)); + 
} } void TestCornerCases() { @@ -581,7 +848,7 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result_, *expected); SetUp(); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 0)); Done(); expected = ArrayFromJSON(type_, "[[]]"); AssertArraysEqual(*result_, *expected); @@ -602,7 +869,7 @@ class TestListArray : public ::testing::Test { ASSERT_OK(builder_->ValidateOverflow(max_elements)); ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements + 1)); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); ASSERT_OK(vb->Append(1)); ASSERT_OK(vb->Append(2)); ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); @@ -612,7 +879,7 @@ class TestListArray : public ::testing::Test { ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 3)); ASSERT_OK(vb->Append(1)); ASSERT_OK(vb->Append(2)); ASSERT_OK(vb->Append(3)); @@ -629,7 +896,7 @@ class TestListArray : public ::testing::Test { std::shared_ptr result_; }; -TYPED_TEST_SUITE(TestListArray, ListTypes); +TYPED_TEST_SUITE(TestListArray, ListAndListViewTypes); TYPED_TEST(TestListArray, Basics) { this->TestBasics(); } @@ -639,11 +906,7 @@ TYPED_TEST(TestListArray, ValuesEquality) { this->TestValuesEquality(); } TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } -TYPED_TEST(TestListArray, FromArraysWithNullBitMap) { - this->TestFromArraysWithNullBitMap(); - this->TestFromArraysWithSlicedOffsets(); - this->TestFromArraysWithSlicedNullOffsets(); -} +TYPED_TEST(TestListArray, FromArraysNullHandling) { this->TestFromArraysNullHandling(); } TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } @@ -661,12 +924,13 @@ TYPED_TEST(TestListArray, BuilderPreserveFieldName) { TYPED_TEST(TestListArray, FlattenSimple) { this->TestFlattenSimple(); } 
TYPED_TEST(TestListArray, FlattenNulls) { this->TestFlattenNulls(); } +TYPED_TEST(TestListArray, FlattenAllEmpty) { this->TestFlattenAllEmpty(); } TYPED_TEST(TestListArray, FlattenZeroLength) { this->TestFlattenZeroLength(); } TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) { this->TestFlattenNonEmptyBackingNulls(); } -TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); } +TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); } TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } @@ -676,6 +940,82 @@ TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck( TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } #endif +class TestListConversions : public ::testing::Test { + private: + MemoryPool* pool_; + + public: + TestListConversions() : pool_(default_memory_pool()) {} + + template + void DoTestListViewFromList() { + using DestListViewArrayClass = typename TypeTraits::ArrayType; + using SrcListArrayClass = typename TypeTraits::ArrayType; + auto list_type = std::make_shared(int32()); + auto list_view_type = std::make_shared(int32()); + + auto expected_list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_view_wo_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + std::shared_ptr list_w_nulls = + ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, DestListViewArrayClass::FromList( + *checked_pointer_cast(list_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, DestListViewArrayClass::FromList( + *checked_pointer_cast(list_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_wo_nulls, *result, 
/*verbose=*/true); + } + + template + void DoTestListFromListView() { + using SrcListViewArrayClass = typename TypeTraits::ArrayType; + using DestListArrayClass = typename TypeTraits::ArrayType; + auto list_view_type = std::make_shared(int32()); + auto list_type = std::make_shared(int32()); + + auto list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto list_view_wo_nulls = ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + auto expected_list_w_nulls = ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, + DestListArrayClass::FromListView( + *checked_pointer_cast(list_view_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, + DestListArrayClass::FromListView( + *checked_pointer_cast(list_view_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_wo_nulls, *result, /*verbose=*/true); + } +}; + +TEST_F(TestListConversions, ListViewFromList) { + this->DoTestListViewFromList(); + this->DoTestListViewFromList(); +} + +TEST_F(TestListConversions, ListFromListView) { + this->DoTestListFromListView(); + this->DoTestListFromListView(); +} + // ---------------------------------------------------------------------- // Map tests diff --git a/cpp/src/arrow/array/array_list_view_test.cc b/cpp/src/arrow/array/array_list_view_test.cc new file mode 100644 index 0000000000000..3e48191cedded --- /dev/null +++ b/cpp/src/arrow/array/array_list_view_test.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/util.h" +#include "arrow/pretty_print.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// List-view array tests + +namespace { + +class TestListViewArray : public ::testing::Test { + public: + std::shared_ptr string_values; + std::shared_ptr int32_values; + std::shared_ptr int16_values; + + void SetUp() override { + string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + } + + static std::shared_ptr Offsets(std::string_view json) { + return ArrayFromJSON(int32(), json); + } + + static std::shared_ptr Sizes(std::string_view json) { + return ArrayFromJSON(int32(), json); + } +}; + +} // namespace + +TEST_F(TestListViewArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto list_view_array, + ListViewArray::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *string_values)); + auto array_data = list_view_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *list_view_array); + // Should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), 
array_data); + ASSERT_NE(std::dynamic_pointer_cast(new_array), NULLPTR); +} + +TEST_F(TestListViewArray, FromOffsetsAndSizes) { + std::shared_ptr list_view_array; + + ASSERT_OK_AND_ASSIGN(list_view_array, ListViewArray::FromArrays( + *Offsets("[0, 0, 1, 1000]"), + *Sizes("[2, 1, 1, null]"), *int32_values)); + ASSERT_EQ(list_view_array->length(), 4); + ASSERT_ARRAYS_EQUAL(*list_view_array->values(), *int32_values); + ASSERT_EQ(list_view_array->offset(), 0); + ASSERT_EQ(list_view_array->data()->GetNullCount(), 1); + ASSERT_EQ(list_view_array->data()->buffers.size(), 3); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index d8308c824953a..03f3e5af29908 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -27,6 +27,8 @@ #include "arrow/array/array_base.h" #include "arrow/array/array_primitive.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_nested.h" #include "arrow/array/concatenate.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -38,6 +40,7 @@ #include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/list_util.h" #include "arrow/util/logging.h" namespace arrow { @@ -48,7 +51,7 @@ using internal::checked_pointer_cast; using internal::CopyBitmap; // ---------------------------------------------------------------------- -// ListArray / LargeListArray (common utilities) +// ListArray / LargeListArray / ListViewArray / LargeListViewArray (common utilities) namespace { @@ -137,6 +140,77 @@ Result::ArrayType>> ListArrayFromArray return std::make_shared(std::move(data)); } +template +Result::ArrayType>> ListViewArrayFromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { + using offset_type = typename 
TYPE::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + if (offsets.type_id() != OffsetArrowType::type_id) { + return Status::TypeError("List offsets must be ", OffsetArrowType::type_name()); + } + + if (sizes.length() != offsets.length() && sizes.length() != offsets.length() - 1) { + return Status::Invalid( + "List sizes must have the same length as offsets or one less than offsets"); + } + if (sizes.type_id() != OffsetArrowType::type_id) { + return Status::TypeError("List sizes must be ", OffsetArrowType::type_name()); + } + + if (offsets.offset() != sizes.offset()) { + return Status::Invalid("List offsets and sizes must have the same offset"); + } + const int64_t array_offset = sizes.offset(); + + if (null_bitmap) { + if (offsets.null_count() > 0 || sizes.null_count() > 0) { + return Status::Invalid( + "Ambiguous to specify both validity map and offsets or sizes with nulls"); + } + if (array_offset != 0) { + return Status::Invalid( + "List offsets and sizes must not be slices if a validity map is specified"); + } + } else { + if (offsets.null_count() > 0 && sizes.null_count() > 0) { + return Status::Invalid("Ambiguous to specify both offsets and sizes with nulls"); + } + } + + DCHECK(offsets.length() == sizes.length() || offsets.length() - 1 == sizes.length()); + + using OffsetArrayType = typename TypeTraits::ArrayType; + const auto& typed_offsets = checked_cast(offsets); + const auto& typed_sizes = checked_cast(sizes); + + auto derived_validity_buffer = std::move(null_bitmap); + if (offsets.null_count() > 0) { + derived_validity_buffer = offsets.null_bitmap(); + null_count = offsets.null_count(); + // We allow construction from an offsets array containing one extra value. + // If that is the case, we might need to discount one null from null_count.
+ if (offsets.length() - 1 == sizes.length() && !offsets.IsValid(sizes.length())) { + null_count -= 1; + } + } else if (sizes.null_count() > 0) { + derived_validity_buffer = sizes.null_bitmap(); + null_count = sizes.null_count(); + } + + auto buffers = BufferVector({ + std::move(derived_validity_buffer), + typed_offsets.values(), + typed_sizes.values(), + }); + auto data = ArrayData::Make(type, sizes.length(), std::move(buffers), {values.data()}, + null_count, array_offset); + return std::make_shared(std::move(data)); +} + static std::shared_ptr SliceArrayWithOffsets(const Array& array, int64_t begin, int64_t end) { return array.Slice(begin, end - begin); @@ -189,23 +263,199 @@ Result> FlattenListArray(const ListArrayT& list_array, return Concatenate(non_null_fragments, memory_pool); } +template +Result> FlattenListViewArray(const ListViewArrayT& list_view_array, + MemoryPool* memory_pool) { + using offset_type = typename ListViewArrayT::offset_type; + const int64_t list_view_array_offset = list_view_array.offset(); + const int64_t list_view_array_length = list_view_array.length(); + std::shared_ptr value_array = list_view_array.values(); + + if (list_view_array_length == 0) { + return SliceArrayWithOffsets(*value_array, 0, 0); + } + + // If the list array is *all* nulls, then just return an empty array. + if constexpr (HasNulls) { + if (list_view_array.null_count() == list_view_array.length()) { + return MakeEmptyArray(value_array->type(), memory_pool); + } + } + + const auto* validity = list_view_array.data()->template GetValues(0, 0); + const auto* offsets = list_view_array.data()->template GetValues(1); + const auto* sizes = list_view_array.data()->template GetValues(2); + + auto is_null_or_empty = [&](int64_t i) { + if constexpr (HasNulls) { + if (!bit_util::GetBit(validity, list_view_array_offset + i)) { + return true; + } + } + return sizes[i] == 0; + }; + + // Index of the first valid, non-empty list-view. 
+ int64_t first_i = 0; + for (; first_i < list_view_array_length; first_i++) { + if (!is_null_or_empty(first_i)) { + break; + } + } + // If all list-views are empty, return an empty array. + if (first_i == list_view_array_length) { + return MakeEmptyArray(value_array->type(), memory_pool); + } + + std::vector> slices; + { + int64_t i = first_i; + auto begin_offset = offsets[i]; + auto end_offset = offsets[i] + sizes[i]; + i += 1; + // Inductive invariant: slices and the always non-empty values slice + // [begin_offset, end_offset) contains all the maximally contiguous slices of the + // values array that are covered by all the list-views before list-view i. + for (; i < list_view_array_length; i++) { + if (is_null_or_empty(i)) { + // The invariant is preserved by simply preserving the current set of slices. + } else { + if (offsets[i] == end_offset) { + end_offset += sizes[i]; + // The invariant is preserved because since the non-empty list-view i + // starts at end_offset, the current range can be extended to end at + // offsets[i] + sizes[i] (the same as end_offset + sizes[i]). + } else { + // The current slice can't be extended because the list-view i either + // shares values with the current slice or starts after the position + // immediately after the end of the current slice. + slices.push_back(SliceArrayWithOffsets(*value_array, begin_offset, end_offset)); + begin_offset = offsets[i]; + end_offset = offsets[i] + sizes[i]; + // The invariant is preserved because a maximally contiguous slice of + // the values array (i.e. one that can't be extended) was added to slices + // and [begin_offset, end_offset) is non-empty and contains the + // current list-view i. + } + } + } + slices.push_back(SliceArrayWithOffsets(*value_array, begin_offset, end_offset)); + } + + // Final attempt to avoid invoking Concatenate(). 
+ switch (slices.size()) { + case 0: + return MakeEmptyArray(value_array->type(), memory_pool); + case 1: + return slices[0]; + } + + return Concatenate(slices, memory_pool); +} + std::shared_ptr BoxOffsets(const std::shared_ptr& boxed_type, const ArrayData& data) { + const int64_t num_offsets = + is_list_view(data.type->id()) ? data.length : data.length + 1; std::vector> buffers = {nullptr, data.buffers[1]}; auto offsets_data = - std::make_shared(boxed_type, data.length + 1, std::move(buffers), + std::make_shared(boxed_type, /*length=*/num_offsets, std::move(buffers), /*null_count=*/0, data.offset); return MakeArray(offsets_data); } +std::shared_ptr BoxSizes(const std::shared_ptr& boxed_type, + const ArrayData& data) { + DCHECK(is_list_view(data.type->id())); + std::vector> buffers = {nullptr, data.buffers[2]}; + auto sizes_data = + std::make_shared(boxed_type, data.length, std::move(buffers), + /*null_count=*/0, data.offset); + return MakeArray(sizes_data); +} + +template +Result> ListViewFromListImpl( + const std::shared_ptr& list_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename SrcListType::offset_type; + const auto& list_type = checked_cast(*list_data->type); + + // To re-use the validity and offsets buffers, a sizes buffer with enough + // padding on the beginning is allocated and filled with the sizes after + // list_data->offset. + const int64_t buffer_length = list_data->offset + list_data->length; + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + AllocateBuffer(buffer_length * sizeof(offset_type), pool)); + const auto* offsets = list_data->template GetValues(1, 0); + auto* sizes = sizes_buffer->mutable_data_as(); + // Zero the initial padding area to avoid leaking any data when buffers are + // sent over IPC or through the C Data interface.
+ memset(sizes, 0, list_data->offset * sizeof(offset_type)); + for (int64_t i = list_data->offset; i < buffer_length; i++) { + sizes[i] = offsets[i + 1] - offsets[i]; + } + BufferVector buffers = {list_data->buffers[0], list_data->buffers[1], + std::move(sizes_buffer)}; + + return ArrayData::Make(std::make_shared(list_type.value_type()), + list_data->length, std::move(buffers), + {list_data->child_data[0]}, list_data->null_count, + list_data->offset); +} + +template +Result> ListFromListViewImpl( + const std::shared_ptr& list_view_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename DestListType::offset_type; + using ListBuilderType = typename TypeTraits::BuilderType; + + const auto& list_view_type = + checked_cast(*list_view_data->type); + const auto& value_type = list_view_type.value_type(); + const auto list_type = std::make_shared(value_type); + + ARROW_ASSIGN_OR_RAISE(auto sum_of_list_view_sizes, + list_util::internal::SumOfLogicalListSizes(*list_view_data)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr value_builder, + MakeBuilder(value_type, pool)); + RETURN_NOT_OK(value_builder->Reserve(sum_of_list_view_sizes)); + auto list_builder = std::make_shared(pool, value_builder, list_type); + RETURN_NOT_OK(list_builder->Reserve(list_view_data->length)); + + ArraySpan values{*list_view_data->child_data[0]}; + const auto* in_validity_bitmap = list_view_data->GetValues(0); + const auto* in_offsets = list_view_data->GetValues(1); + const auto* in_sizes = list_view_data->GetValues(2); + for (int64_t i = 0; i < list_view_data->length; ++i) { + const bool is_valid = + !in_validity_bitmap || + bit_util::GetBit(in_validity_bitmap, list_view_data->offset + i); + const int64_t size = is_valid ? 
in_sizes[i] : 0; + RETURN_NOT_OK(list_builder->Append(is_valid, size)); + RETURN_NOT_OK(value_builder->AppendArraySlice(values, in_offsets[i], size)); + } + std::shared_ptr list_array_data; + RETURN_NOT_OK(list_builder->FinishInternal(&list_array_data)); + return list_array_data; +} + } // namespace namespace internal { template -inline void SetListData(BaseListArray* self, const std::shared_ptr& data, +inline void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, Type::type expected_type_id) { - ARROW_CHECK_EQ(data->buffers.size(), 2); + ARROW_CHECK_EQ(data->buffers.size(), is_list_view(TYPE::type_id) ? 3 : 2); ARROW_CHECK_EQ(data->type->id(), expected_type_id); ARROW_CHECK_EQ(data->child_data.size(), 1); @@ -214,6 +464,7 @@ inline void SetListData(BaseListArray* self, const std::shared_ptrlist_type_ = checked_cast(data->type.get()); self->raw_value_offsets_ = data->GetValuesSafe(1, /*offset=*/0); + // BaseListViewArray::SetData takes care of setting raw_value_sizes_. 
ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); @@ -225,7 +476,9 @@ inline void SetListData(BaseListArray* self, const std::shared_ptr data) { SetData(std::move(data)); } +ListArray::ListArray(std::shared_ptr data) { + ListArray::SetData(std::move(data)); +} ListArray::ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, @@ -250,6 +503,13 @@ Result> ListArray::FromArrays( values, pool, null_bitmap, null_count); } +Result> ListArray::FromListView(const ListViewArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, (ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> ListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { @@ -273,7 +533,9 @@ std::shared_ptr ListArray::offsets() const { return BoxOffsets(int32(), * // ---------------------------------------------------------------------- // LargeListArray -LargeListArray::LargeListArray(const std::shared_ptr& data) { SetData(data); } +LargeListArray::LargeListArray(const std::shared_ptr& data) { + LargeListArray::SetData(data); +} LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -284,7 +546,7 @@ LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t le auto internal_data = ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); internal_data->child_data.emplace_back(values->data()); - SetData(internal_data); + LargeListArray::SetData(internal_data); } void LargeListArray::SetData(const std::shared_ptr& data) { @@ -299,6 +561,14 @@ Result> LargeListArray::FromArrays( null_count); } +Result> LargeListArray::FromListView( + const LargeListViewArray& source, MemoryPool* pool) { + 
ARROW_ASSIGN_OR_RAISE( + auto data, + (ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> LargeListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { @@ -321,6 +591,144 @@ std::shared_ptr LargeListArray::offsets() const { return BoxOffsets(int64(), *data_); } +// ---------------------------------------------------------------------- +// ListViewArray + +ListViewArray::ListViewArray(std::shared_ptr data) { + ListViewArray::SetData(std::move(data)); +} + +ListViewArray::ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, + std::shared_ptr values, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { + ListViewArray::SetData(ArrayData::Make( + std::move(type), length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)); +} + +void ListViewArray::SetData(const std::shared_ptr& data) { + internal::SetListData(this, data); + raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); +} + +Result> ListViewArray::FromArrays( + const Array& offsets, const Array& sizes, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { + return ListViewArrayFromArrays( + std::make_shared(values.type()), offsets, sizes, values, pool, + null_bitmap, null_count); +} + +Result> ListViewArray::FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, + int64_t null_count) { + if (type->id() != Type::LIST_VIEW) { + return Status::TypeError("Expected list-view type, got ", type->ToString()); + } + const auto& list_view_type = checked_cast(*type); + if (!list_view_type.value_type()->Equals(values.type())) { + return Status::TypeError("Mismatching list-view value 
type"); + } + return ListViewArrayFromArrays(std::move(type), offsets, sizes, values, + pool, null_bitmap, null_count); +} + +Result> ListViewArray::FromList(const ListArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, (ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> LargeListViewArray::FromList( + const LargeListArray& source, MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, + (ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> ListViewArray::Flatten(MemoryPool* memory_pool) const { + if (null_count() > 0) { + return FlattenListViewArray(*this, memory_pool); + } + return FlattenListViewArray(*this, memory_pool); +} + +std::shared_ptr ListViewArray::offsets() const { + return BoxOffsets(int32(), *data_); +} + +std::shared_ptr ListViewArray::sizes() const { return BoxSizes(int32(), *data_); } + +// ---------------------------------------------------------------------- +// LargeListViewArray + +LargeListViewArray::LargeListViewArray(std::shared_ptr data) { + LargeListViewArray::SetData(std::move(data)); +} + +LargeListViewArray::LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, + std::shared_ptr values, + std::shared_ptr null_bitmap, + int64_t null_count, int64_t offset) { + LargeListViewArray::SetData(ArrayData::Make( + type, length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)); +} + +void LargeListViewArray::SetData(const std::shared_ptr& data) { + internal::SetListData(this, data); + raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); +} + +Result> LargeListViewArray::FromArrays( + const Array& offsets, const Array& sizes, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { + return ListViewArrayFromArrays( + 
std::make_shared(values.type()), offsets, sizes, values, pool, + null_bitmap, null_count); +} + +Result> LargeListViewArray::FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, + int64_t null_count) { + if (type->id() != Type::LARGE_LIST_VIEW) { + return Status::TypeError("Expected large list-view type, got ", type->ToString()); + } + const auto& large_list_view_type = checked_cast(*type); + if (!large_list_view_type.value_type()->Equals(values.type())) { + return Status::TypeError("Mismatching large list-view value type"); + } + return ListViewArrayFromArrays( + std::move(type), offsets, sizes, values, pool, null_bitmap, null_count); +} + +Result> LargeListViewArray::Flatten( + MemoryPool* memory_pool) const { + if (null_count() > 0) { + return FlattenListViewArray(*this, memory_pool); + } + return FlattenListViewArray(*this, memory_pool); +} + +std::shared_ptr LargeListViewArray::offsets() const { + return BoxOffsets(int64(), *data_); +} + +std::shared_ptr LargeListViewArray::sizes() const { + return BoxSizes(int64(), *data_); +} + // ---------------------------------------------------------------------- // MapArray diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 8d5cc95fec00d..61606e1592d61 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and -// Union +// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList, +// Map, Struct, and Union #pragma once @@ -43,30 +43,31 @@ namespace arrow { /// @{ // ---------------------------------------------------------------------- -// ListArray +// VarLengthListLikeArray template -class BaseListArray; +class VarLengthListLikeArray; namespace internal { -// Private helper for ListArray::SetData. -// Unfortunately, trying to define BaseListArray::SetData outside of this header +// Private helper for [Large]List[View]Array::SetData. +// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header // doesn't play well with MSVC. template -void SetListData(BaseListArray* self, const std::shared_ptr& data, +void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, Type::type expected_type_id = TYPE::type_id); } // namespace internal -/// Base class for variable-sized list arrays, regardless of offset size. +/// Base class for variable-sized list and list-view arrays, regardless of offset size. 
template -class BaseListArray : public Array { +class VarLengthListLikeArray : public Array { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; - const TypeClass* list_type() const { return list_type_; } + const TypeClass* var_length_list_like_type() const { return this->list_type_; } /// \brief Return array object containing the list's values /// @@ -84,19 +85,26 @@ class BaseListArray : public Array { } // The following functions will not perform boundschecking + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - offset_type value_length(int64_t i) const { - i += data_->offset; - return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; - } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists and list-views are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) + virtual offset_type value_length(int64_t i) const = 0; + + /// \pre IsValid(i) std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } protected: - friend void internal::SetListData(BaseListArray* self, + friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id); @@ -105,6 +113,29 @@ class BaseListArray : public Array { const offset_type* raw_value_offsets_ = NULLPTR; }; +// ---------------------------------------------------------------------- +// ListArray / LargeListArray + +template +class BaseListArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_type() const { return this->var_length_list_like_type(); } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists are possible, avoid calling this + /// function when the list at slot i is null. 
+ /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { + i += this->data_->offset; + return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; + } +}; + /// Concrete Array class for list data class ARROW_EXPORT ListArray : public BaseListArray { public: @@ -120,10 +151,13 @@ class ARROW_EXPORT ListArray : public BaseListArray { /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). If the offsets do not have nulls, they - /// are assumed to be well-formed + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. /// - /// Offsets of an Array's null bitmap can be present or an explicit - /// null_bitmap, but not both. + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int32 type @@ -143,6 +177,10 @@ class ARROW_EXPORT ListArray : public BaseListArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a ListArray from a ListViewArray + static Result> FromListView(const ListViewArray& source, + MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the lists in this array. /// /// Note that it's different from `values()` in that it takes into @@ -181,7 +219,13 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). If the offsets do not have nulls, they - /// are assumed to be well-formed + /// are assumed to be well-formed. 
+ /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. + /// + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int64 type @@ -201,6 +245,10 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a LargeListArray from a LargeListViewArray + static Result> FromListView( + const LargeListViewArray& source, MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the lists in this array. /// /// Note that it's different from `values()` in that it takes into @@ -216,6 +264,211 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { void SetData(const std::shared_ptr& data); }; +// ---------------------------------------------------------------------- +// ListViewArray / LargeListViewArray + +template +class BaseListViewArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_view_type() const { return this->var_length_list_like_type(); } + + /// \brief Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& value_sizes() const { return this->data_->buffers[2]; } + + /// \brief Return pointer to raw value sizes accounting for any slice offset + const offset_type* raw_value_sizes() const { + return raw_value_sizes_ + this->data_->offset; + } + + /// \brief Return the size of the value at a particular index + /// + /// This should not be called if the list-view at slot i is null. + /// The returned size in those cases could be any value from 0 to the + /// length of the child values array.
+ /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { + return this->raw_value_sizes_[i + this->data_->offset]; + } + + protected: + const offset_type* raw_value_sizes_ = NULLPTR; +}; + +/// \brief Concrete Array class for list-view data +class ARROW_EXPORT ListViewArray : public BaseListViewArray { + public: + explicit ListViewArray(std::shared_ptr data); + + ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a ListViewArray using buffers from offsets and sizes arrays + /// that project views into the child values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or + /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a + /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets nor the sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int32 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int32 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created.
+ /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a ListViewArray from a ListArray + static Result> FromList(const ListArray& list_array, + MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the list-views in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + /// + /// This function invokes Concatenate() if list-views are non-contiguous. It + /// will try to minimize the number of array slices passed to Concatenate() by + /// maximizing the size of each slice (containing as many contiguous + /// list-views as possible). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. 
+ std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + ListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +/// \brief Concrete Array class for large list-view data (with 64-bit offsets +/// and sizes) +class ARROW_EXPORT LargeListViewArray : public BaseListViewArray { + public: + explicit LargeListViewArray(std::shared_ptr data); + + LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a LargeListViewArray using buffers from offsets and sizes arrays + /// that project views into the values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or + /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a + /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets nor the sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int64 offsets into the values array. NULL values are + /// supported if the corresponding value in sizes is NULL or 0. + /// \param[in] sizes An array containing the int64 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created.
+ /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a LargeListViewArray from a LargeListArray + static Result> FromList( + const LargeListArray& list_array, MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the large list-views in this + /// array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. 
+ std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + LargeListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + // ---------------------------------------------------------------------- // MapArray @@ -319,10 +572,18 @@ class ARROW_EXPORT FixedSizeListArray : public Array { i += data_->offset; return list_size_ * i; } + /// \brief Return the fixed-size of the values + /// + /// No matter the value of the index parameter, the result is the same. + /// So even when the value at slot i is null, this function will return a + /// non-zero size. + /// + /// \pre IsValid(i) int32_t value_length(int64_t i = 0) const { ARROW_UNUSED(i); return list_size_; } + /// \pre IsValid(i) std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 46908439ef5f0..be54d62fd77a7 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -398,6 +398,8 @@ static std::vector> TestArrayUtilitiesAgainstTheseType large_list(list(large_utf8())), fixed_size_list(utf8(), 3), fixed_size_list(int64(), 4), + list_view(utf8()), + large_list_view(utf8()), dictionary(int32(), utf8()), struct_({field("a", utf8()), field("b", int32())}), sparse_union(union_fields1, union_type_codes), @@ -616,6 +618,8 @@ static ScalarVector GetScalars() { ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3, 4]")), + std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), + std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared( ScalarVector{ std::make_shared(2), @@ -752,9 +756,9 @@ TEST_F(TestArray, TestFillFromScalar) { ArraySpan span(*scalar); auto roundtripped_array = span.ToArray(); - AssertArraysEqual(*array, 
*roundtripped_array); - ASSERT_OK(roundtripped_array->ValidateFull()); + + AssertArraysEqual(*array, *roundtripped_array); ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0)); AssertScalarsEqual(*scalar, *roundtripped_scalar); } @@ -3526,6 +3530,8 @@ DataTypeVector SwappableTypes() { large_utf8(), list(int16()), large_list(int16()), + list_view(int16()), + large_list_view(int16()), dictionary(int16(), utf8())}; } diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index d3502a0ab645a..40e705aa3e440 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -150,7 +150,8 @@ struct AppendScalarImpl { } template - enable_if_list_like Visit(const T&) { + enable_if_t::value || is_list_like_type::value, Status> Visit( + const T&) { auto builder = checked_cast::BuilderType*>(builder_); int64_t num_children = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { @@ -162,8 +163,12 @@ struct AppendScalarImpl { for (int64_t i = 0; i < n_repeats_; i++) { for (auto it = scalars_begin_; it != scalars_end_; ++it) { if (it->is_valid) { - RETURN_NOT_OK(builder->Append()); const Array& list = *checked_cast(*it).value; + if constexpr (T::type_id == Type::MAP || T::type_id == Type::FIXED_SIZE_LIST) { + RETURN_NOT_OK(builder->Append()); + } else { + RETURN_NOT_OK(builder->Append(/*is_valid=*/true, list.length())); + } for (int64_t i = 0; i < list.length(); i++) { ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index fbba1fd056430..5bdc76d96c8f0 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -30,6 +30,20 @@ namespace arrow { +// ---------------------------------------------------------------------- +// VarLengthListLikeBuilder / BaseListBuilder / BaseListViewBuilder + 
+template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; + +template class BaseListBuilder; +template class BaseListBuilder; + +template class BaseListViewBuilder; +template class BaseListViewBuilder; + // ---------------------------------------------------------------------- // MapBuilder diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index d0b17c230489b..21c2d4b270eb1 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -40,37 +40,46 @@ namespace arrow { /// @{ // ---------------------------------------------------------------------- -// List builder +// VarLengthListLikeBuilder template -class BaseListBuilder : public ArrayBuilder { +class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. 
- BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type, - int64_t alignment = kDefaultBufferAlignment) + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) : ArrayBuilder(pool, alignment), offsets_builder_(pool, alignment), value_builder_(value_builder), value_field_(type->field(0)->WithType(NULLPTR)) {} - BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - int64_t alignment = kDefaultBufferAlignment) - : BaseListBuilder(pool, value_builder, list(value_builder->type()), alignment) {} + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int64_t alignment = kDefaultBufferAlignment) + : VarLengthListLikeBuilder(pool, value_builder, + std::make_shared(value_builder->type()), + alignment) {} + + ~VarLengthListLikeBuilder() override = default; Status Resize(int64_t capacity) override { if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { - return Status::CapacityError("List array cannot reserve space for more than ", + return Status::CapacityError(type_name(), + " array cannot reserve space for more than ", maximum_elements(), " got ", capacity); } ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); - // One more than requested for offsets - ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + // One more than requested for list offsets + const int64_t offsets_capacity = + is_list_view(TYPE::type_id) ? 
capacity : capacity + 1; + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity)); return ArrayBuilder::Resize(capacity); } @@ -80,56 +89,98 @@ class BaseListBuilder : public ArrayBuilder { value_builder_->Reset(); } - /// \brief Vector append - /// - /// If passed, valid_bytes is of equal length to values, and any zero byte - /// will be considered as a null for that slot - Status AppendValues(const offset_type* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - offsets_builder_.UnsafeAppend(offsets, length); - return Status::OK(); - } - /// \brief Start a new variable-length list slot /// - /// This function should be called before beginning to append elements to the - /// value builder - Status Append(bool is_valid = true) { + /// This function should be called before appending elements to the + /// value builder. Elements appended to the value builder before this function + /// is called for the first time, will not be members of any list value. + /// + /// After this function is called, list_length elements SHOULD be appended to + /// the values builder. If this contract is violated, the behavior is defined by + /// the concrete builder implementation and SHOULD NOT be relied upon unless + /// the caller is specifically building a [Large]List or [Large]ListView array. + /// + /// For [Large]List arrays, the list slot length will be the number of elements + /// appended to the values builder before the next call to Append* or Finish. For + /// [Large]ListView arrays, the list slot length will be exactly list_length, but if + /// Append* is called before at least list_length elements are appended to the values + /// builder, the current list slot will share elements with the next list + /// slots or an invalid [Large]ListView array will be generated because there + /// aren't enough elements in the values builder to fill the list slots. 
+ /// + /// If you're building a [Large]List and don't need to be compatible + /// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)` + /// is a simpler API. + /// + /// \pre if is_valid is false, list_length MUST be 0 + /// \param is_valid Whether the new list slot is valid + /// \param list_length The number of elements in the list + Status Append(bool is_valid, int64_t list_length) { ARROW_RETURN_NOT_OK(Reserve(1)); + assert(is_valid || list_length == 0); UnsafeAppendToBitmap(is_valid); - UnsafeAppendNextOffset(); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length); return Status::OK(); } - Status AppendNull() final { return Append(false); } + Status AppendNull() final { + // Append() a null list slot with list_length=0. + // + // When building [Large]List arrays, elements being appended to the values builder + // before the next call to Append* or Finish will extend the list slot length, but + // that is totally fine because list arrays admit non-empty null list slots. + // + // In the case of [Large]ListViews that's not a problem either because the + // list slot length remains zero. 
+ return Append(false, 0); + } Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, false); - const int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } + UnsafeAppendEmptyDimensions(/*num_values=*/length); return Status::OK(); } - Status AppendEmptyValue() final { return Append(true); } + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure the list slot remains empty + Status AppendEmptyValue() final { return Append(true, 0); } + /// \brief Append `length` empty list slots + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure the last list slot remains empty Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, true); - const int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } + UnsafeAppendEmptyDimensions(/*num_values=*/length); return Status::OK(); } + /// \brief Vector append + /// + /// For list-array builders, the sizes are inferred from the offsets.
+ /// BaseListBuilder provides an implementation that doesn't take sizes, but + /// this virtual function allows dispatching calls to both list-array and + /// list-view-array builders (which need the sizes) + /// + /// \param offsets The offsets of the variable-length lists + /// \param sizes The sizes of the variable-length lists + /// \param length The number of offsets, sizes, and validity bits to append + /// \param valid_bytes If passed, valid_bytes is of equal length to values, + /// and any zero byte will be considered as a null for that slot + virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) = 0; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); + [[maybe_unused]] const offset_type* sizes = NULLPTR; + if constexpr (is_list_view(TYPE::type_id)) { + sizes = array.GetValues(2); + } const bool all_valid = !array.MayHaveLogicalNulls(); const uint8_t* validity = array.HasValidityBitmap() ? 
array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); @@ -137,43 +188,28 @@ class BaseListBuilder : public ArrayBuilder { const bool is_valid = all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || array.IsValid(row); + int64_t size = 0; + if (is_valid) { + if constexpr (is_list_view(TYPE::type_id)) { + size = sizes[row]; + } else { + size = offsets[row + 1] - offsets[row]; + } + } UnsafeAppendToBitmap(is_valid); - UnsafeAppendNextOffset(); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size); if (is_valid) { - int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], - offsets[row], slot_length)); + ARROW_RETURN_NOT_OK( + value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size)); } } return Status::OK(); } - Status FinishInternal(std::shared_ptr* out) override { - ARROW_RETURN_NOT_OK(AppendNextOffset()); - - // Offset padding zeroed by BufferBuilder - std::shared_ptr offsets, null_bitmap; - ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - - if (value_builder_->length() == 0) { - // Try to make sure we get a non-null values buffer (ARROW-2744) - ARROW_RETURN_NOT_OK(value_builder_->Resize(0)); - } - - std::shared_ptr items; - ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items)); - - *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)}, - null_count_); - Reset(); - return Status::OK(); - } - Status ValidateOverflow(int64_t new_elements) const { auto new_length = value_builder_->length() + new_elements; if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { - return Status::CapacityError("List array cannot contain more than ", + return Status::CapacityError(type_name(), " array cannot contain more than ", maximum_elements(), " elements, have ", new_elements); } else { return Status::OK(); @@ -191,20 +227,136 @@ 
class BaseListBuilder : public ArrayBuilder { return std::make_shared(value_field_->WithType(value_builder_->type())); } + private: + static constexpr const char* type_name() { + if constexpr (is_list_view(TYPE::type_id)) { + return "ListView"; + } else { + return "List"; + } + } + protected: + /// \brief Append dimensions for num_values empty list slots. + /// + /// ListViewBuilder overrides this to also append the sizes. + virtual void UnsafeAppendEmptyDimensions(int64_t num_values) { + const int64_t offset = value_builder_->length(); + for (int64_t i = 0; i < num_values; ++i) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + } + + /// \brief Append dimensions for a single list slot. + /// + /// ListViewBuilder overrides this to also append the size. + virtual void UnsafeAppendDimensions(int64_t offset, int64_t size) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + TypedBufferBuilder offsets_builder_; std::shared_ptr value_builder_; std::shared_ptr value_field_; +}; + +// ---------------------------------------------------------------------- +// ListBuilder / LargeListBuilder + +template +class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + using BASE::Append; + + ~BaseListBuilder() override = default; + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true) { + // The value_length parameter to BASE::Append(bool, int64_t) is ignored when + // building a list array, so we can pass 0 here. 
+ return BASE::Append(is_valid, 0); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + // Offsets are assumed to be valid, but the first length-1 sizes have to be + // consistent with the offsets to partially rule out the possibility that the + // caller is passing sizes that could work if building a list-view, but don't + // work on building a list that requires offsets to be non-decreasing. + // + // CAUTION: the last size element (`sizes[length - 1]`) is not + // validated and could be inconsistent with the offsets given in a + // subsequent call to AppendValues. 
+#ifndef NDEBUG + if (sizes) { + for (int64_t i = 0; i < length - 1; ++i) { + if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) { + if (!valid_bytes || valid_bytes[i]) { + return Status::Invalid( + "BaseListBuilder: sizes are inconsistent with offsets provided"); + } + } + } + } +#endif + return AppendValues(offsets, length, valid_bytes); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } Status AppendNextOffset() { - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); - const int64_t num_values = value_builder_->length(); - return offsets_builder_.Append(static_cast(num_values)); + ARROW_RETURN_NOT_OK(this->ValidateOverflow(0)); + const int64_t num_values = this->value_builder_->length(); + return this->offsets_builder_.Append(static_cast(num_values)); } - void UnsafeAppendNextOffset() { - const int64_t num_values = value_builder_->length(); - offsets_builder_.UnsafeAppend(static_cast(num_values)); + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + std::shared_ptr null_bitmap; + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); } }; @@ -247,6 +399,116 @@ class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } }; 
+// ---------------------------------------------------------------------- +// ListViewBuilder / LargeListViewBuilder + +template +class ARROW_EXPORT BaseListViewBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + ~BaseListViewBuilder() override = default; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(BASE::Resize(capacity)); + return sizes_builder_.Resize(capacity); + } + + void Reset() override { + BASE::Reset(); + sizes_builder_.Reset(); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + this->sizes_builder_.UnsafeAppend(sizes, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Offset and sizes padding zeroed by BufferBuilder + std::shared_ptr null_bitmap; + std::shared_ptr offsets; + std::shared_ptr sizes; + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = 
ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets), std::move(sizes)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); + } + + protected: + void UnsafeAppendEmptyDimensions(int64_t num_values) override { + for (int64_t i = 0; i < num_values; ++i) { + this->offsets_builder_.UnsafeAppend(0); + } + for (int64_t i = 0; i < num_values; ++i) { + this->sizes_builder_.UnsafeAppend(0); + } + } + + void UnsafeAppendDimensions(int64_t offset, int64_t size) override { + this->offsets_builder_.UnsafeAppend(static_cast(offset)); + this->sizes_builder_.UnsafeAppend(static_cast(size)); + } + + private: + TypedBufferBuilder sizes_builder_; +}; + +class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +class ARROW_EXPORT LargeListViewBuilder final + : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + // ---------------------------------------------------------------------- // Map builder diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 37c7271b5b95c..ff9ed66d1149f 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -35,14 +35,17 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util.h" #include "arrow/util/int_util_overflow.h" +#include "arrow/util/list_util.h" #include "arrow/util/logging.h" #include 
"arrow/util/ree_util.h" +#include "arrow/util/slice_util_internal.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" @@ -98,10 +101,18 @@ Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, return Status::OK(); } +int64_t SumBufferSizesInBytes(const BufferVector& buffers) { + int64_t size = 0; + for (const auto& buffer : buffers) { + size += buffer->size(); + } + return size; +} + // Write offsets in src into dst, adjusting them such that first_offset // will be the first offset written. template -Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, +Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, Range* values_range); // Concatenate buffers holding offsets into a single buffer of offsets, @@ -113,33 +124,30 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, values_ranges->resize(buffers.size()); // allocate output buffer - int64_t out_length = 0; - for (const auto& buffer : buffers) { - out_length += buffer->size() / sizeof(Offset); - } - ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer((out_length + 1) * sizeof(Offset), pool)); - auto dst = reinterpret_cast((*out)->mutable_data()); + const int64_t out_size_in_bytes = SumBufferSizesInBytes(buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(sizeof(Offset) + out_size_in_bytes, pool)); + auto* out_data = (*out)->mutable_data_as(); int64_t elements_length = 0; Offset values_length = 0; for (size_t i = 0; i < buffers.size(); ++i) { // the first offset from buffers[i] will be adjusted to values_length // (the cumulative length of values spanned by offsets in previous buffers) - RETURN_NOT_OK(PutOffsets(buffers[i], values_length, &dst[elements_length], - &(*values_ranges)[i])); + RETURN_NOT_OK(PutOffsets(*buffers[i], values_length, + out_data + elements_length, &(*values_ranges)[i])); elements_length += buffers[i]->size() / sizeof(Offset); values_length += static_cast((*values_ranges)[i].length); } - // 
the final element in dst is the length of all values spanned by the offsets - dst[out_length] = values_length; + // the final element in out_data is the length of all values spanned by the offsets + out_data[out_size_in_bytes / sizeof(Offset)] = values_length; return Status::OK(); } template -Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, +Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, Range* values_range) { - if (src->size() == 0) { + if (src.size() == 0) { // It's allowed to have an empty offsets buffer for a 0-length array // (see Array::Validate) values_range->offset = 0; @@ -148,8 +156,8 @@ Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offse } // Get the range of offsets to transfer from src - auto src_begin = reinterpret_cast(src->data()); - auto src_end = reinterpret_cast(src->data() + src->size()); + auto src_begin = src.data_as(); + auto src_end = reinterpret_cast(src.data() + src.size()); // Compute the range of values which is spanned by this range of offsets values_range->offset = src_begin[0]; @@ -160,16 +168,132 @@ Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offse // Write offsets into dst, ensuring that the first offset written is // first_offset - auto adjustment = first_offset - src_begin[0]; + auto displacement = first_offset - src_begin[0]; // NOTE: Concatenate can be called during IPC reads to append delta dictionaries. // Avoid UB on non-validated input by doing the addition in the unsigned domain. 
// (the result can later be validated using Array::ValidateFull) - std::transform(src_begin, src_end, dst, [adjustment](Offset offset) { - return SafeSignedAdd(offset, adjustment); + std::transform(src_begin, src_end, dst, [displacement](Offset offset) { + return SafeSignedAdd(offset, displacement); }); return Status::OK(); } +template +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst); + +// Concatenate buffers holding list-view offsets into a single buffer of offsets +// +// value_ranges contains the relevant ranges of values in the child array actually +// referenced to by the views. Most commonly, these ranges will start from 0, +// but when that is not the case, we need to adjust the displacement of offsets. +// The concatenated child array does not contain values from the beginning +// if they are not referenced to by any view. +// +// The child arrays and the sizes buffer are used to ensure we can trust the offsets in +// offset_buffers to be within the valid range. +// +// This function also mutates sizes so that null list-view entries have size 0. 
+// +// \param[in] in The child arrays +// \param[in,out] sizes The concatenated sizes buffer +template +Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, + const BufferVector& offset_buffers, + const std::vector& value_ranges, + MemoryPool* pool, std::shared_ptr* out) { + DCHECK_EQ(offset_buffers.size(), value_ranges.size()); + + // Allocate resulting offsets buffer and initialize it with zeros + const int64_t out_size_in_bytes = SumBufferSizesInBytes(offset_buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size_in_bytes, pool)); + memset((*out)->mutable_data(), 0, static_cast((*out)->size())); + + auto* out_offsets = (*out)->mutable_data_as(); + + int64_t num_child_values = 0; + int64_t elements_length = 0; + for (size_t i = 0; i < offset_buffers.size(); ++i) { + const auto displacement = + static_cast(num_child_values - value_ranges[i].offset); + RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, + /*src=*/*offset_buffers[i], displacement, + /*dst=*/out_offsets + elements_length)); + elements_length += offset_buffers[i]->size() / sizeof(offset_type); + num_child_values += value_ranges[i].length; + if (num_child_values > std::numeric_limits::max()) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + } + DCHECK_EQ(elements_length, + static_cast(out_size_in_bytes / sizeof(offset_type))); + + return Status::OK(); +} + +template +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst) { + if (src.size() == 0) { + return Status::OK(); + } + const auto& validity_buffer = input.buffers[0]; + if (validity_buffer) { + // Ensure that it is safe to access all the bits in the validity bitmap of input. 
+ RETURN_NOT_OK(internal::CheckSliceParams(/*size=*/8 * validity_buffer->size(), + input.offset, input.length, "buffer")); + } + + const auto offsets = src.data_as(); + DCHECK_EQ(static_cast(src.size() / sizeof(offset_type)), input.length); + + auto visit_not_null = [&](int64_t position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. (the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave the dst[position] as 0. + } + }; + + const auto* validity = validity_buffer ? validity_buffer->data_as() : nullptr; + internal::OptionalBitBlockCounter bit_counter(validity, input.offset, input.length); + int64_t position = 0; + while (position < input.length) { + internal::BitBlockCount block = bit_counter.NextBlock(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + visit_not_null(position); + } + } else if (block.NoneSet()) { + // NOTE: we don't have to do anything for the null entries regarding the + // offsets as the buffer is initialized to 0 when it is allocated. + + // Zero-out the sizes of the null entries to ensure these sizes are not + // greater than the new values length of the concatenated array. + memset(sizes + position, 0, block.length * sizeof(offset_type)); + position += block.length; + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(validity, input.offset + position)) { + visit_not_null(position); + } else { + // Zero-out the size at position. 
+ sizes[position] = 0; + } + } + } + } + return Status::OK(); +} + class ConcatenateImpl { public: ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool) @@ -288,6 +412,41 @@ class ConcatenateImpl { return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); } + template + enable_if_list_view Visit(const T& type) { + using offset_type = typename T::offset_type; + out_->buffers.resize(3); + out_->child_data.resize(1); + + // Calculate the ranges of values that each list-view array uses + std::vector value_ranges; + value_ranges.reserve(in_.size()); + for (const auto& input : in_) { + ArraySpan input_span(*input); + Range range; + ARROW_ASSIGN_OR_RAISE(std::tie(range.offset, range.length), + list_util::internal::RangeOfValuesUsed(input_span)); + value_ranges.push_back(range); + } + + // Concatenate the values + ARROW_ASSIGN_OR_RAISE(ArrayDataVector value_data, ChildData(0, value_ranges)); + RETURN_NOT_OK(ConcatenateImpl(value_data, pool_).Concatenate(&out_->child_data[0])); + out_->child_data[0]->type = type.value_type(); + + // Concatenate the sizes first + ARROW_ASSIGN_OR_RAISE(auto size_buffers, Buffers(2, sizeof(offset_type))); + RETURN_NOT_OK(ConcatenateBuffers(size_buffers, pool_).Value(&out_->buffers[2])); + + // Concatenate the offsets + ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1, sizeof(offset_type))); + RETURN_NOT_OK(ConcatenateListViewOffsets( + in_, /*sizes=*/out_->buffers[2]->mutable_data_as(), offset_buffers, + value_ranges, pool_, &out_->buffers[1])); + + return Status::OK(); + } + Status Visit(const FixedSizeListType& fixed_size_list) { ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size())); return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 0ef1136ea78f8..af595e897f9ee 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -40,26 +41,55 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/list_util.h" namespace arrow { -class ConcatenateTest : public ::testing::Test { - protected: - ConcatenateTest() - : rng_(seed_), - sizes_({0, 1, 2, 4, 16, 31, 1234}), - null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} +class SimpleRandomArrayGenerator { + private: + random::SeedType seed_ = 0xdeadbeef; + std::default_random_engine random_engine_; + random::RandomArrayGenerator rag_; + + public: + SimpleRandomArrayGenerator() : random_engine_(seed_), rag_(seed_) {} + + template + std::vector RandomOffsetsInRange(offset_type min_offset, + offset_type max_offset, + int64_t num_offsets) { + std::vector offsets(static_cast(num_offsets)); + std::uniform_int_distribution dist(min_offset, max_offset); + std::generate(offsets.begin(), offsets.end(), [&] { return dist(random_engine_); }); + return offsets; + } - template - std::vector Offsets(int32_t length, int32_t slice_count) { - std::vector offsets(static_cast(slice_count + 1)); - std::default_random_engine gen(seed_); - std::uniform_int_distribution dist(0, length); - std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); + template + std::vector Offsets(int32_t values_length, int32_t slice_count) { + auto offsets = RandomOffsetsInRange(0, values_length, slice_count + 1); std::sort(offsets.begin(), offsets.end()); return offsets; } + /// \param[in] random_offsets Random offsets in [0, values_size] and no particular order + template + std::vector ListViewSizes(const std::vector& random_offsets, + int64_t values_size, double avg_size, + int64_t num_sizes) { + std::normal_distribution normal(/*mean=*/avg_size, /*stddev=*/3.0); + std::vector sizes; + sizes.reserve(num_sizes); + for (int64_t i = 0; i < num_sizes; ++i) { + const auto sampled_size = std::llround(normal(random_engine_)); + auto size = 
std::max(0, static_cast(sampled_size)); + if (random_offsets[i] > values_size - size) { + size = static_cast(values_size - random_offsets[i]); + } + sizes.push_back(size); + } + return sizes; + } + ArrayVector Slices(const std::shared_ptr& array, const std::vector& offsets) { ArrayVector slices(offsets.size() - 1); @@ -69,33 +99,119 @@ class ConcatenateTest : public ::testing::Test { return slices; } + std::shared_ptr ValidityBitmap(int64_t size, double null_probability) { + return rag_.NullBitmap(size, null_probability, kDefaultBufferAlignment, + default_memory_pool()); + } + template - std::shared_ptr GeneratePrimitive(int64_t size, double null_probability) { + std::shared_ptr PrimitiveArray(int64_t size, double null_probability) { if (std::is_same::value) { - return rng_.Boolean(size, 0.5, null_probability); + return rag_.Boolean(size, 0.5, null_probability); } - return rng_.Numeric(size, 0, 127, null_probability); + return rag_.Numeric(size, 0, 127, null_probability); + } + + std::shared_ptr StringArray(int64_t size, double null_probability) { + return rag_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + } + + std::shared_ptr LargeStringArray(int64_t size, double null_probability) { + return rag_.LargeString(size, /*min_length =*/0, /*max_length =*/15, + null_probability); + } + + std::shared_ptr StringViewArray(int64_t size, double null_probability) { + return rag_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); + } + + std::shared_ptr ArrayOf(std::shared_ptr type, int64_t size, + double null_probability) { + return rag_.ArrayOf(std::move(type), size, null_probability); + } + + // TODO(GH-38656): Use the random array generators from testing/random.h here + + template ::ArrayType> + Result> ListArray(int32_t length, + double null_probability) { + using offset_type = typename ListType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + auto values_size = 
length * 4; + auto values = PrimitiveArray(values_size, null_probability); + auto offsets_vector = Offsets(values_size, length); + // Ensure first and last offsets encompass the whole values array + offsets_vector.front() = 0; + offsets_vector.back() = static_cast(values_size); + std::shared_ptr offsets; + ArrayFromVector(offsets_vector, &offsets); + return ListArrayType::FromArrays(*offsets, *values); } + template ::ArrayType> + Result> ListViewArray(int32_t length, + double null_probability) { + using offset_type = typename ListViewType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + constexpr int kAvgListViewSize = 4; + auto values_size = kAvgListViewSize * length; + + auto values = PrimitiveArray(values_size, null_probability); + + std::shared_ptr offsets; + auto offsets_vector = RandomOffsetsInRange(0, values_size, length); + ArrayFromVector(offsets_vector, &offsets); + + std::shared_ptr sizes; + auto sizes_vector = + ListViewSizes(offsets_vector, values_size, kAvgListViewSize, length); + ArrayFromVector(sizes_vector, &sizes); + + auto validity_bitmap = ValidityBitmap(length, null_probability); + auto valid_count = internal::CountSetBits(validity_bitmap->data(), 0, length); + + return ListViewArrayType::FromArrays( + *offsets, *sizes, *values, default_memory_pool(), + valid_count == length ? 
nullptr : std::move(validity_bitmap)); + } +}; + +class ConcatenateTest : public ::testing::Test { + private: + std::vector sizes_; + std::vector null_probabilities_; + + protected: + SimpleRandomArrayGenerator rag; + + ConcatenateTest() + : sizes_({0, 1, 2, 4, 16, 31, 1234}), + null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} + void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { if (auto preceding_bits = bit_util::kPrecedingBitmask[length % 8]) { auto last_byte = bitmap->data()[length / 8]; ASSERT_EQ(static_cast(last_byte & preceding_bits), last_byte) - << length << " " << int(preceding_bits); + << length << " " << static_cast(preceding_bits); } } template void Check(ArrayFactory&& factory) { for (auto size : this->sizes_) { - auto offsets = this->Offsets(size, 3); + auto offsets = rag.Offsets(size, 3); for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); ASSERT_OK(expected->ValidateFull()); - auto slices = this->Slices(array, offsets); + auto slices = rag.Slices(array, offsets); for (auto slice : slices) { ASSERT_OK(slice->ValidateFull()); } @@ -111,11 +227,6 @@ class ConcatenateTest : public ::testing::Test { } } } - - random::SeedType seed_ = 0xdeadbeef; - random::RandomArrayGenerator rng_; - std::vector sizes_; - std::vector null_probabilities_; }; TEST(ConcatenateEmptyArraysTest, TestValueBuffersNullPtr) { @@ -144,7 +255,7 @@ TYPED_TEST_SUITE(PrimitiveConcatenateTest, PrimitiveArrowTypes); TYPED_TEST(PrimitiveConcatenateTest, Primitives) { this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { - *out = this->template GeneratePrimitive(size, null_probability); + *out = this->rag.template PrimitiveArray(size, null_probability); }); } @@ -156,23 +267,21 @@ TEST_F(ConcatenateTest, NullType) { TEST_F(ConcatenateTest, 
StringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rag.StringArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, StringViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, - /*max_buffer_length=*/200); + *out = rag.StringViewArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = - rng_.LargeString(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rag.LargeStringArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } @@ -181,7 +290,7 @@ TEST_F(ConcatenateTest, FixedSizeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { auto list_size = 3; auto values_size = size * list_size; - auto values = this->GeneratePrimitive(values_size, null_probability); + auto values = this->rag.PrimitiveArray(values_size, null_probability); ASSERT_OK_AND_ASSIGN(*out, FixedSizeListArray::FromArrays(values, list_size)); ASSERT_OK((**out).ValidateFull()); }); @@ -189,39 +298,40 @@ TEST_F(ConcatenateTest, FixedSizeListType) { TEST_F(ConcatenateTest, ListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, ListArray::FromArrays(*offsets, *values)); 
+ ASSERT_OK_AND_ASSIGN(*out, this->rag.ListArray(size, null_probability)); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, LargeListArray::FromArrays(*offsets, *values)); + ASSERT_OK_AND_ASSIGN(*out, + this->rag.ListArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); + }); +} + +TEST_F(ConcatenateTest, ListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(*out, + this->rag.ListViewArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); + }); +} + +TEST_F(ConcatenateTest, LargeListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN( + *out, this->rag.ListViewArray(size, null_probability)); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, StructType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); + auto foo = this->rag.PrimitiveArray(size, null_probability); + auto bar = this->rag.PrimitiveArray(size, null_probability); + auto baz = this->rag.PrimitiveArray(size, null_probability); *out = std::make_shared( struct_({field("foo", int8()), field("bar", float64()), field("baz", boolean())}), size, ArrayVector{foo, bar, baz}); @@ -230,8 +340,8 @@ TEST_F(ConcatenateTest, 
StructType) { TEST_F(ConcatenateTest, DictionaryType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto indices = this->GeneratePrimitive(size, null_probability); - auto dict = this->GeneratePrimitive(128, 0); + auto indices = rag.PrimitiveArray(size, null_probability); + auto dict = rag.PrimitiveArray(128, 0); auto type = dictionary(int32(), dict->type()); *out = std::make_shared(type, indices, dict); }); @@ -382,20 +492,20 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) { TEST_F(ConcatenateTest, UnionType) { // sparse mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.ArrayOf(sparse_union({ - field("a", float64()), - field("b", boolean()), - }), - size, null_probability); + *out = rag.ArrayOf(sparse_union({ + field("a", float64()), + field("b", boolean()), + }), + size, null_probability); }); // dense mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.ArrayOf(dense_union({ - field("a", uint32()), - field("b", boolean()), - field("c", int8()), - }), - size, null_probability); + *out = rag.ArrayOf(dense_union({ + field("a", uint32()), + field("b", boolean()), + field("c", int8()), + }), + size, null_probability); }); } @@ -413,7 +523,7 @@ TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { auto type_ids_ok = ArrayFromJSON(int8(), "[0]"); auto offsets_ok = ArrayFromJSON(int32(), "[0]"); auto child_array_overflow = - this->rng_.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); + rag.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); ASSERT_OK_AND_ASSIGN( auto array_overflow, DenseUnionArray::Make(*type_ids_ok, *offsets_ok, {child_array_overflow})); @@ -546,7 +656,7 @@ TEST_F(ConcatenateTest, DenseUnionType) { TEST_F(ConcatenateTest, ExtensionType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto storage = this->GeneratePrimitive(size, null_probability); + auto storage = this->rag.PrimitiveArray(size, 
null_probability); *out = ExtensionType::WrapArray(smallint(), storage); }); } diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 186682be3009e..3241dc551853c 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -244,9 +244,22 @@ BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { auto* offsets = reinterpret_cast(scratch_space); offsets[0] = 0; offsets[1] = static_cast(value_size); + static_assert(2 * sizeof(offset_type) <= 16); return {scratch_space, sizeof(offset_type) * 2}; } +template +std::pair OffsetsAndSizesForScalar(uint8_t* scratch_space, + offset_type value_size) { + auto* offsets = scratch_space; + auto* sizes = scratch_space + sizeof(offset_type); + reinterpret_cast(offsets)[0] = 0; + reinterpret_cast(sizes)[0] = value_size; + static_assert(2 * sizeof(offset_type) <= 16); + return {BufferSpan{offsets, sizeof(offset_type)}, + BufferSpan{sizes, sizeof(offset_type)}}; +} + int GetNumBuffers(const DataType& type) { switch (type.id()) { case Type::NA: @@ -261,6 +274,8 @@ int GetNumBuffers(const DataType& type) { case Type::STRING_VIEW: case Type::BINARY_VIEW: case Type::DENSE_UNION: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: return 3; case Type::EXTENSION: // The number of buffers depends on the storage type @@ -381,7 +396,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); this->buffers[1].size = scalar.value->size(); - } else if (is_list_like(type_id)) { + } else if (is_var_length_list_like(type_id) || type_id == Type::FIXED_SIZE_LIST) { const auto& scalar = checked_cast(value); int64_t value_length = 0; @@ -402,7 +417,14 @@ void ArraySpan::FillFromScalar(const Scalar& value) { OffsetsForScalar(scalar.scratch_space_, static_cast(value_length)); } else if (type_id == Type::LARGE_LIST) { this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length); + } 
else if (type_id == Type::LIST_VIEW) { + std::tie(this->buffers[1], this->buffers[2]) = OffsetsAndSizesForScalar( + scalar.scratch_space_, static_cast(value_length)); + } else if (type_id == Type::LARGE_LIST_VIEW) { + std::tie(this->buffers[1], this->buffers[2]) = + OffsetsAndSizesForScalar(scalar.scratch_space_, value_length); } else { + DCHECK_EQ(type_id, Type::FIXED_SIZE_LIST); // FIXED_SIZE_LIST: does not have a second buffer this->buffers[1] = {}; } diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index be9597e59b378..f9714eda34c61 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -289,6 +289,13 @@ class ValueComparatorFactory { Status Visit(const NullType&, const Array&, const Array&) { return Status::NotImplemented("null type"); } + Status Visit(const ListViewType&, const Array&, const Array&) { + return Status::NotImplemented("list-view type"); + } + + Status Visit(const LargeListViewType&, const Array&, const Array&) { + return Status::NotImplemented("list-view type"); + } Status Visit(const ExtensionType&, const Array&, const Array&) { return Status::NotImplemented("extension type"); @@ -589,6 +596,9 @@ Result> Diff(const Array& base, const Array& target return Diff(*base_storage, *target_storage, pool); } else if (base.type()->id() == Type::DICTIONARY) { return Status::NotImplemented("diffing arrays of type ", *base.type()); + } else if (base.type()->id() == Type::LIST_VIEW || + base.type()->id() == Type::LARGE_LIST_VIEW) { + return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { return QuadraticSpaceMyersDiff(base, target, pool).Diff(); } @@ -732,6 +742,14 @@ class MakeFormatterImpl { return Status::OK(); } + Status Visit(const ListViewType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + + Status Visit(const LargeListViewType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + // TODO(bkietz) 
format maps better Status Visit(const StructType& t) { diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 9ea2fc2b6f0a1..86e2ffcae4de7 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -134,7 +134,6 @@ class ArrayDataEndianSwapper { out_->buffers[index] = data_->buffers[index]; return Status::OK(); } - // Except union, offset has one more element rather than data->length ARROW_ASSIGN_OR_RAISE(out_->buffers[index], ByteSwapBuffer(data_->buffers[index])); return Status::OK(); @@ -290,6 +289,17 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const ListViewType& type) { + RETURN_NOT_OK(SwapOffsets(1)); + RETURN_NOT_OK(SwapOffsets(2)); + return Status::OK(); + } + Status Visit(const LargeListViewType& type) { + RETURN_NOT_OK(SwapOffsets(1)); + RETURN_NOT_OK(SwapOffsets(2)); + return Status::OK(); + } + Status Visit(const DictionaryType& type) { // dictionary was already swapped in ReadDictionary() in ipc/reader.cc RETURN_NOT_OK(SwapType(*type.index_type())); @@ -379,7 +389,14 @@ class NullArrayFactory { enable_if_var_size_list Visit(const T& type) { // values array may be empty, but there must be at least one offset of 0 RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1))); - RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_))); + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); + return Status::OK(); + } + + template + enable_if_list_view Visit(const T& type) { + RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * length_)); + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); return Status::OK(); } @@ -518,8 +535,8 @@ class NullArrayFactory { } template - enable_if_var_size_list Visit(const T& type) { - out_->buffers.resize(2, buffer_); + enable_if_var_length_list_like Visit(const T& type) { + out_->buffers.resize(is_list_view(T::type_id) ? 
3 : 2, buffer_); ARROW_ASSIGN_OR_RAISE(out_->child_data[0], CreateChild(type, 0, /*length=*/0)); return Status::OK(); } @@ -698,12 +715,28 @@ class RepeatedArrayFactory { std::shared_ptr offsets_buffer; auto size = static_cast(scalar().value->length()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared(scalar_.type, length_, offsets_buffer, value_array); return Status::OK(); } + template + enable_if_list_view Visit(const T& type) { + using ScalarType = typename TypeTraits::ScalarType; + using ArrayType = typename TypeTraits::ArrayType; + + auto value = checked_cast(scalar_).value; + + auto size = static_cast(value->length()); + ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, + CreateIntBuffer(0)); + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + CreateIntBuffer(size)); + out_ = std::make_shared(scalar_.type, length_, std::move(offsets_buffer), + std::move(sizes_buffer), value); + return Status::OK(); + } + Status Visit(const FixedSizeListType& type) { auto value = checked_cast(scalar_).value; @@ -853,6 +886,15 @@ class RepeatedArrayFactory { return builder.Finish(out); } + template + Result> CreateIntBuffer(IntType value) { + std::shared_ptr buffer; + TypedBufferBuilder builder(pool_); + RETURN_NOT_OK(builder.Append(/*num_copies=*/length_, value)); + RETURN_NOT_OK(builder.Finish(&buffer)); + return buffer; + } + Status CreateBufferOf(const void* data, size_t data_length, std::shared_ptr* out) { BufferBuilder builder(pool_); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 3dde41b1450e8..8dd3eb3f90c15 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -23,7 +23,6 @@ #include "arrow/extension_type.h" #include "arrow/type.h" #include "arrow/type_traits.h" -#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" @@ -269,6 +268,9 @@ struct ValidateArrayImpl { return 
MapArray::ValidateChildData(data.child_data); } + Status Visit(const ListViewType& type) { return ValidateListView(type); } + Status Visit(const LargeListViewType& type) { return ValidateListView(type); } + Status Visit(const FixedSizeListType& type) { const ArrayData& values = *data.child_data[0]; const int64_t list_size = type.list_size(); @@ -582,7 +584,7 @@ struct ValidateArrayImpl { const Buffer& values = *data.buffers[2]; // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type, values.size())); + RETURN_NOT_OK(ValidateOffsetsAndSizes(type, values.size())); if (data.length > 0 && data.buffers[1]->is_cpu()) { using offset_type = typename BinaryType::offset_type; @@ -702,7 +704,7 @@ struct ValidateArrayImpl { } // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type, values.offset + values.length)); + RETURN_NOT_OK(ValidateOffsetsAndSizes(type, values.offset + values.length)); // An empty list array can have 0 offsets if (data.length > 0 && data.buffers[1]->is_cpu()) { @@ -735,6 +737,18 @@ struct ValidateArrayImpl { return Status::OK(); } + template + Status ValidateListView(const ListViewType& type) { + const ArrayData& values = *data.child_data[0]; + const Status child_valid = RecurseInto(values); + if (!child_valid.ok()) { + return Status::Invalid("List-view child array is invalid: ", + child_valid.ToString()); + } + // For list-views, sizes are validated together with offsets. 
+ return ValidateOffsetsAndSizes(type, /*offset_limit=*/values.length); + } + template Status ValidateRunEndEncoded(const RunEndEncodedType& type) { if (data.child_data.size() != 2) { @@ -797,23 +811,105 @@ struct ValidateArrayImpl { return Status::OK(); } + private: + /// \pre basic validation has already been performed + template + Status FullyValidateOffsets(int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + auto prev_offset = offsets[0]; + if (prev_offset < 0) { + return Status::Invalid("Offset invariant failure: array starts at negative offset ", + prev_offset); + } + for (int64_t i = 1; i <= data.length; ++i) { + const auto current_offset = offsets[i]; + if (current_offset < prev_offset) { + return Status::Invalid("Offset invariant failure: non-monotonic offset at slot ", + i, ": ", current_offset, " < ", prev_offset); + } + if (current_offset > offset_limit) { + return Status::Invalid("Offset invariant failure: offset for slot ", i, + " out of bounds: ", current_offset, " > ", offset_limit); + } + prev_offset = current_offset; + } + return Status::OK(); + } + + template + Status OutOfBoundsListViewOffset(int64_t slot, int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto offset = offsets[slot]; + return Status::Invalid("Offset invariant failure: offset for slot ", slot, + " out of bounds. 
Expected ", offset, + " to be at least 0 and less than ", offset_limit); + } + + template + Status OutOfBoundsListViewSize(int64_t slot, int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto* sizes = data.GetValues(2); + const auto size = sizes[slot]; + if (size < 0) { + return Status::Invalid("Offset invariant failure: size for slot ", slot, + " out of bounds: ", size, " < 0"); + } else { + const auto offset = offsets[slot]; + return Status::Invalid("Offset invariant failure: size for slot ", slot, + " out of bounds: ", offset, " + ", size, " > ", + offset_limit); + } + } + + /// \pre basic validation has already been performed + template + Status FullyValidateOffsetsAndSizes(int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto* sizes = data.GetValues(2); + + for (int64_t i = 0; i < data.length; ++i) { + const auto size = sizes[i]; + if (size >= 0) { + const auto offset = offsets[i]; + if (offset < 0 || offset > offset_limit) { + return OutOfBoundsListViewOffset(i, offset_limit); + } + if (size > offset_limit - offset) { + return OutOfBoundsListViewSize(i, offset_limit); + } + } else { + return OutOfBoundsListViewSize(i, offset_limit); + } + } + + return Status::OK(); + } + + public: template - Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) { + Status ValidateOffsetsAndSizes(const TypeClass&, int64_t offset_limit) { using offset_type = typename TypeClass::offset_type; + constexpr bool is_list_view = is_list_view_type::value; - if (!IsBufferValid(1)) { - // For length 0, an empty offsets buffer seems accepted as a special case - // (ARROW-544) - if (data.length > 0) { - return Status::Invalid("Non-empty array but offsets are null"); + const bool non_empty = data.length > 0; + if constexpr (is_list_view) { + if (!IsBufferValid(1)) { + return Status::Invalid("offsets buffer is null"); + } + if (!IsBufferValid(2)) { + return Status::Invalid("sizes buffer is null"); + } + } else { + if 
(!IsBufferValid(1)) { + // For length 0, an empty offsets buffer is accepted (ARROW-544). + return non_empty ? Status::Invalid("Non-empty array but offsets are null") + : Status::OK(); } - return Status::OK(); } - // An empty list array can have 0 offsets const auto offsets_byte_size = data.buffers[1]->size(); const auto required_offsets = ((data.length > 0) || (offsets_byte_size > 0)) - ? data.length + data.offset + 1 + ? data.length + data.offset + (is_list_view ? 0 : 1) : 0; if (offsets_byte_size / static_cast(sizeof(offset_type)) < required_offsets) { @@ -821,28 +917,21 @@ struct ValidateArrayImpl { " isn't large enough for length: ", data.length, " and offset: ", data.offset); } + if constexpr (is_list_view) { + const auto required_sizes = data.length + data.offset; + const auto sizes_bytes_size = data.buffers[2]->size(); + if (sizes_bytes_size / static_cast(sizeof(offset_type)) < required_sizes) { + return Status::Invalid("Sizes buffer size (bytes): ", sizes_bytes_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + } if (full_validation && required_offsets > 0) { - // Validate all offset values - const offset_type* offsets = data.GetValues(1); - - auto prev_offset = offsets[0]; - if (prev_offset < 0) { - return Status::Invalid( - "Offset invariant failure: array starts at negative offset ", prev_offset); - } - for (int64_t i = 1; i <= data.length; ++i) { - const auto current_offset = offsets[i]; - if (current_offset < prev_offset) { - return Status::Invalid( - "Offset invariant failure: non-monotonic offset at slot ", i, ": ", - current_offset, " < ", prev_offset); - } - if (current_offset > offset_limit) { - return Status::Invalid("Offset invariant failure: offset for slot ", i, - " out of bounds: ", current_offset, " > ", offset_limit); - } - prev_offset = current_offset; + if constexpr (is_list_view) { + return FullyValidateOffsetsAndSizes(offset_limit); + } else { + return FullyValidateOffsets(offset_limit); } } 
return Status::OK(); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index c7e6207bfefa4..7042d9818c691 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -221,6 +221,20 @@ struct MakeBuilderImpl { return Status::OK(); } + Status Visit(const ListViewType& list_view_type) { + std::shared_ptr value_type = list_view_type.value_type(); + ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type)); + out.reset(new ListViewBuilder(pool, std::move(value_builder), std::move(type))); + return Status::OK(); + } + + Status Visit(const LargeListViewType& large_list_view_type) { + std::shared_ptr value_type = large_list_view_type.value_type(); + ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type)); + out.reset(new LargeListViewBuilder(pool, std::move(value_builder), std::move(type))); + return Status::OK(); + } + Status Visit(const MapType& map_type) { ARROW_ASSIGN_OR_RAISE(auto key_builder, ChildBuilder(map_type.key_type())); ARROW_ASSIGN_OR_RAISE(auto item_builder, ChildBuilder(map_type.item_type())); diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 033371d3d6719..eeec75f2f473d 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -444,6 +444,10 @@ struct SchemaExporter { Status Visit(const LargeListType& type) { return SetFormat("+L"); } + Status Visit(const ListViewType& type) { return SetFormat("+vl"); } + + Status Visit(const LargeListViewType& type) { return SetFormat("+vL"); } + Status Visit(const FixedSizeListType& type) { return SetFormat("+w:" + ToChars(type.list_size())); } @@ -1100,6 +1104,16 @@ struct SchemaImporter { return ProcessListLike(); case 'L': return ProcessListLike(); + case 'v': { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'l': + return ProcessListView(); + case 'L': + return ProcessListView(); + } + break; + } case 'w': return ProcessFixedSizeList(); case 's': @@ -1204,6 +1218,15 @@ struct SchemaImporter { 
return Status::OK(); } + template + Status ProcessListView() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + type_ = std::make_shared(std::move(field)); + return Status::OK(); + } + Status ProcessMap() { RETURN_NOT_OK(f_parser_.CheckAtEnd()); RETURN_NOT_OK(CheckNumChildren(1)); @@ -1572,6 +1595,10 @@ struct ArrayImporter { Status Visit(const LargeListType& type) { return ImportListLike(type); } + Status Visit(const ListViewType& type) { return ImportListView(type); } + + Status Visit(const LargeListViewType& type) { return ImportListView(type); } + Status Visit(const FixedSizeListType& type) { RETURN_NOT_OK(CheckNumChildren(1)); RETURN_NOT_OK(CheckNumBuffers(1)); @@ -1667,6 +1694,18 @@ struct ArrayImporter { return Status::OK(); } + template + Status ImportListView(const ListViewType& type) { + using offset_type = typename ListViewType::offset_type; + RETURN_NOT_OK(CheckNumChildren(1)); + RETURN_NOT_OK(CheckNumBuffers(3)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK((ImportOffsetsBuffer(1))); + RETURN_NOT_OK(ImportSizesBuffer(2)); + return Status::OK(); + } + Status CheckNoChildren() { return CheckNumChildren(0); } Status CheckNumChildren(int64_t n_children) { @@ -1735,11 +1774,18 @@ struct ArrayImporter { return ImportBuffer(buffer_id, buffer_size); } - template + template Status ImportOffsetsBuffer(int32_t buffer_id) { // Compute visible size of buffer - int64_t buffer_size = - sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + 1); + int64_t buffer_size = sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + + (with_extra_offset ? 
1 : 0)); + return ImportBuffer(buffer_id, buffer_size); + } + + template + Status ImportSizesBuffer(int32_t buffer_id) { + // Compute visible size of buffer + int64_t buffer_size = sizeof(OffsetType) * (c_struct_->length + c_struct_->offset); return ImportBuffer(buffer_id, buffer_size); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index bd0e498a9f332..362df833781a1 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -33,6 +33,7 @@ #include "arrow/c/util_internal.h" #include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" +#include "arrow/testing/builder.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" @@ -397,6 +398,14 @@ TEST_F(TestSchemaExport, List) { TestNested(list(large_list(int32())), {"+l", "+L", "i"}, {"", "item", "item"}); } +TEST_F(TestSchemaExport, ListView) { + TestNested(list_view(int8()), {"+vl", "c"}, {"", "item"}); + TestNested(large_list_view(uint16()), {"+vL", "S"}, {"", "item"}); + + TestNested(list_view(large_list_view(int32())), {"+vl", "+vL", "i"}, + {"", "item", "item"}); +} + TEST_F(TestSchemaExport, Struct) { auto type = struct_({field("a", int8()), field("b", utf8())}); TestNested(type, {"+s", "c", "u"}, {"", "a", "b"}, @@ -945,6 +954,33 @@ TEST_F(TestArrayExport, ListSliced) { } } +TEST_F(TestArrayExport, ListView) { + TestNested(list_view(int8()), "[[1, 2], [3, null], null]"); + TestNested(large_list_view(uint16()), "[[1, 2], [3, null], null]"); + TestNested(fixed_size_list(int64(), 2), "[[1, 2], [3, null], null]"); + + TestNested(list_view(large_list_view(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestArrayExport, ListViewSliced) { + { + auto factory = []() { + return ArrayFromJSON(list_view(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = []() { + auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 
7, 8]")->Slice(1, 6); + auto offsets = ArrayFromJSON(int32(), "[5, 2, 0, 3]")->Slice(1, 2); + auto sizes = ArrayFromJSON(int32(), "[2, 3, 6, 1]")->Slice(1, 2); + return ListViewArray::FromArrays(*offsets, *sizes, *values); + }; + TestNested(factory); + } +} + TEST_F(TestArrayExport, Struct) { const char* data = R"([[1, "foo"], [2, null]])"; auto type = struct_({field("a", int8()), field("b", utf8())}); @@ -1490,6 +1526,45 @@ TEST_F(TestDeviceArrayExport, ListSliced) { } } +TEST_F(TestDeviceArrayExport, ListView) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestNested(mm, list_view(int8()), "[[1, 2], [3, null], null]"); + TestNested(mm, large_list_view(uint16()), "[[1, 2], [3, null], null]"); + + TestNested(mm, list_view(large_list_view(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestDeviceArrayExport, ListViewSliced) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + { + auto factory = [=]() { + return (*ToDevice(mm, *ArrayFromJSON(list_view(int8()), + "[[1, 2], [3, null], [4, 5, 6], null]") + ->data())) + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = [=]() { + auto values = + (*ToDevice(mm, + *ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->data())) + ->Slice(1, 6); + auto offsets = + (*ToDevice(mm, *ArrayFromJSON(int32(), "[5, 2, 0, 3]")->data()))->Slice(1, 2); + auto sizes = + (*ToDevice(mm, *ArrayFromJSON(int32(), "[2, 3, 6, 1]")->data()))->Slice(1, 2); + return ListViewArray::FromArrays(*offsets, *sizes, *values); + }; + TestNested(factory); + } +} + TEST_F(TestDeviceArrayExport, Struct) { std::shared_ptr device = std::make_shared(1); auto mm = device->default_memory_manager(); @@ -1930,6 +2005,33 @@ TEST_F(TestSchemaImport, NestedList) { CheckImport(list(fixed_size_list(int8(), 3))); } +TEST_F(TestSchemaImport, ListView) { + FillPrimitive(AddChild(), "c"); + FillListLike("+vl"); + CheckImport(list_view(int8())); + 
+ FillPrimitive(AddChild(), "s", "item", 0); + FillListLike("+vl"); + CheckImport(list_view(field("item", int16(), /*nullable=*/false))); + + // Large list-view + FillPrimitive(AddChild(), "s"); + FillListLike("+vL"); + CheckImport(large_list_view(int16())); +} + +TEST_F(TestSchemaImport, NestedListView) { + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+vl"); + FillListLike("+vL"); + CheckImport(large_list_view(list_view(int8()))); + + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+w:3"); + FillListLike("+vl"); + CheckImport(list_view(fixed_size_list(int8(), 3))); +} + TEST_F(TestSchemaImport, Struct) { FillPrimitive(AddChild(), "u", "strs"); FillPrimitive(AddChild(), "S", "ints"); @@ -2325,6 +2427,18 @@ static const int64_t large_list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; static const void* large_list_buffers_no_nulls1[2] = {nullptr, large_list_offsets_buffer1}; +static const int32_t list_view_offsets_buffer1[] = {0, 2, 2, 5, 6}; +static const int32_t list_view_sizes_buffer1[] = {2, 0, 3, 1, 2}; +static const void* list_view_buffers_no_nulls1[3] = {nullptr, list_view_offsets_buffer1, + list_view_sizes_buffer1}; +static const void* list_view_buffers_nulls1[3] = {bits_buffer1, list_view_offsets_buffer1, + list_view_sizes_buffer1}; + +static const int64_t large_list_view_offsets_buffer1[] = {0, 2, 2, 5, 6}; +static const int64_t large_list_view_sizes_buffer1[] = {2, 0, 3, 1, 2}; +static const void* large_list_view_buffers_no_nulls1[3] = { + nullptr, large_list_view_offsets_buffer1, large_list_view_sizes_buffer1}; + static const int8_t type_codes_buffer1[] = {42, 42, 43, 43, 42}; static const int32_t union_offsets_buffer1[] = {0, 1, 0, 1, 2}; static const void* sparse_union_buffers1_legacy[2] = {nullptr, type_codes_buffer1}; @@ -2407,6 +2521,17 @@ class TestArrayImport : public ::testing::Test { c->children = NLastChildren(1, c); } + void FillListView(struct ArrowArray* c, int64_t length, int64_t null_count, + int64_t offset, const 
void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 3; + c->buffers = buffers; + c->n_children = 1; + c->children = NLastChildren(1, c); + } + void FillFixedSizeListLike(struct ArrowArray* c, int64_t length, int64_t null_count, int64_t offset, const void** buffers) { c->length = length; @@ -2463,6 +2588,11 @@ class TestArrayImport : public ::testing::Test { FillListLike(&c_struct_, length, null_count, offset, buffers); } + void FillListView(int64_t length, int64_t null_count, int64_t offset, + const void** buffers) { + FillListView(&c_struct_, length, null_count, offset, buffers); + } + void FillFixedSizeListLike(int64_t length, int64_t null_count, int64_t offset, const void** buffers) { FillFixedSizeListLike(&c_struct_, length, null_count, offset, buffers); @@ -2820,6 +2950,53 @@ TEST_F(TestArrayImport, ListWithOffset) { "[[6, 7, 8], [9, 10, 11], [12, 13, 14]]")); } +TEST_F(TestArrayImport, ListView) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(5, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[1, 2], [], [3, 4, 5], [6], [7, 8]]")); + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1_16); + FillListView(3, 1, 0, list_view_buffers_nulls1); + CheckImport( + ArrayFromJSON(list_view(int16()), "[[513, 1027], null, [1541, 2055, 2569]]")); + + // Large list-view + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1_16); + FillListView(3, 0, 0, large_list_view_buffers_no_nulls1); + CheckImport( + ArrayFromJSON(large_list_view(int16()), "[[513, 1027], [], [1541, 2055, 2569]]")); +} + +TEST_F(TestArrayImport, NestedListView) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(AddChild(), 5, 0, 0, list_view_buffers_no_nulls1); + FillListView(3, 0, 0, large_list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_list_view(list_view(int8())), + "[[[1, 2], []], [], [[3, 4, 5], 
[6], [7, 8]]]")); + + FillPrimitive(AddChild(), 6, 0, 0, primitive_buffers_no_nulls1_8); + FillFixedSizeListLike(AddChild(), 2, 0, 0, buffers_no_nulls_no_data); + FillListView(2, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(fixed_size_list(int8(), 3)), + "[[[1, 2, 3], [4, 5, 6]], []]")); +} + +TEST_F(TestArrayImport, ListViewWithOffset) { + // Offset in child + FillPrimitive(AddChild(), 8, 0, 1, primitive_buffers_no_nulls1_8); + FillListView(5, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[2, 3], [], [4, 5, 6], [7], [8, 9]]")); + + // Offset in parent + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(4, 0, 1, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[], [3, 4, 5], [6], [7, 8]]")); + + // Both + FillPrimitive(AddChild(), 8, 0, 2, primitive_buffers_no_nulls1_8); + FillListView(4, 0, 1, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[], [5, 6, 7], [8], [9, 10]]")); +} + TEST_F(TestArrayImport, Struct) { FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1_16); @@ -3117,6 +3294,17 @@ TEST_F(TestArrayImport, ListError) { CheckImportError(list(int8())); } +TEST_F(TestArrayImport, ListViewNoError) { + // Unlike with lists, importing a length-0 list-view with all buffers ommitted is + // not an error. List-views don't need an extra offset value, so an empty offsets + // buffer is valid in this case. 
+ + // Null offsets pointer + FillPrimitive(AddChild(), 0, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(list_view(int8()), "[]")); +} + TEST_F(TestArrayImport, MapError) { // Bad number of (struct) children in map child FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1); @@ -3370,6 +3558,12 @@ TEST_F(TestSchemaRoundtrip, List) { TestWithTypeFactory([]() { return list(fixed_size_list(utf8(), 5)); }); } +TEST_F(TestSchemaRoundtrip, ListView) { + TestWithTypeFactory([]() { return list_view(utf8()); }); + TestWithTypeFactory([]() { return large_list_view(list_view(utf8())); }); + TestWithTypeFactory([]() { return list_view(fixed_size_list(utf8(), 5)); }); +} + TEST_F(TestSchemaRoundtrip, Struct) { auto f1 = field("f1", utf8(), /*nullable=*/false); auto f2 = field("f2", list(decimal(19, 4))); @@ -3631,6 +3825,31 @@ TEST_F(TestArrayRoundtrip, List) { TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]"); } +TEST_F(TestArrayRoundtrip, ListView) { + TestWithJSON(list_view(int32()), "[]"); + TestWithJSON(list_view(int32()), "[[4, 5], [6, null], null]"); + + TestWithJSONSliced(list_view(int32()), "[[4, 5], [6, null], null]"); + + // Out-of-order offsets + TestWithArrayFactory([this]() -> Result> { + std::shared_ptr offsets; + ArrayFromVector(int32(), + std::vector{false, true, true, true, false, true}, + std::vector{4, 2, 1, 3, 3, 2}, &offsets); + + std::shared_ptr sizes; + ArrayFromVector(std::vector{2, 2, 3, 1, 2, 0}, &sizes); + + auto values = ArrayFromJSON(int8(), "[4, 5, 6, null, 8, null]"); + auto result = ListViewArray::FromArrays(*offsets, *sizes, *values, pool_); + if (result.ok()) { + RETURN_NOT_OK((*result)->ValidateFull()); + } + return result; + }); +} + TEST_F(TestArrayRoundtrip, Struct) { auto type = struct_({field("ints", int16()), field("bools", boolean())}); TestWithJSON(type, "[]"); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc 
index 50cfdd05a14bb..bb632e2eb912d 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -308,6 +308,10 @@ class RangeDataEqualsImpl { Status Visit(const LargeListType& type) { return CompareList(type); } + Status Visit(const ListViewType& type) { return CompareListView(type); } + + Status Visit(const LargeListViewType& type) { return CompareListView(type); } + Status Visit(const FixedSizeListType& type) { const auto list_size = type.list_size(); const ArrayData& left_data = *left_.child_data[0]; @@ -493,6 +497,38 @@ class RangeDataEqualsImpl { return Status::OK(); } + template + Status CompareListView(const TypeClass& type) { + const ArrayData& left_values = *left_.child_data[0]; + const ArrayData& right_values = *right_.child_data[0]; + + using offset_type = typename TypeClass::offset_type; + const auto* left_offsets = left_.GetValues(1) + left_start_idx_; + const auto* right_offsets = right_.GetValues(1) + right_start_idx_; + const auto* left_sizes = left_.GetValues(2) + left_start_idx_; + const auto* right_sizes = right_.GetValues(2) + right_start_idx_; + + auto compare_view = [&](int64_t i, int64_t length) -> bool { + for (int64_t j = i; j < i + length; ++j) { + if (left_sizes[j] != right_sizes[j]) { + return false; + } + const offset_type size = left_sizes[j]; + if (size == 0) { + continue; + } + RangeDataEqualsImpl impl(options_, floating_approximate_, left_values, + right_values, left_offsets[j], right_offsets[j], size); + if (!impl.Compare()) { + return false; + } + } + return true; + }; + VisitValidRuns(std::move(compare_view)); + return Status::OK(); + } + template Status CompareRunEndEncoded() { auto left_span = ArraySpan(left_); @@ -699,7 +735,8 @@ class TypeEqualsVisitor { } template - enable_if_t::value, Status> Visit(const T& left) { + enable_if_t::value || is_list_view_type::value, Status> Visit( + const T& left) { std::shared_ptr left_field = left.field(0); std::shared_ptr right_field = checked_cast(right_).field(0); bool 
equal_names = !check_metadata_ || (left_field->name() == right_field->name()); @@ -857,6 +894,18 @@ class ScalarEqualsVisitor { return Status::OK(); } + Status Visit(const ListViewScalar& left) { + const auto& right = checked_cast(right_); + result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); + return Status::OK(); + } + + Status Visit(const LargeListViewScalar& left) { + const auto& right = checked_cast(right_); + result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); + return Status::OK(); + } + Status Visit(const MapScalar& left) { const auto& right = checked_cast(right_); result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 6b4b2339e4afe..ee181c053c053 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -82,9 +82,9 @@ std::optional GetConstantValidityWord(const ExecValue& data) { return {}; } -// if the condition is null then output is null otherwise we take validity from the -// selected argument -// ie. cond.valid & (cond.data & left.valid | ~cond.data & right.valid) +/// If the condition is null then output is null otherwise we take validity from the +/// selected argument +/// (i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)). struct IfElseNullPromoter { KernelContext* ctx; const ArraySpan& cond; @@ -368,7 +368,7 @@ void RunIfElseLoopInverted(const ArraySpan& cond, const HandleBlock& handle_bloc } /// Runs if-else when cond is a scalar. Two special functions are required, -/// 1.CopyArrayData, 2. BroadcastScalar +/// 1. CopyArrayData, 2. 
BroadcastScalar template Status RunIfElseScalar(const BooleanScalar& cond, const ExecValue& left, const ExecValue& right, ExecResult* out, @@ -1028,7 +1028,7 @@ struct NestedIfElseExec { // AAA static Status Call(KernelContext* ctx, const ArraySpan& cond, const ArraySpan& left, const ArraySpan& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendArraySlice(left, i, length); @@ -1041,7 +1041,7 @@ struct NestedIfElseExec { // ASA static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar& left, const ArraySpan& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendScalar(left, length); @@ -1054,7 +1054,7 @@ struct NestedIfElseExec { // AAS static Status Call(KernelContext* ctx, const ArraySpan& cond, const ArraySpan& left, const Scalar& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendArraySlice(left, i, length); @@ -1067,7 +1067,7 @@ struct NestedIfElseExec { // ASS static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar& left, const Scalar& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendScalar(left, length); @@ -1078,8 +1078,9 @@ struct NestedIfElseExec { } template - static Status RunLoop(KernelContext* ctx, const ArraySpan& cond, ExecResult* out, - HandleLeft&& handle_left, HandleRight&& handle_right) { + static Status RunLoopOfNestedIfElseExec(KernelContext* ctx, const ArraySpan& cond, + ExecResult* out, HandleLeft&& handle_left, + HandleRight&& handle_right) { std::unique_ptr raw_builder; 
RETURN_NOT_OK(MakeBuilderExactIndex(ctx->memory_pool(), out->type()->GetSharedPtr(), &raw_builder)); @@ -1308,9 +1309,9 @@ void AddFixedWidthIfElseKernel(const std::shared_ptr& scalar_fun } void AddNestedIfElseKernels(const std::shared_ptr& scalar_function) { - for (const auto type_id : - {Type::LIST, Type::LARGE_LIST, Type::FIXED_SIZE_LIST, Type::STRUCT, - Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { + for (const auto type_id : {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, + Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::STRUCT, + Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { ScalarKernel kernel({boolean(), InputType(type_id), InputType(type_id)}, LastType, NestedIfElseExec::Exec); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 34225ce9fe084..b72402bbccd4e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -67,11 +67,15 @@ struct GetBytesProcessedVisitor { } template - enable_if_var_size_list Visit(const ArrowType& type) { + enable_if_var_length_list_like Visit(const ArrowType& type) { using ArrayType = typename TypeTraits::ArrayType; using OffsetType = typename TypeTraits::OffsetType::c_type; - total_bytes += (arr->length() + 1) * sizeof(OffsetType); + const auto num_offsets = is_list_view(type) ? arr->length() : arr->length() + 1; + total_bytes += num_offsets * sizeof(OffsetType); + // NOTE: the sizes buffer is not counted when type is a list-view as that + // can make the throughput numbers look better just because the sizes + // increase the number of bytes in the input. 
auto child_array = internal::checked_cast(arr)->values(); return RecurseInto(child_array.get()); } @@ -126,7 +130,7 @@ static void IfElseBench(benchmark::State& state) { } template -static void IfElseBenchList(benchmark::State& state) { +static void IfElseBenchVarLengthListLike(benchmark::State& state) { auto value_type = TypeTraits::type_singleton(); auto list_type = std::make_shared(value_type); return IfElseBench(state, list_type); @@ -172,7 +176,7 @@ static void IfElseBenchContiguous(benchmark::State& state) { } template -static void IfElseBenchListContiguous(benchmark::State& state) { +static void IfElseBenchVarLengthListLikeContiguous(benchmark::State& state) { auto value_type = TypeTraits::type_singleton(); auto list_type = std::make_shared(value_type); return IfElseBenchContiguous(state, list_type); @@ -187,11 +191,11 @@ static void IfElseBench32(benchmark::State& state) { } static void IfElseBenchListUInt32(benchmark::State& state) { - return IfElseBenchList(state); + return IfElseBenchVarLengthListLike(state); } static void IfElseBenchListString32(benchmark::State& state) { - return IfElseBenchList(state); + return IfElseBenchVarLengthListLike(state); } static void IfElseBenchString32(benchmark::State& state) { @@ -211,11 +215,27 @@ static void IfElseBench32Contiguous(benchmark::State& state) { } static void IfElseBenchListUInt32Contiguous(benchmark::State& state) { - return IfElseBenchListContiguous(state); + return IfElseBenchVarLengthListLikeContiguous(state); } static void IfElseBenchListString32Contiguous(benchmark::State& state) { - return IfElseBenchListContiguous(state); + return IfElseBenchVarLengthListLikeContiguous(state); +} + +static void IfElseBenchListViewUInt32(benchmark::State& state) { + return IfElseBenchVarLengthListLike(state); +} + +static void IfElseBenchListViewString32(benchmark::State& state) { + return IfElseBenchVarLengthListLike(state); +} + +static void IfElseBenchListViewUInt32Contiguous(benchmark::State& state) { + return 
IfElseBenchVarLengthListLikeContiguous(state); +} + +static void IfElseBenchListViewString32Contiguous(benchmark::State& state) { + return IfElseBenchVarLengthListLikeContiguous(state); } static void IfElseBenchString64Contiguous(benchmark::State& state) { @@ -494,6 +514,12 @@ BENCHMARK(IfElseBenchListString32)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchListUInt32Contiguous)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchListString32Contiguous)->Args({kNumItems, 0}); +// IfElse: ListViews +BENCHMARK(IfElseBenchListViewUInt32)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewString32)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewUInt32Contiguous)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewString32Contiguous)->Args({kNumItems, 0}); + // IfElse: Strings BENCHMARK(IfElseBenchString32)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchString64)->Args({kNumItems, 0}); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index a9c5a1fc3c96f..a11aab81742ed 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -737,12 +737,15 @@ TEST_F(TestIfElseKernel, Decimal) { } } +using ListAndListViewArrowTypes = + ::testing::Types; + template -class TestIfElseList : public ::testing::Test {}; +class TestIfElseVarLengthListLike : public ::testing::Test {}; -TYPED_TEST_SUITE(TestIfElseList, ListArrowTypes); +TYPED_TEST_SUITE(TestIfElseVarLengthListLike, ListAndListViewArrowTypes); -TYPED_TEST(TestIfElseList, ListOfInt) { +TYPED_TEST(TestIfElseVarLengthListLike, ListOfInt) { auto type = std::make_shared(int32()); CheckWithDifferentShapes(ArrayFromJSON(boolean(), "[true, true, false, false]"), ArrayFromJSON(type, "[[], null, [1, null], [2, 3]]"), @@ -755,7 +758,7 @@ TYPED_TEST(TestIfElseList, ListOfInt) { ArrayFromJSON(type, "[null, null, null, null]")); } -TYPED_TEST(TestIfElseList, ListOfString) { 
+TYPED_TEST(TestIfElseVarLengthListLike, ListOfString) { auto type = std::make_shared(utf8()); CheckWithDifferentShapes( ArrayFromJSON(boolean(), "[true, true, false, false]"), diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 65ad70181f28a..3afe4ec85cf49 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -98,6 +98,10 @@ parquet::ReaderProperties MakeReaderProperties( parquet_scan_options->reader_properties->thrift_string_size_limit()); properties.set_thrift_container_size_limit( parquet_scan_options->reader_properties->thrift_container_size_limit()); + + properties.set_page_checksum_verification( + parquet_scan_options->reader_properties->page_checksum_verification()); + return properties; } diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index d395261597696..5d892af9a394e 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -807,6 +807,14 @@ struct ScalarToProtoImpl { return Status::OK(); } + Status Visit(const ListViewScalar& s) { + return Status::NotImplemented("list-view to proto"); + } + + Status Visit(const LargeListViewScalar& s) { + return Status::NotImplemented("list-view to proto"); + } + Status Visit(const StructScalar& s) { lit_->set_allocated_struct_(new Lit::Struct()); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index d3fb058137e6a..f4a2e6800eb49 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -313,6 +313,10 @@ struct DataTypeToProtoImpl { return Status::OK(); } + Status Visit(const ListViewType& t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + Status Visit(const StructType& t) { auto types = 
SetWithThen(&substrait::Type::set_allocated_struct_)->mutable_types(); diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 91e6c22255aa5..2c3d81ca24c51 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -24,6 +24,7 @@ #include "arrow/buffer.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" +#include "arrow/io/util_internal.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" #include "arrow/util/formatting.h" @@ -43,7 +44,8 @@ AzureOptions::AzureOptions() {} bool AzureOptions::Equals(const AzureOptions& other) const { return (account_dfs_url == other.account_dfs_url && account_blob_url == other.account_blob_url && - credentials_kind == other.credentials_kind); + credentials_kind == other.credentials_kind && + default_metadata == other.default_metadata); } Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_name, @@ -461,6 +463,225 @@ class ObjectInputFile final : public io::RandomAccessFile { int64_t content_length_ = kNoSize; std::shared_ptr metadata_; }; + +Status CreateEmptyBlockBlob( + std::shared_ptr block_blob_client) { + try { + block_blob_client->UploadFrom(nullptr, 0); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "UploadFrom failed for '" + block_blob_client->GetUrl() + + "' with an unexpected Azure error. There is no existing blob at this " + "location or the existing blob must be replaced so ObjectAppendStream must " + "create a new empty block blob.", + exception); + } + return Status::OK(); +} + +Result GetBlockList( + std::shared_ptr block_blob_client) { + try { + return block_blob_client->GetBlockList().Value; + } catch (Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "GetBlockList failed for '" + block_blob_client->GetUrl() + + "' with an unexpected Azure error. 
Cannot write to a file without first " + "fetching the existing block list.", + exception); + } +} + +Azure::Storage::Metadata ArrowMetadataToAzureMetadata( + const std::shared_ptr& arrow_metadata) { + Azure::Storage::Metadata azure_metadata; + for (auto key_value : arrow_metadata->sorted_pairs()) { + azure_metadata[key_value.first] = key_value.second; + } + return azure_metadata; +} + +Status CommitBlockList( + std::shared_ptr block_blob_client, + const std::vector& block_ids, const Azure::Storage::Metadata& metadata) { + Azure::Storage::Blobs::CommitBlockListOptions options; + options.Metadata = metadata; + try { + // CommitBlockList puts all block_ids in the latest element. That means in the case of + // overlapping block_ids the newly staged block ids will always replace the + // previously committed blocks. + // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block-list?tabs=microsoft-entra-id#request-body + block_blob_client->CommitBlockList(block_ids, options); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "CommitBlockList failed for '" + block_blob_client->GetUrl() + + "' with an unexpected Azure error. 
Committing is required to flush an " + "output/append stream.", + exception); + } + return Status::OK(); +} + +class ObjectAppendStream final : public io::OutputStream { + public: + ObjectAppendStream( + std::shared_ptr block_blob_client, + const io::IOContext& io_context, const AzureLocation& location, + const std::shared_ptr& metadata, + const AzureOptions& options, int64_t size = kNoSize) + : block_blob_client_(std::move(block_blob_client)), + io_context_(io_context), + location_(location), + content_length_(size) { + if (metadata && metadata->size() != 0) { + metadata_ = ArrowMetadataToAzureMetadata(metadata); + } else if (options.default_metadata && options.default_metadata->size() != 0) { + metadata_ = ArrowMetadataToAzureMetadata(options.default_metadata); + } + } + + ~ObjectAppendStream() override { + // For compliance with the rest of the IO stack, Close rather than Abort, + // even though it may be more expensive. + io::internal::CloseFromDestructor(this); + } + + Status Init() { + if (content_length_ != kNoSize) { + DCHECK_GE(content_length_, 0); + pos_ = content_length_; + } else { + try { + auto properties = block_blob_client_->GetProperties(); + content_length_ = properties.Value.BlobSize; + pos_ = content_length_; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client_)); + } else { + return internal::ExceptionToStatus( + "GetProperties failed for '" + block_blob_client_->GetUrl() + + "' with an unexpected Azure error. 
Can not initialise an " + "ObjectAppendStream without knowing whether a file already exists at " + "this path, and if it exists, its size.", + exception); + } + content_length_ = 0; + } + } + if (content_length_ > 0) { + ARROW_ASSIGN_OR_RAISE(auto block_list, GetBlockList(block_blob_client_)); + for (auto block : block_list.CommittedBlocks) { + block_ids_.push_back(block.Name); + } + } + return Status::OK(); + } + + Status Abort() override { + if (closed_) { + return Status::OK(); + } + block_blob_client_ = nullptr; + closed_ = true; + return Status::OK(); + } + + Status Close() override { + if (closed_) { + return Status::OK(); + } + RETURN_NOT_OK(Flush()); + block_blob_client_ = nullptr; + closed_ = true; + return Status::OK(); + } + + bool closed() const override { return closed_; } + + Status CheckClosed(const char* action) const { + if (closed_) { + return Status::Invalid("Cannot ", action, " on closed stream."); + } + return Status::OK(); + } + + Result Tell() const override { + RETURN_NOT_OK(CheckClosed("tell")); + return pos_; + } + + Status Write(const std::shared_ptr& buffer) override { + return DoAppend(buffer->data(), buffer->size(), buffer); + } + + Status Write(const void* data, int64_t nbytes) override { + return DoAppend(data, nbytes); + } + + Status Flush() override { + RETURN_NOT_OK(CheckClosed("flush")); + return CommitBlockList(block_blob_client_, block_ids_, metadata_); + } + + private: + Status DoAppend(const void* data, int64_t nbytes, + std::shared_ptr owned_buffer = nullptr) { + RETURN_NOT_OK(CheckClosed("append")); + auto append_data = reinterpret_cast(data); + Azure::Core::IO::MemoryBodyStream block_content(append_data, nbytes); + if (block_content.Length() == 0) { + return Status::OK(); + } + + const auto n_block_ids = block_ids_.size(); + + // New block ID must always be distinct from the existing block IDs. Otherwise we + // will accidentally replace the content of existing blocks, causing corruption. 
+ // We will use monotonically increasing integers. + auto new_block_id = std::to_string(n_block_ids); + + // Pad to 5 digits, because Azure allows a maximum of 50,000 blocks. + const size_t target_number_of_digits = 5; + const auto required_padding_digits = + target_number_of_digits - std::min(target_number_of_digits, new_block_id.size()); + new_block_id.insert(0, required_padding_digits, '0'); + // There is a small risk when appending to a blob created by another client that + // `new_block_id` may overlapping with an existing block id. Adding the `-arrow` + // suffix significantly reduces the risk, but does not 100% eliminate it. For example + // if the blob was previously created with one block, with id `00001-arrow` then the + // next block we append will conflict with that, and cause corruption. + new_block_id += "-arrow"; + new_block_id = Azure::Core::Convert::Base64Encode( + std::vector(new_block_id.begin(), new_block_id.end())); + + try { + block_blob_client_->StageBlock(new_block_id, block_content); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "StageBlock failed for '" + block_blob_client_->GetUrl() + "' new_block_id: '" + + new_block_id + + "' with an unexpected Azure error. 
Staging new blocks is fundamental to " + "streaming writes to blob storage.", + exception); + } + block_ids_.push_back(new_block_id); + pos_ += nbytes; + content_length_ += nbytes; + return Status::OK(); + } + + std::shared_ptr block_blob_client_; + const io::IOContext io_context_; + const AzureLocation location_; + + bool closed_ = false; + int64_t pos_ = 0; + int64_t content_length_ = kNoSize; + std::vector block_ids_; + Azure::Storage::Metadata metadata_; +}; + } // namespace // ----------------------------------------------------------------------- @@ -708,20 +929,46 @@ class AzureFileSystem::Impl { return Status::OK(); } - auto directory_client = - datalake_service_client_->GetFileSystemClient(location.container) - .GetDirectoryClient(location.path); - try { - directory_client.CreateIfNotExists(); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to create a directory: " + location.path + " (" + - directory_client.GetUrl() + ")", - exception); + if (!location.path.empty()) { + auto directory_client = + datalake_service_client_->GetFileSystemClient(location.container) + .GetDirectoryClient(location.path); + try { + directory_client.CreateIfNotExists(); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to create a directory: " + location.path + " (" + + directory_client.GetUrl() + ")", + exception); + } } return Status::OK(); } + + Result> OpenAppendStream( + const AzureLocation& location, + const std::shared_ptr& metadata, const bool truncate, + AzureFileSystem* fs) { + RETURN_NOT_OK(ValidateFileLocation(location)); + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(location.path)); + + auto block_blob_client = std::make_shared( + blob_service_client_->GetBlobContainerClient(location.container) + .GetBlockBlobClient(location.path)); + + std::shared_ptr stream; + if (truncate) { + RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client)); + 
stream = std::make_shared(block_blob_client, fs->io_context(), + location, metadata, options_, 0); + } else { + stream = std::make_shared(block_blob_client, fs->io_context(), + location, metadata, options_); + } + RETURN_NOT_OK(stream->Init()); + return stream; + } }; const AzureOptions& AzureFileSystem::options() const { return impl_->options(); } @@ -803,12 +1050,14 @@ Result> AzureFileSystem::OpenInputFile( Result> AzureFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->OpenAppendStream(location, metadata, true, this); } Result> AzureFileSystem::OpenAppendStream( - const std::string&, const std::shared_ptr&) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + const std::string& path, const std::shared_ptr& metadata) { + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->OpenAppendStream(location, metadata, false, this); } Result> AzureFileSystem::Make( diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 1f7047ff94c56..9f980ee8baae0 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -77,6 +77,11 @@ struct ARROW_EXPORT AzureOptions { std::shared_ptr service_principle_credentials_provider; + /// \brief Default metadata for OpenOutputStream. + /// + /// This will be ignored if non-empty metadata is passed to OpenOutputStream. 
+ std::shared_ptr default_metadata; + AzureOptions(); Status ConfigureAccountKeyCredentials(const std::string& account_name, diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index ecf0a19f684eb..e9b9a6f34b88c 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -232,13 +232,11 @@ class AzureFileSystemTest : public ::testing::Test { void UploadLines(const std::vector& lines, const char* path_to_file, int total_size) { - // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - auto blob_client = - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); - std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); - blob_client.UploadFrom(reinterpret_cast(all_lines.data()), - total_size); + const auto path = PreexistingContainerPath() + path_to_file; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + const auto all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); + ASSERT_OK(output->Write(all_lines)); + ASSERT_OK(output->Close()); } void RunGetFileInfoObjectWithNestedStructureTest(); @@ -347,21 +345,26 @@ void AzureFileSystemTest::RunGetFileInfoObjectWithNestedStructureTest() { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; - // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. 
- blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(kObjectName) - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + ASSERT_OK_AND_ASSIGN( + auto output, + fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName, /*metadata=*/{})); + const std::string_view data(kLoremIpsum); + ASSERT_OK(output->Write(data)); + ASSERT_OK(output->Close()); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. - // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient("test-object-dir/some_other_dir0") - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); - - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(std::string(kObjectName) + "0") - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + ASSERT_OK_AND_ASSIGN(output, + fs_->OpenOutputStream( + PreexistingContainerPath() + "test-object-dir/some_other_dir0", + /*metadata=*/{})); + ASSERT_OK(output->Write(data)); + ASSERT_OK(output->Close()); + ASSERT_OK_AND_ASSIGN( + output, fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName + "0", + /*metadata=*/{})); + ASSERT_OK(output->Write(data)); + ASSERT_OK(output->Close()); AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", @@ -647,6 +650,157 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamClosed) { ASSERT_RAISES(Invalid, stream->Tell()); } +TEST_F(AzuriteFileSystemTest, TestWriteMetadata) { + options_.default_metadata = arrow::key_value_metadata({{"foo", "bar"}}); + + ASSERT_OK_AND_ASSIGN(auto fs_with_defaults, AzureFileSystem::Make(options_)); + std::string path = "object_with_defaults"; + auto location = PreexistingContainerPath() + path; + 
ASSERT_OK_AND_ASSIGN(auto output, + fs_with_defaults->OpenOutputStream(location, /*metadata=*/{})); + const std::string_view expected(kLoremIpsum); + ASSERT_OK(output->Write(expected)); + ASSERT_OK(output->Close()); + + // Verify the metadata has been set. + auto blob_metadata = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path) + .GetProperties() + .Value.Metadata; + EXPECT_EQ(Azure::Core::CaseInsensitiveMap{std::make_pair("foo", "bar")}, blob_metadata); + + // Check that explicit metadata overrides the defaults. + ASSERT_OK_AND_ASSIGN( + output, fs_with_defaults->OpenOutputStream( + location, /*metadata=*/arrow::key_value_metadata({{"bar", "foo"}}))); + ASSERT_OK(output->Write(expected)); + ASSERT_OK(output->Close()); + blob_metadata = blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path) + .GetProperties() + .Value.Metadata; + // Defaults are overwritten and not merged. + EXPECT_EQ(Azure::Core::CaseInsensitiveMap{std::make_pair("bar", "foo")}, blob_metadata); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamSmall) { + const auto path = PreexistingContainerPath() + "test-write-object"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + const std::string_view expected(kLoremIpsum); + ASSERT_OK(output->Write(expected)); + ASSERT_OK(output->Close()); + + // Verify we can read the object back. 
+ ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + + std::array inbuf{}; + ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); + + EXPECT_EQ(expected, std::string_view(inbuf.data(), size)); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamLarge) { + const auto path = PreexistingContainerPath() + "test-write-object"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + std::array sizes{257 * 1024, 258 * 1024, 259 * 1024}; + std::array buffers{ + std::string(sizes[0], 'A'), + std::string(sizes[1], 'B'), + std::string(sizes[2], 'C'), + }; + auto expected = std::int64_t{0}; + for (auto i = 0; i != 3; ++i) { + ASSERT_OK(output->Write(buffers[i])); + expected += sizes[i]; + ASSERT_EQ(expected, output->Tell()); + } + ASSERT_OK(output->Close()); + + // Verify we can read the object back. + ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + + std::string contents; + std::shared_ptr buffer; + do { + ASSERT_OK_AND_ASSIGN(buffer, input->Read(128 * 1024)); + ASSERT_TRUE(buffer); + contents.append(buffer->ToString()); + } while (buffer->size() != 0); + + EXPECT_EQ(contents, buffers[0] + buffers[1] + buffers[2]); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamTruncatesExistingFile) { + const auto path = PreexistingContainerPath() + "test-write-object"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + const std::string_view expected0("Existing blob content"); + ASSERT_OK(output->Write(expected0)); + ASSERT_OK(output->Close()); + + // Check that the initial content has been written - if not this test is not achieving + // what it's meant to. 
+ ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + + std::array inbuf{}; + ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); + EXPECT_EQ(expected0, std::string_view(inbuf.data(), size)); + + ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(path, {})); + const std::string_view expected1(kLoremIpsum); + ASSERT_OK(output->Write(expected1)); + ASSERT_OK(output->Close()); + + // Verify that the initial content has been overwritten. + ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); + EXPECT_EQ(expected1, std::string_view(inbuf.data(), size)); +} + +TEST_F(AzuriteFileSystemTest, OpenAppendStreamDoesNotTruncateExistingFile) { + const auto path = PreexistingContainerPath() + "test-write-object"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + const std::string_view expected0("Existing blob content"); + ASSERT_OK(output->Write(expected0)); + ASSERT_OK(output->Close()); + + // Check that the initial content has been written - if not this test is not achieving + // what it's meant to. + ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + + std::array inbuf{}; + ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); + EXPECT_EQ(expected0, std::string_view(inbuf.data())); + + ASSERT_OK_AND_ASSIGN(output, fs_->OpenAppendStream(path, {})); + const std::string_view expected1(kLoremIpsum); + ASSERT_OK(output->Write(expected1)); + ASSERT_OK(output->Close()); + + // Verify that the initial content has not been overwritten and that the block from + // the other client was not committed. 
+ ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); + EXPECT_EQ(std::string(inbuf.data(), size), + std::string(expected0) + std::string(expected1)); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamClosed) { + const auto path = internal::ConcatAbstractPath(PreexistingContainerName(), + "open-output-stream-closed.txt"); + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK(output->Close()); + ASSERT_RAISES(Invalid, output->Write(kLoremIpsum, std::strlen(kLoremIpsum))); + ASSERT_RAISES(Invalid, output->Flush()); + ASSERT_RAISES(Invalid, output->Tell()); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamUri) { + const auto path = internal::ConcatAbstractPath(PreexistingContainerName(), + "open-output-stream-uri.txt"); + ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + path)); +} + TEST_F(AzuriteFileSystemTest, OpenInputFileMixedReadVsReadAt) { // Create a file large enough to make the random access tests non-trivial. 
auto constexpr kLineWidth = 100; diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index e023e6a3a44d3..9b56928c68843 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -793,8 +793,6 @@ void CheckPrimitive(const std::shared_ptr& type, } TEST(TestJsonSchemaWriter, FlatTypes) { - // TODO - // field("f14", date32()) std::vector> fields = { field("f0", int8()), field("f1", int16(), false), @@ -822,6 +820,8 @@ TEST(TestJsonSchemaWriter, FlatTypes) { field("f21", run_end_encoded(int16(), utf8())), field("f22", run_end_encoded(int32(), utf8())), field("f23", run_end_encoded(int64(), utf8())), + field("f24", list_view(int32())), + field("f25", large_list_view(uint8())), }; auto schema = ::arrow::schema(fields); @@ -1147,10 +1147,12 @@ TEST_P(TestJsonRoundTrip, RoundTrip) { const std::vector kBatchCases = { &MakeIntRecordBatch, &MakeListRecordBatch, + &MakeListViewRecordBatch, &MakeFixedSizeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeDeeplyNestedListView, &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index 59749c36a958e..64eb342d5bd47 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -236,7 +236,7 @@ class SchemaWriter { enable_if_t::value || is_primitive_ctype::value || is_base_binary_type::value || is_binary_view_like_type::value || is_var_length_list_type::value || is_struct_type::value || - is_run_end_encoded_type::value> + is_run_end_encoded_type::value || is_list_view_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -422,6 +422,16 @@ class SchemaWriter { return Status::OK(); } + Status Visit(const ListViewType& type) { + WriteName("listview", type); + 
return Status::OK(); + } + + Status Visit(const LargeListViewType& type) { + WriteName("largelistview", type); + return Status::OK(); + } + Status Visit(const MapType& type) { WriteName("map", type); return Status::OK(); @@ -777,6 +787,15 @@ class ArrayWriter { return WriteChildren(array.type()->fields(), {array.values()}); } + template + enable_if_list_view Visit( + const ArrayType& array) { + WriteValidityField(array); + WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length()); + WriteIntegerField("SIZE", array.raw_value_sizes(), array.length()); + return WriteChildren(array.type()->fields(), {array.values()}); + } + Status Visit(const FixedSizeListArray& array) { WriteValidityField(array); const auto& type = checked_cast(*array.type()); @@ -1132,6 +1151,16 @@ Result> GetType(const RjObject& json_type, return Status::Invalid("Large list must have exactly one child"); } return large_list(children[0]); + } else if (type_name == "listview") { + if (children.size() != 1) { + return Status::Invalid("List-view must have exactly one child"); + } + return list_view(children[0]); + } else if (type_name == "largelistview") { + if (children.size() != 1) { + return Status::Invalid("Large list-view must have exactly one child"); + } + return large_list_view(children[0]); } else if (type_name == "map") { return GetMap(json_type, children); } else if (type_name == "fixedsizelist") { @@ -1651,6 +1680,26 @@ class ArrayReader { return CreateList(type_); } + template + Status CreateListView(const std::shared_ptr& type) { + using offset_type = typename T::offset_type; + + RETURN_NOT_OK(InitializeData(3)); + + RETURN_NOT_OK(GetNullBitmap()); + ARROW_ASSIGN_OR_RAISE(const auto json_offsets, GetMemberArray(obj_, "OFFSET")); + RETURN_NOT_OK(GetIntArray(json_offsets, length_, &data_->buffers[1])); + ARROW_ASSIGN_OR_RAISE(const auto json_sizes, GetMemberArray(obj_, "SIZE")); + RETURN_NOT_OK(GetIntArray(json_sizes, length_, &data_->buffers[2])); + 
RETURN_NOT_OK(GetChildren(obj_, *type)); + return Status::OK(); + } + + template + enable_if_list_view Visit(const T& type) { + return CreateListView(type_); + } + Status Visit(const MapType& type) { auto list_type = std::make_shared(type.value_field()); RETURN_NOT_OK(CreateList(list_type)); diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index d13e0714cbf5a..5b760a2b5a9cf 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -159,6 +159,8 @@ class ARROW_EXPORT BufferReader BufferReader(const uint8_t* data, int64_t size); /// \brief Instantiate from std::string_view. Does not own data + /// \deprecated Deprecated in 14.0.0. Use FromString or + /// BufferReader(std::shared_ptr buffer) instead. ARROW_DEPRECATED( "Deprecated in 14.0.0. Use FromString or BufferReader(std::shared_ptr " "buffer) instead.") diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 0b6ae4f620647..80e441fe2b670 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -329,9 +329,11 @@ namespace { const std::vector kBatchCases = { &ipc::test::MakeIntRecordBatch, &ipc::test::MakeListRecordBatch, + &ipc::test::MakeListViewRecordBatch, &ipc::test::MakeFixedSizeListRecordBatch, &ipc::test::MakeNonNullRecordBatch, &ipc::test::MakeDeeplyNestedList, + &ipc::test::MakeDeeplyNestedListView, &ipc::test::MakeStringTypesRecordBatchWithNulls, &ipc::test::MakeStruct, &ipc::test::MakeUnion, diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc index 50be10991ff9f..682c352132a11 100644 --- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc @@ -74,6 +74,8 @@ Result>> Batches() { batches.push_back(batch); RETURN_NOT_OK(test::MakeListRecordBatch(&batch)); batches.push_back(batch); + RETURN_NOT_OK(test::MakeListViewRecordBatch(&batch)); + batches.push_back(batch); RETURN_NOT_OK(test::MakeDictionary(&batch)); 
batches.push_back(batch); RETURN_NOT_OK(test::MakeTimestamps(&batch)); diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 4d2d803f3f65e..ceeabe01677ed 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -123,12 +123,16 @@ Status GetConverter(const std::shared_ptr&, std::shared_ptr template class ConcreteConverter : public Converter { public: - Status AppendValues(const rj::Value& json_array) override { - auto self = static_cast(this); - if (!json_array.IsArray()) { - return JSONTypeError("array", json_array.GetType()); + Result SizeOfJSONArray(const rj::Value& json_obj) { + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); } - auto size = json_array.Size(); + return json_obj.Size(); + } + + Status AppendValues(const rj::Value& json_array) final { + auto self = static_cast(this); + ARROW_ASSIGN_OR_RAISE(auto size, SizeOfJSONArray(json_array)); for (uint32_t i = 0; i < size; ++i) { RETURN_NOT_OK(self->AppendValue(json_array[i])); } @@ -536,15 +540,19 @@ class FixedSizeBinaryConverter final // Converter for list arrays template -class ListConverter final : public ConcreteConverter> { +class VarLengthListLikeConverter final + : public ConcreteConverter> { public: using BuilderType = typename TypeTraits::BuilderType; - explicit ListConverter(const std::shared_ptr& type) { this->type_ = type; } + explicit VarLengthListLikeConverter(const std::shared_ptr& type) { + this->type_ = type; + } Status Init() override { - const auto& list_type = checked_cast(*this->type_); - RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + const auto& var_length_list_like_type = checked_cast(*this->type_); + RETURN_NOT_OK( + GetConverter(var_length_list_like_type.value_type(), &child_converter_)); auto child_builder = child_converter_->builder(); builder_ = std::make_shared(default_memory_pool(), child_builder, this->type_); @@ -555,8 +563,9 @@ class ListConverter final 
: public ConcreteConverter> { if (json_obj.IsNull()) { return this->AppendNull(); } - RETURN_NOT_OK(builder_->Append()); // Extend the child converter with this JSON array + ARROW_ASSIGN_OR_RAISE(auto size, this->SizeOfJSONArray(json_obj)); + RETURN_NOT_OK(builder_->Append(true, size)); return child_converter_->AppendValues(json_obj); } @@ -898,8 +907,11 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) - SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) - SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, ListConverter) + SIMPLE_CONVERTER_CASE(Type::LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LIST_VIEW, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST_VIEW, + VarLengthListLikeConverter) SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index b67c26999945b..ea3a9ae1a14a9 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -59,6 +59,9 @@ using ::arrow::internal::BytesToBits; using ::arrow::internal::checked_cast; using ::arrow::internal::checked_pointer_cast; +using ListAndListViewTypes = + ::testing::Types; + // Avoid undefined behaviour on signed overflow template Signed SafeSignedAdd(Signed u, Signed v) { @@ -591,145 +594,207 @@ TEST(TestDecimal, Dictionary) { } } -TEST(TestList, IntegerList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(int64()); - std::shared_ptr offsets, values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - 
ArrayFromVector({}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 2, 2, 3}, &offsets); - ArrayFromVector({4, 5, 6}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); +template +class TestVarLengthListArray : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + using OffsetType = typename TypeTraits::OffsetType; + + static constexpr bool is_list_view_type = is_list_view(TypeClass::type_id); + + void TestIntegerList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr offsets, sizes, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + ArrayFromVector({}, &values); + if 
constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 2, 2, 3}, &offsets); + ArrayFromVector({4, 5, 6}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 0, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + auto is_valid = std::vector{false, true, false}; + ArrayFromVector(is_valid, {0, 6, 0}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); } - AssertArraysEqual(*expected, *actual); -} - -TEST(TestList, IntegerListErrors) { - std::shared_ptr type = list(int64()); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, 
ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); -} - -TEST(TestList, NullList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(null()); - std::shared_ptr offsets, values, expected, actual; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - values = std::make_shared(0); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); + void TestIntegerListErrors() { + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr array; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - values = std::make_shared(3); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); + } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); + void TestNullList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(null()); + std::shared_ptr offsets, sizes, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + values = std::make_shared(0); + if constexpr (is_list_view_type) { + 
ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + values = std::make_shared(3); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); } - AssertArraysEqual(*expected, *actual); -} -TEST(TestList, IntegerListList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(list(uint8())); - std::shared_ptr offsets, values, nested, expected, actual; + void TestIntegerListList() { + auto pool = default_memory_pool(); + std::shared_ptr type = + std::make_shared(std::make_shared(uint8())); + std::shared_ptr offsets, sizes, values, nested, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({1, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, 
*sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } + ArrayFromVector({0, 2, 3}, &offsets); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } + ASSERT_EQ(actual->length(), 2); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN( + actual, ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 0, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } + ArrayFromVector({0, 0, 1, 4, 5}, &offsets); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 3, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } + ASSERT_EQ(actual->length(), 4); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto& child_builder = checked_cast(*list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.Append(true, 0)); + ASSERT_OK(list_builder.Finish(&expected)); + } + } +}; - 
ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ListArray::FromArrays(*offsets, *values, pool)); - ArrayFromVector({0, 2, 3}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *nested, pool)); - ASSERT_EQ(actual->length(), 2); - AssertArraysEqual(*expected, *actual); +TYPED_TEST_SUITE(TestVarLengthListArray, ListAndListViewTypes); - ASSERT_OK_AND_ASSIGN(actual, - ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ListArray::FromArrays(*offsets, *values, pool)); - ArrayFromVector({0, 0, 1, 4, 5}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *nested, pool)); - ASSERT_EQ(actual->length(), 4); - AssertArraysEqual(*expected, *actual); +TYPED_TEST(TestVarLengthListArray, IntegerList) { this->TestIntegerList(); } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - auto& child_builder = checked_cast(*list_builder.value_builder()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.Append()); - ASSERT_OK(list_builder.Finish(&expected)); - } -} +TYPED_TEST(TestVarLengthListArray, IntegerListErrors) { this->TestIntegerListErrors(); } -TEST(TestLargeList, Basics) { - // Similar as TestList above, only testing the basics - auto pool = default_memory_pool(); - std::shared_ptr type = large_list(int16()); - std::shared_ptr offsets, values, 
expected, actual; +TYPED_TEST(TestVarLengthListArray, NullList) { this->TestNullList(); } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, LargeListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); -} +TYPED_TEST(TestVarLengthListArray, IntegerListList) { this->TestIntegerListList(); } TEST(TestMap, IntegerToInteger) { auto type = map(int16(), int16()); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index ab1a58dd1df8b..4f41edf8e15db 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -361,6 +361,18 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, } *out = std::make_shared(children[0]); return Status::OK(); + case flatbuf::Type::ListView: + if (children.size() != 1) { + return Status::Invalid("ListView must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); + case flatbuf::Type::LargeListView: + if (children.size() != 1) { + return Status::Invalid("LargeListView must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); case flatbuf::Type::Map: if (children.size() != 1) { return Status::Invalid("Map must have exactly 1 child field"); @@ -669,6 +681,20 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const ListViewType& type) { + fb_type_ = flatbuf::Type::ListView; + RETURN_NOT_OK(VisitChildFields(type)); + type_offset_ = flatbuf::CreateListView(fbb_).Union(); + return Status::OK(); + } + + Status Visit(const LargeListViewType& type) { + fb_type_ = flatbuf::Type::LargeListView; + RETURN_NOT_OK(VisitChildFields(type)); + type_offset_ = 
flatbuf::CreateLargeListView(fbb_).Union(); + return Status::OK(); + } + Status Visit(const MapType& type) { fb_type_ = flatbuf::Type::Map; RETURN_NOT_OK(VisitChildFields(type)); diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 313346b5deced..5c15cb912e4a7 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -376,10 +376,12 @@ TEST_F(TestSchemaMetadata, MetadataVersionForwardCompatibility) { const std::vector kBatchCases = { &MakeIntRecordBatch, &MakeListRecordBatch, + &MakeListViewRecordBatch, &MakeFixedSizeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeDeeplyNestedListView, &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, @@ -974,6 +976,9 @@ TEST_F(TestWriteRecordBatch, IntegerGetRecordBatchSize) { ASSERT_OK(MakeListRecordBatch(&batch)); TestGetRecordBatchSize(options_, batch); + ASSERT_OK(MakeListViewRecordBatch(&batch)); + TestGetRecordBatchSize(options_, batch); + ASSERT_OK(MakeZeroLengthRecordBatch(&batch)); TestGetRecordBatchSize(options_, batch); @@ -982,6 +987,9 @@ TEST_F(TestWriteRecordBatch, IntegerGetRecordBatchSize) { ASSERT_OK(MakeDeeplyNestedList(&batch)); TestGetRecordBatchSize(options_, batch); + + ASSERT_OK(MakeDeeplyNestedListView(&batch)); + TestGetRecordBatchSize(options_, batch); } class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 5dd01f2015dd7..2ea2a4bd125c2 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -330,6 +330,22 @@ class ArrayLoader { return LoadChildren(type.fields()); } + template + Status LoadListView(const TYPE& type) { + out_->buffers.resize(3); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[2])); + + const int num_children =
type.num_fields(); + if (num_children != 1) { + return Status::Invalid("Wrong number of children: ", num_children); + } + + return LoadChildren(type.fields()); + } + Status LoadChildren(const std::vector>& child_fields) { DCHECK_NE(out_, nullptr); ArrayData* parent = out_; @@ -392,6 +408,11 @@ class ArrayLoader { return LoadList(type); } + template + enable_if_list_view Visit(const T& type) { + return LoadListView(type); + } + Status Visit(const MapType& type) { RETURN_NOT_OK(LoadList(type)); return MapArray::ValidateChildData(out_->child_data); diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 6faaf96b332d4..87c02e2d87a1e 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -189,6 +189,32 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li return MakeListArray(child_array, num_lists, include_nulls, pool, out); } +Status MakeRandomListViewArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const auto seed = static_cast(child_array->length()); + random::RandomArrayGenerator rand(seed); + + const double null_probability = include_nulls ? 0.5 : 0.0; + *out = rand.ListView(*child_array, /*length=*/num_lists, null_probability, + /*force_empty_nulls=*/false, /*coverage=*/0.9, + kDefaultBufferAlignment, pool); + return Status::OK(); +} + +Status MakeRandomLargeListViewArray(const std::shared_ptr& child_array, + int num_lists, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const auto seed = static_cast(child_array->length()); + random::RandomArrayGenerator rand(seed); + + const double null_probability = include_nulls ? 
0.5 : 0.0; + *out = rand.LargeListView(*child_array, /*length=*/num_lists, null_probability, + /*force_empty_nulls=*/false, + /*coverage=*/0.9, kDefaultBufferAlignment, pool); + return Status::OK(); +} + Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { @@ -418,6 +444,31 @@ Status MakeListRecordBatch(std::shared_ptr* out) { return Status::OK(); } +Status MakeListViewRecordBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = field("f0", list_view(int32())); + auto f1 = field("f1", list_view(list_view(int32()))); + auto f2 = field("f2", large_list_view(int32())); + auto schema = ::arrow::schema({f0, f1, f2}); + + // Example data + + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, large_list_array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK( + MakeRandomListViewArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListViewArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomLargeListViewArray(leaf_values, length, include_nulls, pool, + &large_list_array)); + *out = + RecordBatch::Make(schema, length, {list_array, list_list_array, large_list_array}); + return Status::OK(); +} + Status MakeFixedSizeListRecordBatch(std::shared_ptr* out) { // Make the schema auto f0 = field("f0", fixed_size_list(int32(), 1)); @@ -505,6 +556,27 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { return Status::OK(); } +Status MakeDeeplyNestedListView(std::shared_ptr* out) { + const int batch_length = 5; + auto type = int32(); + + MemoryPool* pool = default_memory_pool(); + std::shared_ptr array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); + for (int i = 0; i < 63; ++i)
{ + type = std::static_pointer_cast(list_view(type)); + RETURN_NOT_OK( + MakeRandomListViewArray(array, batch_length, include_nulls, pool, &array)); + } + + auto f0 = field("f0", type); + auto schema = ::arrow::schema({f0}); + std::vector> arrays = {array}; + *out = RecordBatch::Make(schema, batch_length, arrays); + return Status::OK(); +} + Status MakeStruct(std::shared_ptr* out) { // reuse constructed list columns std::shared_ptr list_batch; diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index fc0c8ddbea319..db8613cbb1e6a 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -107,6 +107,9 @@ Status MakeNullRecordBatch(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeListRecordBatch(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeListViewRecordBatch(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeFixedSizeListRecordBatch(std::shared_ptr* out); @@ -119,6 +122,9 @@ Status MakeNonNullRecordBatch(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeDeeplyNestedList(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeDeeplyNestedListView(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeStruct(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 9668f459d0d31..93256440f4a7a 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -350,6 +350,67 @@ class RecordBatchSerializer { return Status::OK(); } + template + Status GetZeroBasedListViewOffsets(const ArrayType& array, + std::shared_ptr* out_value_offsets, + offset_type* out_min_offset, + offset_type* out_max_end) { + auto offsets = array.value_offsets(); + auto sizes = array.value_sizes(); + + const int64_t required_bytes = sizeof(offset_type) * array.length(); + if (array.offset() != 0) { + // If we have a non-zero offset, it's likely that the smallest offset is + // not zero. 
We must a) create a new offsets array with shifted offsets and + // b) slice the values array accordingly. + + ARROW_ASSIGN_OR_RAISE(auto shifted_offsets, + AllocateBuffer(required_bytes, options_.memory_pool)); + offset_type min_offset = 0; + offset_type max_end = 0; + if (array.length() > 0) { + min_offset = std::numeric_limits::max(); + for (int64_t i = 0; i < array.length(); ++i) { + min_offset = std::min(min_offset, array.value_offset(i)); + max_end = std::max(max_end, array.value_offset(i) + array.value_length(i)); + } + } + + auto* dest_offsets = shifted_offsets->mutable_data_as(); + + for (int64_t i = 0; i < array.length(); ++i) { + dest_offsets[i] = array.value_offset(i) - min_offset; + } + *out_min_offset = min_offset; + *out_max_end = max_end; + offsets = std::move(shifted_offsets); + } else { + // ARROW-6046: Slice offsets to used extent, in case we have a truncated + // slice + if (offsets != nullptr && offsets->size() > required_bytes) { + offsets = SliceBuffer(offsets, 0, required_bytes); + } + *out_min_offset = 0; + *out_max_end = static_cast(array.values()->length()); + } + *out_value_offsets = std::move(offsets); + return Status::OK(); + } + + template + Status GetListViewSizes(const ArrayType& array, + std::shared_ptr* out_value_sizes) { + const int64_t required_bytes = sizeof(offset_type) * array.length(); + auto sizes = array.value_sizes(); + if (sizes != nullptr && (array.offset() != 0 || sizes->size() > required_bytes)) { + // ARROW-6046: Slice sizes to used extent, in case we have a truncated slice + auto offset_bytes = array.offset() * sizeof(offset_type); + sizes = SliceBuffer(sizes, offset_bytes, required_bytes); + } + *out_value_sizes = std::move(sizes); + return Status::OK(); + } + Status Visit(const BooleanArray& array) { std::shared_ptr data; RETURN_NOT_OK(GetTruncatedBitmap(array.offset(), array.length(), array.values(), @@ -428,7 +489,6 @@ class RecordBatchSerializer { RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets));
out_->body_buffers.emplace_back(value_offsets); - --max_recursion_depth_; std::shared_ptr values = array.values(); offset_type values_offset = 0; @@ -442,6 +502,37 @@ class RecordBatchSerializer { // Must also slice the values values = values->Slice(values_offset, values_length); } + --max_recursion_depth_; RETURN_NOT_OK(VisitArray(*values)); ++max_recursion_depth_; return Status::OK(); + } + + template + enable_if_list_view Visit(const T& array) { + using offset_type = typename T::offset_type; + + offset_type min_offset = 0; + offset_type max_end = 0; + { + std::shared_ptr value_offsets; + RETURN_NOT_OK( + GetZeroBasedListViewOffsets(array, &value_offsets, &min_offset, &max_end)); + out_->body_buffers.push_back(std::move(value_offsets)); + } + { + std::shared_ptr value_sizes; + RETURN_NOT_OK(GetListViewSizes(array, &value_sizes)); + out_->body_buffers.push_back(std::move(value_sizes)); + } + + std::shared_ptr values = array.values(); + + if (min_offset != 0 || max_end < values->length()) { + // Must also slice the values + values = values->Slice(min_offset, max_end - min_offset); + } + --max_recursion_depth_; + RETURN_NOT_OK(VisitArray(*values)); + ++max_recursion_depth_; return Status::OK(); diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index f7ab6fd10275f..2f819779bdb59 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -135,6 +135,10 @@ struct GenerateImpl { return OK(writer.EndArray(size)); } + Status Visit(const ListViewType& t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); } Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); } diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index b392e027a6b89..e666ec70f9489 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -249,7 +249,8 @@
class ArrayPrinter : public PrettyPrinter { } template - enable_if_list_like WriteDataValues(const ArrayType& array) { + enable_if_t::value || is_list_view_type::value, Status> + WriteDataValues(const ArrayType& array) { const auto values = array.values(); const auto child_options = ChildOptions(); ArrayPrinter values_printer(child_options, sink_); @@ -300,6 +301,8 @@ class ArrayPrinter : public PrettyPrinter { std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value, Status> diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 9217e190d5b62..0db6ae4867299 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -774,8 +774,11 @@ TEST_F(TestPrettyPrint, BinaryNoNewlines) { CheckPrimitive(options, is_valid, values, expected, false); } -TEST_F(TestPrettyPrint, ListType) { - auto list_type = list(int64()); +template +void TestPrettyPrintVarLengthListLike() { + using LargeTypeClass = typename TypeTraits::LargeType; + auto var_list_type = std::make_shared(int64()); + auto var_large_list_type = std::make_shared(int64()); static const char* ex = R"expected([ [ @@ -836,7 +839,7 @@ TEST_F(TestPrettyPrint, ListType) { ] ])expected"; - auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + auto array = ArrayFromJSON(var_list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); auto make_options = [](int indent, int window, int container_window) { auto options = PrettyPrintOptions(indent, window); options.container_window = container_window; @@ -850,8 +853,7 @@ TEST_F(TestPrettyPrint, ListType) { ex_3); CheckArray(*array, {0, 10}, ex_4); - list_type = large_list(int64()); - array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + array = ArrayFromJSON(var_large_list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); CheckStream(*array, 
make_options(/*indent=*/0, /*window=*/10, /*container_window=*/5), ex); CheckStream(*array, make_options(/*indent=*/2, /*window=*/10, /*container_window=*/5), @@ -861,6 +863,93 @@ TEST_F(TestPrettyPrint, ListType) { CheckArray(*array, {0, 10}, ex_4); } +TEST_F(TestPrettyPrint, ListType) { TestPrettyPrintVarLengthListLike(); } + +template +void TestListViewSpecificPrettyPrinting() { + using ArrayType = typename TypeTraits::ArrayType; + using OffsetType = typename TypeTraits::OffsetType; + + auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + auto int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + auto int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + + auto Offsets = [](std::string_view json) { + return ArrayFromJSON(TypeTraits::type_singleton(), json); + }; + auto Sizes = Offsets; + + ASSERT_OK_AND_ASSIGN(auto int_list_view_array, + ArrayType::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *int32_values)); + ASSERT_OK(int_list_view_array->ValidateFull()); + static const char* ex1 = + "[\n" + " [\n" + " 1,\n" + " 20\n" + " ],\n" + " [\n" + " 1\n" + " ],\n" + " [\n" + " 20\n" + " ],\n" + " [\n" + " 3\n" + " ]\n" + "]"; + CheckStream(*int_list_view_array, {}, ex1); + + ASSERT_OK_AND_ASSIGN(auto string_list_view_array, + ArrayType::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *string_values)); + ASSERT_OK(string_list_view_array->ValidateFull()); + static const char* ex2 = + "[\n" + " [\n" + " \"Hello\",\n" + " \"World\"\n" + " ],\n" + " [\n" + " \"Hello\"\n" + " ],\n" + " [\n" + " \"World\"\n" + " ],\n" + " [\n" + " null\n" + " ]\n" + "]"; + CheckStream(*string_list_view_array, {}, ex2); + + auto sliced_array = string_list_view_array->Slice(1, 2); + static const char* ex3 = + "[\n" + " [\n" + " \"Hello\"\n" + " ],\n" + " [\n" + " \"World\"\n" + " ]\n" + "]"; + CheckStream(*sliced_array, {}, ex3); + + ASSERT_OK_AND_ASSIGN( + auto empty_array, + ArrayType::FromArrays(*Offsets("[]"), *Sizes("[]"), 
*int16_values)); + ASSERT_OK(empty_array->ValidateFull()); + static const char* ex4 = "[]"; + CheckStream(*empty_array, {}, ex4); +} + +TEST_F(TestPrettyPrint, ListViewType) { + TestPrettyPrintVarLengthListLike(); + + TestListViewSpecificPrettyPrinting(); + TestListViewSpecificPrettyPrinting(); +} + TEST_F(TestPrettyPrint, ListTypeNoNewlines) { auto list_type = list(int64()); auto empty_array = ArrayFromJSON(list_type, "[]"); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 167e272705268..6996b46c8b61a 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -587,6 +587,12 @@ ListScalar::ListScalar(std::shared_ptr value, bool is_valid) LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, large_list(value->type()), is_valid) {} +ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) + : BaseListScalar(value, list_view(value->type()), is_valid) {} + +LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) + : BaseListScalar(value, large_list_view(value->type()), is_valid) {} + inline std::shared_ptr MakeMapType(const std::shared_ptr& pair_type) { ARROW_CHECK_EQ(pair_type->id(), Type::STRUCT); ARROW_CHECK_EQ(pair_type->num_fields(), 2); @@ -776,14 +782,6 @@ struct MakeNullImpl { return Status::OK(); } - template ::ScalarType> - Status VisitListLike(const T& type, int64_t value_size = 0) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, - MakeArrayOfNull(type.value_type(), value_size)); - out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); - return Status::OK(); - } - Status Visit(const FixedSizeBinaryType& type) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, AllocateBuffer(type.byte_width())); @@ -794,11 +792,25 @@ struct MakeNullImpl { return Status::OK(); } + template ::ScalarType> + Status VisitListLike(const T& type, int64_t list_size = 0) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, + MakeArrayOfNull(type.value_type(), list_size)); 
+ out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); + return Status::OK(); + } + Status Visit(const ListType& type) { return VisitListLike(type); } + Status Visit(const LargeListType& type) { return VisitListLike(type); } + Status Visit(const MapType& type) { return VisitListLike(type); } - Status Visit(const LargeListType& type) { return VisitListLike(type); } + Status Visit(const ListViewType& type) { return VisitListLike(type); } + + Status Visit(const LargeListViewType& type) { + return VisitListLike(type); + } Status Visit(const FixedSizeListType& type) { return VisitListLike(type, type.list_size()); diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 5175b0128524c..65c5ee4df0a04 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -531,6 +531,20 @@ struct ARROW_EXPORT LargeListScalar : public BaseListScalar { explicit LargeListScalar(std::shared_ptr value, bool is_valid = true); }; +struct ARROW_EXPORT ListViewScalar : public BaseListScalar { + using TypeClass = ListViewType; + using BaseListScalar::BaseListScalar; + + explicit ListViewScalar(std::shared_ptr value, bool is_valid = true); +}; + +struct ARROW_EXPORT LargeListViewScalar : public BaseListScalar { + using TypeClass = LargeListViewType; + using BaseListScalar::BaseListScalar; + + explicit LargeListViewScalar(std::shared_ptr value, bool is_valid = true); +}; + struct ARROW_EXPORT MapScalar : public BaseListScalar { using TypeClass = MapType; using BaseListScalar::BaseListScalar; diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index a188aea1669a4..cba817f67b1a9 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -394,6 +394,10 @@ class TestRealScalar : public ::testing::Test { void TestLargeListOf() { TestListOf(large_list(type_)); } + void TestListViewOf() { TestListOf(list_view(type_)); } + + void TestLargeListViewOf() { TestListOf(large_list_view(type_)); } + protected: std::shared_ptr type_; 
std::shared_ptr scalar_val_, scalar_other_, scalar_nan_, scalar_other_nan_, @@ -414,6 +418,10 @@ TYPED_TEST(TestRealScalar, ListOf) { this->TestListOf(); } TYPED_TEST(TestRealScalar, LargeListOf) { this->TestLargeListOf(); } +TYPED_TEST(TestRealScalar, ListViewOf) { this->TestListViewOf(); } + +TYPED_TEST(TestRealScalar, LargeListViewOf) { this->TestLargeListViewOf(); } + template class TestDecimalScalar : public ::testing::Test { public: @@ -1083,7 +1091,7 @@ void CheckInvalidListCast(const Scalar& scalar, const std::shared_ptr& } template -class TestListScalar : public ::testing::Test { +class TestListLikeScalar : public ::testing::Test { public: using ScalarType = typename TypeTraits::ScalarType; @@ -1177,17 +1185,18 @@ class TestListScalar : public ::testing::Test { std::shared_ptr value_; }; -using ListScalarTestTypes = ::testing::Types; +using ListScalarTestTypes = ::testing::Types; -TYPED_TEST_SUITE(TestListScalar, ListScalarTestTypes); +TYPED_TEST_SUITE(TestListLikeScalar, ListScalarTestTypes); -TYPED_TEST(TestListScalar, Basics) { this->TestBasics(); } +TYPED_TEST(TestListLikeScalar, Basics) { this->TestBasics(); } -TYPED_TEST(TestListScalar, ValidateErrors) { this->TestValidateErrors(); } +TYPED_TEST(TestListLikeScalar, ValidateErrors) { this->TestValidateErrors(); } -TYPED_TEST(TestListScalar, Hashing) { this->TestHashing(); } +TYPED_TEST(TestListLikeScalar, Hashing) { this->TestHashing(); } -TYPED_TEST(TestListScalar, Cast) { this->TestCast(); } +TYPED_TEST(TestListLikeScalar, Cast) { this->TestCast(); } TEST(TestFixedSizeListScalar, ValidateErrors) { const auto ty = fixed_size_list(int16(), 3); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 1386075397e20..c317fe7aef44c 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -499,6 +499,7 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, } namespace { + template std::shared_ptr GenerateOffsets(SeedType seed, int64_t 
size, typename OffsetArrayType::value_type first_offset, @@ -608,6 +609,205 @@ std::shared_ptr OffsetsFromLengthsArray(OffsetArrayType* lengths, std::make_shared(), size, buffers, null_count); return std::make_shared(array_data); } + +// Helper for RandomArrayGenerator::ArrayOf: extract some C value from +// a given metadata key. +template ::ArrowType> +enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, + const std::string& key, + T default_value) { + if (!metadata) return default_value; + const auto index = metadata->FindKey(key); + if (index < 0) return default_value; + const auto& value = metadata->value(index); + T output{}; + if (!internal::ParseValue(value.data(), value.length(), &output)) { + ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value, " as ", + ArrowType::type_name())); + } + return output; +} + +/// \brief Shuffle a list-view array in place using the Fisher–Yates algorithm [1]. +/// +/// [1] https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm +/// +/// \param[in] seed The seed for the random number generator +/// \param[in,out] data The array to shuffle +template +void ShuffleListViewDataInPlace(SeedType seed, ArrayData* data) { + DCHECK_EQ(data->type->id(), ListViewType::type_id); + using offset_type = typename ListViewType::offset_type; + + auto* validity = data->GetMutableValues(0, 0); + auto* offsets = data->GetMutableValues(1); + auto* sizes = data->GetMutableValues(2); + + pcg32_fast rng(seed); + using UniformDist = std::uniform_int_distribution; + UniformDist dist; + for (int64_t i = data->length - 1; i > 0; --i) { + const auto j = dist(rng, UniformDist::param_type(0, i)); + if (ARROW_PREDICT_TRUE(i != j)) { + // Swap validity bits + if (validity) { + const bool valid_i = bit_util::GetBit(validity, data->offset + i); + const bool valid_j = bit_util::GetBit(validity, data->offset + j); + if (valid_i != valid_j) { + bit_util::SetBitTo(validity, data->offset + i, valid_j); +
bit_util::SetBitTo(validity, data->offset + j, valid_i); + } + } + // Swap offsets and sizes + std::swap(offsets[i], offsets[j]); + std::swap(sizes[i], sizes[j]); + } + } +} + +/// \brief Generate the list-view offsets based on a random buffer of sizes. +/// +/// The sizes buffer is an input of this function, but when force_empty_nulls is true, +/// some values on the sizes buffer can be set to 0. +/// +/// \param[in] seed The seed for the random number generator +/// \param[in,out] mutable_sizes_array The array of sizes to use +/// \param[in] force_empty_nulls Whether to force null list-view sizes to be 0 +/// \param[in] zero_undefined_offsets Whether to zero the offsets of list-views that have +/// 0 set as the size +/// \param[out] out_max_view_end The maximum value of the end of a list-view +template +std::shared_ptr ViewOffsetsFromLengthsArray( + SeedType seed, OffsetArrayType& mutable_sizes_array, bool force_empty_nulls, + bool zero_undefined_offsets, int64_t* out_max_view_end, int64_t alignment, + MemoryPool* memory_pool) { + using TypeClass = typename OffsetArrayType::TypeClass; + + auto* sizes = mutable_sizes_array.data()->template GetMutableValues(1); + + BufferVector buffers{2}; + buffers[0] = NULLPTR; // sizes can have nulls, offsets don't have to + buffers[1] = *AllocateBuffer(sizeof(offset_type) * mutable_sizes_array.length(), + alignment, memory_pool); + auto offsets = buffers[1]->mutable_data_as(); + + offset_type offset = 0; + offset_type max_view_end = 0; + for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) { + if (mutable_sizes_array.IsNull(i)) { + if (force_empty_nulls) { + sizes[i] = 0; + } + offsets[i] = zero_undefined_offsets ? 0 : offset; + } else { + if (sizes[i] == 0) { + offsets[i] = zero_undefined_offsets ? 
0 : offset; + } else { + offsets[i] = offset; + DCHECK_LT(offset, std::numeric_limits::max() - sizes[i]); + offset += sizes[i]; + } + } + max_view_end = std::max(max_view_end, offsets[i] + sizes[i]); + } + *out_max_view_end = max_view_end; + + auto array_data = + ArrayData::Make(TypeTraits::type_singleton(), + mutable_sizes_array.length(), std::move(buffers), /*null_count=*/0); + return std::make_shared(std::move(array_data)); +} + +template +Result> ArrayOfListView(RAG& self, const Field& field, + int64_t length, int64_t alignment, + MemoryPool* memory_pool, + double null_probability) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename ArrayType::offset_type; + using OffsetArrayType = typename CTypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + const auto zero_undefined_offsets = + GetMetadata(field.metadata().get(), "zero_undefined_offsets", false); + const auto lengths = internal::checked_pointer_cast( + self.RAG::template Numeric( + length, min_length, max_length, null_probability)); + + int64_t max_view_end = 0; + const auto offsets = ViewOffsetsFromLengthsArray( + self.seed(), *lengths, force_empty_nulls, zero_undefined_offsets, &max_view_end, + alignment, memory_pool); + + const auto values = self.RAG::ArrayOf( + *internal::checked_pointer_cast(field.type())->value_field(), + /*values_length=*/max_view_end, alignment, memory_pool); + + ARROW_ASSIGN_OR_RAISE(auto list_view_array, + ArrayType::FromArrays(field.type(), *offsets, *lengths, *values)); + ShuffleListViewDataInPlace( + self.seed(), const_cast(list_view_array->data().get())); + return list_view_array; +} + +template +Result> RandomListView(RAG& self, const Array& values, + 
int64_t length, double null_probability, + bool force_empty_nulls, double coverage, + int64_t alignment, + MemoryPool* memory_pool) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename TypeClass::offset_type; + using OffsetArrayType = typename TypeTraits::OffsetArrayType; + using OffsetArrowType = typename OffsetArrayType::TypeClass; + + DCHECK_LE(values.length(), std::numeric_limits::max()); + DCHECK_LE(length, std::numeric_limits::max()); + + auto offsets_array = GenerateOffsets>( + self.seed(), length + 1, 0, static_cast(values.length()), null_probability, + force_empty_nulls, alignment, memory_pool); + auto* offsets = offsets_array->data()->template GetValues(1); + + // The buffers for the sizes array + BufferVector buffers{2}; + buffers[0] = NULLPTR; + buffers[1] = *AllocateBuffer(sizeof(offset_type) * length, alignment, memory_pool); + auto sizes = buffers[1]->mutable_data_as(); + + // Derive sizes from offsets taking coverage into account + pcg32_fast rng(self.seed()); + using NormalDist = std::normal_distribution; + NormalDist size_dist; + for (int64_t i = 0; i < length; ++i) { + const double mean_size = coverage * (offsets[i + 1] - offsets[i]); + const double sampled_size = + std::max(0.0, size_dist(rng, NormalDist::param_type{mean_size})); + // This creates a higher probability of offset[i] + size[i] being closer or equal to + // values.length(), but that skew is acceptable for the purposes of testing. + const auto size = std::min(static_cast(std::llround(sampled_size)), + static_cast(values.length() - offsets[i])); + sizes[i] = offsets_array->IsNull(i) && force_empty_nulls ? 
0 : size; + } + + auto sizes_array_data = ArrayData::Make(TypeTraits::type_singleton(), + length, std::move(buffers), /*null_count=*/0); + auto sizes_array = std::make_shared(std::move(sizes_array_data)); + + ARROW_ASSIGN_OR_RAISE( + auto list_view_array, + ArrayType::FromArrays(*offsets_array, *sizes_array, values, memory_pool)); + ShuffleListViewDataInPlace( + self.seed(), const_cast(list_view_array->data().get())); + return list_view_array; +} + } // namespace std::shared_ptr RandomArrayGenerator::Offsets( @@ -637,6 +837,24 @@ std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t s return *::arrow::ListArray::FromArrays(*offsets, values); } +std::shared_ptr RandomArrayGenerator::ListView(const Array& values, int64_t length, + double null_probability, + bool force_empty_nulls, + double coverage, int64_t alignment, + MemoryPool* memory_pool) { + return *RandomListView(*this, values, length, null_probability, + force_empty_nulls, coverage, alignment, + memory_pool); +} + +std::shared_ptr RandomArrayGenerator::LargeListView( + const Array& values, int64_t length, double null_probability, bool force_empty_nulls, + double coverage, int64_t alignment, MemoryPool* memory_pool) { + return *RandomListView(*this, values, length, null_probability, + force_empty_nulls, coverage, alignment, + memory_pool); +} + std::shared_ptr RandomArrayGenerator::Map(const std::shared_ptr& keys, const std::shared_ptr& items, int64_t size, double null_probability, @@ -713,27 +931,6 @@ std::shared_ptr RandomArrayGenerator::DenseUnion(const ArrayVector& field return *DenseUnionArray::Make(*type_ids, *offsets, fields, type_codes); } -namespace { - -// Helper for RandomArrayGenerator::ArrayOf: extract some C value from -// a given metadata key. 
-template ::ArrowType> -enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, - const std::string& key, - T default_value) { - if (!metadata) return default_value; - const auto index = metadata->FindKey(key); - if (index < 0) return default_value; - const auto& value = metadata->value(index); - T output{}; - if (!internal::ParseValue(value.data(), value.length(), &output)) { - ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); - } - return output; -} - -} // namespace - std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr type, int64_t size, double null_probability, @@ -811,6 +1008,12 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t return *ARRAY_TYPE::FromArrays(field.type(), *offsets, *values); \ } +#define GENERATE_LIST_VIEW_CASE(ARRAY_TYPE) \ + case ARRAY_TYPE::TypeClass::type_id: { \ + return *ArrayOfListView(*this, field, length, alignment, memory_pool, \ + null_probability); \ + } + const double null_probability = field.nullable() ? 
GetMetadata(field.metadata().get(), "null_probability", 0.01) @@ -946,6 +1149,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(ListArray); + GENERATE_LIST_VIEW_CASE(ListViewArray); case Type::type::STRUCT: { ArrayVector child_arrays(field.type()->num_fields()); @@ -1069,6 +1273,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(LargeListArray); + GENERATE_LIST_VIEW_CASE(LargeListViewArray); default: break; @@ -1077,6 +1282,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t #undef GENERATE_INTEGRAL_CASE #undef GENERATE_FLOATING_CASE #undef GENERATE_LIST_CASE +#undef GENERATE_LIST_VIEW_CASE #undef VALIDATE_RANGE #undef VALIDATE_MIN_MAX diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index cbdac3baa0109..1d97a3ada724a 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -458,6 +458,43 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random ListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The size of the generated list array + /// \param[in] null_probability the probability of a list value being null + /// \param[in] force_empty_nulls if true, null list entries must have 0 length + /// (i.e. their size must be set to 0) + /// \param[in] coverage proportion of the values array covered by list-views + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr ListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, double coverage = 1.0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + + /// \brief Generate a
random LargeListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The size of the generated list array + /// \param[in] null_probability the probability of a list value being null + /// \param[in] force_empty_nulls if true, null list entries must have 0 length + /// (i.e. their size must be set to 0) + /// \param[in] coverage proportion of the values array covered by list-views + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr LargeListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, + double coverage = 1.0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random MapArray /// /// \param[in] keys The underlying keys array diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 951b654e56f73..a92ecf4e9c45b 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -70,7 +70,7 @@ class RandomArrayTest : public ::testing::TestWithParam { } bool HasList(const DataType& type) { - if (is_var_length_list(type.id())) { + if (is_var_length_list_like(type.id())) { return true; } for (const auto& child : type.fields()) { @@ -99,7 +99,7 @@ TEST_P(RandomArrayTest, GenerateArrayAlignment) { const int64_t alignment = 1024; auto field = GetField(); if (HasList(*field->type())) { - GTEST_SKIP() << "ListArray::FromArrays does not conserve buffer alignment"; + GTEST_SKIP() << "List[View]Array::FromArrays does not conserve buffer alignment"; } auto array = GenerateArray(*field, /*size=*/13, 0xDEADBEEF, alignment); AssertTypeEqual(field->type(), array->type()); @@ -177,6 +177,13 @@ auto values = ::testing::Values( key_value_metadata({{"force_empty_nulls", "true"}})), field("listint81024values", list(int8()), true,
key_value_metadata({{"values", "1024"}})), + field("listviewint8", list_view(int8())), + field("listviewlistviewint8", list_view(list_view(int8()))), + field("listviewint8emptynulls", list_view(int8()), true, + key_value_metadata( + {{"force_empty_nulls", "true"}, {"zero_undefined_offsets", "true"}})), + field("listviewint81024values", list_view(int8()), true, + key_value_metadata({{"values", "1024"}})), field("structints", struct_({ field("int8", int8()), field("int16", int16()), @@ -201,7 +208,8 @@ auto values = ::testing::Values( field("fixedsizelist", fixed_size_list(int8(), 4)), field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()), field("largebinary", large_binary()), - field("largelistlistint8", large_list(list(int8())))); + field("largelistlistint8", large_list(list(int8()))), + field("largelistviewlistviewint8", large_list_view(list_view(int8())))); INSTANTIATE_TEST_SUITE_P( TestRandomArrayGeneration, RandomArrayTest, values, @@ -400,6 +408,39 @@ TEST(TypeSpecificTests, ListLengths) { } } +TEST(TypeSpecificTests, ListViewLengths) { + { + auto field = + arrow::field("list_view", list_view(int8()), + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), kExpectedLength); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = + arrow::field("list_view", large_list_view(int8()), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_EQ(array->length(), kExpectedLength); + 
ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } + } + } +} + TEST(TypeSpecificTests, MapValues) { auto field = arrow::field("map", map(int8(), int8()), key_value_metadata({{"values", "4"}})); @@ -500,6 +541,24 @@ TEST(RandomList, Basics) { } } +TEST(RandomListView, Basics) { + random::RandomArrayGenerator rng(42); + for (const double null_probability : {0.0, 0.1, 0.98}) { + SCOPED_TRACE("null_probability = " + std::to_string(null_probability)); + auto values = rng.Int16(1234, 0, 10000, null_probability); + auto array = rng.ListView(*values, 45, null_probability); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), 45); + const auto& list_view_array = checked_cast(*array); + ASSERT_EQ(list_view_array.values()->length(), 1234); + int64_t null_count = 0; + for (int64_t i = 0; i < array->length(); ++i) { + null_count += array->IsNull(i); + } + ASSERT_EQ(null_count, array->data()->null_count); + } +} + TEST(RandomChildFieldNullablity, List) { random::RandomArrayGenerator rng(42); @@ -513,6 +572,19 @@ TEST(RandomChildFieldNullablity, List) { ARROW_EXPECT_OK(batch->ValidateFull()); } +TEST(RandomChildFieldNullablity, ListView) { + random::RandomArrayGenerator rng(42); + + auto item = arrow::field("item", arrow::int8(), true); + auto nest_list_view_field = arrow::field("list_view", list_view(item), false); + auto list_view_field = arrow::field("list_view", list_view(nest_list_view_field), true); + auto array = rng.ArrayOf(*list_view_field, 428); + ARROW_EXPECT_OK(array->ValidateFull()); + + auto batch = rng.BatchOf({list_view_field}, 428); + ARROW_EXPECT_OK(batch->ValidateFull()); +} + TEST(RandomChildFieldNullablity, Struct) { random::RandomArrayGenerator rng(42); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f378bd974047d..62d2d61598dc8 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -140,6 +140,8 @@ std::vector 
AllTypeIds() { Type::STRUCT, Type::LIST, Type::LARGE_LIST, + Type::LIST_VIEW, + Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::MAP, Type::DENSE_UNION, @@ -209,6 +211,8 @@ std::string ToString(Type::type id) { TO_STRING_CASE(STRUCT) TO_STRING_CASE(LIST) TO_STRING_CASE(LARGE_LIST) + TO_STRING_CASE(LIST_VIEW) + TO_STRING_CASE(LARGE_LIST_VIEW) TO_STRING_CASE(FIXED_SIZE_LIST) TO_STRING_CASE(MAP) TO_STRING_CASE(DENSE_UNION) @@ -992,6 +996,18 @@ std::string LargeListType::ToString() const { return s.str(); } +std::string ListViewType::ToString() const { + std::stringstream s; + s << "list_view<" << value_field()->ToString() << ">"; + return s.str(); +} + +std::string LargeListViewType::ToString() const { + std::stringstream s; + s << "large_list_view<" << value_field()->ToString() << ">"; + return s.str(); +} + MapType::MapType(std::shared_ptr key_type, std::shared_ptr item_type, bool keys_sorted) : MapType(::arrow::field("key", std::move(key_type), false), @@ -2888,6 +2904,38 @@ std::string LargeListType::ComputeFingerprint() const { return ""; } +std::string ListViewType::ComputeFingerprint() const { + const auto& child_fingerprint = value_type()->fingerprint(); + if (!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this); + if (value_field()->nullable()) { + ss << 'n'; + } else { + ss << 'N'; + } + ss << '{' << child_fingerprint << '}'; + return ss.str(); + } + return ""; +} + +std::string LargeListViewType::ComputeFingerprint() const { + const auto& child_fingerprint = value_type()->fingerprint(); + if (!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this); + if (value_field()->nullable()) { + ss << 'n'; + } else { + ss << 'N'; + } + ss << '{' << child_fingerprint << '}'; + return ss.str(); + } + return ""; +} + std::string MapType::ComputeFingerprint() const { const auto& key_fingerprint = key_type()->fingerprint(); const auto& item_fingerprint = item_type()->fingerprint(); @@ -3138,6 +3186,22 @@ 
std::shared_ptr fixed_size_list(const std::shared_ptr& value_fi return std::make_shared(value_field, list_size); } +std::shared_ptr list_view(std::shared_ptr value_type) { + return std::make_shared(std::move(value_type)); +} + +std::shared_ptr list_view(std::shared_ptr value_field) { + return std::make_shared(std::move(value_field)); +} + +std::shared_ptr large_list_view(std::shared_ptr value_type) { + return std::make_shared(std::move(value_type)); +} + +std::shared_ptr large_list_view(std::shared_ptr value_field) { + return std::make_shared(std::move(value_field)); +} + std::shared_ptr struct_(const FieldVector& fields) { return std::make_shared(fields); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a905192e4a54e..5b1331ab66919 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1174,6 +1174,71 @@ class ARROW_EXPORT LargeListType : public BaseListType { std::string ComputeFingerprint() const override; }; +/// \brief Type class for array of list views +class ARROW_EXPORT ListViewType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LIST_VIEW; + using offset_type = int32_t; + + static constexpr const char* type_name() { return "list_view"; } + + // ListView can contain any other logical value type + explicit ListViewType(const std::shared_ptr& value_type) + : ListViewType(std::make_shared("item", value_type)) {} + + explicit ListViewType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), + DataTypeLayout::FixedWidth(sizeof(offset_type)), + DataTypeLayout::FixedWidth(sizeof(offset_type))}); + } + + std::string ToString() const override; + + std::string name() const override { return "list_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + +/// \brief Concrete type class for large list-view data +/// +/// LargeListViewType is like 
ListViewType but with 64-bit rather than 32-bit offsets and +/// sizes. +class ARROW_EXPORT LargeListViewType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LARGE_LIST_VIEW; + using offset_type = int64_t; + + static constexpr const char* type_name() { return "large_list_view"; } + + // LargeListView can contain any other logical value type + explicit LargeListViewType(const std::shared_ptr& value_type) + : LargeListViewType(std::make_shared("item", value_type)) {} + + explicit LargeListViewType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), + DataTypeLayout::FixedWidth(sizeof(offset_type)), + DataTypeLayout::FixedWidth(sizeof(offset_type))}); + } + + std::string ToString() const override; + + std::string name() const override { return "large_list_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for map data /// /// Map data is nested data where each value is a variable number of diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index ca263b710317b..63eec10bf723b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -150,6 +150,16 @@ class LargeListArray; class LargeListBuilder; struct LargeListScalar; +class ListViewType; +class ListViewArray; +class ListViewBuilder; +struct ListViewScalar; + +class LargeListViewType; +class LargeListViewArray; +class LargeListViewBuilder; +struct LargeListViewScalar; + class MapType; class MapArray; class MapBuilder; @@ -432,6 +442,12 @@ struct Type { /// Bytes view type with 4-byte prefix and inline small string optimization BINARY_VIEW = 40, + /// A list of some logical data type represented by offset and size. 
+ LIST_VIEW = 41, + + /// Like LIST_VIEW, but with 64-bit offsets and sizes + LARGE_LIST_VIEW = 42, + // Leave this at the end MAX_ID }; @@ -523,6 +539,19 @@ std::shared_ptr large_list(const std::shared_ptr& value_type); ARROW_EXPORT std::shared_ptr large_list(const std::shared_ptr& value_type); +/// \brief Create a ListViewType instance +ARROW_EXPORT std::shared_ptr list_view(std::shared_ptr value_type); + +/// \brief Create a ListViewType instance from its child Field type +ARROW_EXPORT std::shared_ptr list_view(std::shared_ptr value_type); + +/// \brief Create a LargeListViewType instance +ARROW_EXPORT std::shared_ptr large_list_view( + std::shared_ptr value_type); + +/// \brief Create a LargeListViewType instance from its child Field type +ARROW_EXPORT std::shared_ptr large_list_view(std::shared_ptr value_type); + /// \brief Create a MapType instance from its key and value DataTypes ARROW_EXPORT std::shared_ptr map(std::shared_ptr key_type, diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 273f8933fa577..009e557f82f68 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1553,6 +1553,46 @@ TEST(TestLargeListType, Basics) { ASSERT_EQ("large_list>", lt2.ToString()); } +TEST(TestListViewType, Basics) { + std::shared_ptr vt = std::make_shared(); + + ListViewType list_view_type(vt); + ASSERT_EQ(list_view_type.id(), Type::LIST_VIEW); + + ASSERT_EQ("list_view", list_view_type.name()); + ASSERT_EQ("list_view", list_view_type.ToString()); + + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("list_view", lt->ToString()); + + ListViewType lt2(lt); + ASSERT_EQ("list_view>", lt2.ToString()); +} + +TEST(TestLargeListViewType, Basics) { + std::shared_ptr vt = std::make_shared(); + + LargeListViewType list_view_type(vt); + ASSERT_EQ(list_view_type.id(),
Type::LARGE_LIST_VIEW); + + ASSERT_EQ("large_list_view", list_view_type.name()); + ASSERT_EQ("large_list_view", list_view_type.ToString()); + + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("large_list_view", lt->ToString()); + + LargeListViewType lt2(lt); + ASSERT_EQ("large_list_view>", lt2.ToString()); +} + TEST(TestMapType, Basics) { auto md = key_value_metadata({"foo"}, {"foo value"}); @@ -1829,6 +1869,32 @@ TEST(TestListType, Equals) { ASSERT_FALSE(list_type.Equals(list_type_named, /*check_metadata=*/true)); } +TEST(TestListViewType, Equals) { + auto t1 = list_view(utf8()); + auto t2 = list_view(utf8()); + auto t3 = list_view(binary()); + auto t4 = list_view(field("item", utf8(), /*nullable=*/false)); + auto tl1 = large_list_view(binary()); + auto tl2 = large_list_view(binary()); + auto tl3 = large_list_view(float64()); + + AssertTypeEqual(*t1, *t2); + AssertTypeNotEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t4); + AssertTypeNotEqual(*t3, *tl1); + AssertTypeEqual(*tl1, *tl2); + AssertTypeNotEqual(*tl2, *tl3); + + std::shared_ptr vt = std::make_shared(); + std::shared_ptr inner_field = std::make_shared("non_default_name", vt); + + ListViewType list_view_type(vt); + ListViewType list_view_type_named(inner_field); + + AssertTypeEqual(list_view_type, list_view_type_named); + ASSERT_FALSE(list_view_type.Equals(list_view_type_named, /*check_metadata=*/true)); +} + TEST(TestListType, Metadata) { auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); @@ -1859,6 +1925,66 @@ TEST(TestListType, Metadata) { AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); } +TEST(TestListViewType, Metadata) { + auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md2 = 
key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md3 = key_value_metadata({"foo"}, {"foo value"}); + + auto f1 = field("item", utf8(), /*nullable =*/true, md1); + auto f2 = field("item", utf8(), /*nullable =*/true, md2); + auto f3 = field("item", utf8(), /*nullable =*/true, md3); + auto f4 = field("item", utf8()); + auto f5 = field("item", utf8(), /*nullable =*/false, md1); + + auto t1 = list_view(f1); + auto t2 = list_view(f2); + auto t3 = list_view(f3); + auto t4 = list_view(f4); + auto t5 = list_view(f5); + + AssertTypeEqual(*t1, *t2); + AssertTypeEqual(*t1, *t2, /*check_metadata =*/false); + + AssertTypeEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t3, /*check_metadata =*/true); + + AssertTypeEqual(*t1, *t4); + AssertTypeNotEqual(*t1, *t4, /*check_metadata =*/true); + + AssertTypeNotEqual(*t1, *t5); + AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); +} + +TEST(TestLargeListViewType, Metadata) { + auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md3 = key_value_metadata({"foo"}, {"foo value"}); + + auto f1 = field("item", utf8(), /*nullable =*/true, md1); + auto f2 = field("item", utf8(), /*nullable =*/true, md2); + auto f3 = field("item", utf8(), /*nullable =*/true, md3); + auto f4 = field("item", utf8()); + auto f5 = field("item", utf8(), /*nullable =*/false, md1); + + auto t1 = large_list_view(f1); + auto t2 = large_list_view(f2); + auto t3 = large_list_view(f3); + auto t4 = large_list_view(f4); + auto t5 = large_list_view(f5); + + AssertTypeEqual(*t1, *t2); + AssertTypeEqual(*t1, *t2, /*check_metadata =*/false); + + AssertTypeEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t3, /*check_metadata =*/true); + + AssertTypeEqual(*t1, *t4); + AssertTypeNotEqual(*t1, *t4, /*check_metadata =*/true); + + AssertTypeNotEqual(*t1, *t5); + AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); +} + TEST(TestNestedType, Equals) { auto 
create_struct = [](std::string inner_name, std::string struct_name) -> std::shared_ptr { @@ -2258,6 +2384,44 @@ TEST(TypesTest, TestRunEndEncodedType) { "run_end_encoded>"); } +TEST(TypesTest, TestListViewType) { + auto int32_expected = std::make_shared(int32()); + auto int32_list_view_type = list_view(int32()); + + ASSERT_EQ(*int32_expected, *int32_list_view_type); + + auto int32_list_view_type_cast = + std::dynamic_pointer_cast(int32_list_view_type); + ASSERT_EQ(*int32_list_view_type_cast->value_type(), *int32()); + + ASSERT_TRUE(int32_list_view_type->field(0)->Equals(Field("item", int32(), true))); + + auto int64_list_view_type = list_view(int64()); + ASSERT_NE(*int32_list_view_type, *int64_list_view_type); + + ASSERT_EQ(int32_list_view_type->ToString(), "list_view"); + ASSERT_EQ(int64_list_view_type->ToString(), "list_view"); +} + +TEST(TypesTest, TestLargeListViewType) { + auto int32_expected = std::make_shared(int32()); + auto int32_list_view_type = large_list_view(int32()); + + ASSERT_EQ(*int32_expected, *int32_list_view_type); + + auto int32_list_view_type_cast = + std::dynamic_pointer_cast(int32_list_view_type); + ASSERT_EQ(*int32_list_view_type_cast->value_type(), *int32()); + + ASSERT_TRUE(int32_list_view_type->field(0)->Equals(Field("item", int32(), true))); + + auto int64_list_view_type = large_list_view(int64()); + ASSERT_NE(*int32_list_view_type, *int64_list_view_type); + + ASSERT_EQ(int32_list_view_type->ToString(), "large_list_view"); + ASSERT_EQ(int64_list_view_type->ToString(), "large_list_view"); +} + #define TEST_PREDICATE(all_types, type_predicate) \ for (auto type : all_types) { \ ASSERT_EQ(type_predicate(type->id()), type_predicate(*type)); \ @@ -2296,6 +2460,7 @@ TEST(TypesTest, TestMembership) { TEST_PREDICATE(all_types, is_fixed_width); TEST_PREDICATE(all_types, is_var_length_list); TEST_PREDICATE(all_types, is_list_like); + TEST_PREDICATE(all_types, is_var_length_list_like); TEST_PREDICATE(all_types, is_nested); TEST_PREDICATE(all_types, 
is_union); } diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index de328f322ad5f..ded54aff463c1 100644 --- a/cpp/src/arrow/type_traits.cc +++ b/cpp/src/arrow/type_traits.cc @@ -67,21 +67,23 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::BINARY: // Offsets may be cast to int32_t* case Type::DATE32: case Type::TIME32: - case Type::LIST: // Offsets may be cast to int32_t*, data is in child array - case Type::MAP: // This is a list array + case Type::LIST: // Offsets may be cast to int32_t* + case Type::LIST_VIEW: // Offsets and sizes may be cast to int32_t* + case Type::MAP: // Same as LIST case Type::INTERVAL_MONTHS: // Stored as int32_t* case Type::INTERVAL_DAY_TIME: // Stored as two contiguous 32-bit integers return 4; case Type::INT64: case Type::UINT64: case Type::DOUBLE: - case Type::DECIMAL128: // May be cast to GenericBasicDecimal* which requires - // alignment of 8 - case Type::DECIMAL256: // May be cast to GenericBasicDecimal* which requires - // alignment of 8 - case Type::LARGE_BINARY: // Offsets may be cast to int64_t* - case Type::LARGE_LIST: // Offsets may be cast to int64_t* - case Type::LARGE_STRING: // Offsets may be cast to int64_t* + case Type::DECIMAL128: // May be cast to GenericBasicDecimal* which requires + // alignment of 8 + case Type::DECIMAL256: // May be cast to GenericBasicDecimal* which requires + // alignment of 8 + case Type::LARGE_BINARY: // Offsets may be cast to int64_t* + case Type::LARGE_STRING: // Offsets may be cast to int64_t* + case Type::LARGE_LIST: // Offsets may be cast to int64_t* + case Type::LARGE_LIST_VIEW: // Offsets and sizes may be cast to int64_t* case Type::DATE64: case Type::TIME64: case Type::TIMESTAMP: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 9d8cafacf397b..ed66c9367dc36 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -449,6 +449,7 @@ struct TypeTraits { using OffsetBuilderType = 
Int32Builder; using OffsetScalarType = Int32Scalar; constexpr static bool is_parameter_free = false; + using LargeType = LargeListType; }; template <> @@ -463,6 +464,31 @@ struct TypeTraits { constexpr static bool is_parameter_free = false; }; +template <> +struct TypeTraits { + using ArrayType = ListViewArray; + using BuilderType = ListViewBuilder; + using ScalarType = ListViewScalar; + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; + constexpr static bool is_parameter_free = false; + using LargeType = LargeListViewType; +}; + +template <> +struct TypeTraits { + using ArrayType = LargeListViewArray; + using BuilderType = LargeListViewBuilder; + using ScalarType = LargeListViewScalar; + using OffsetType = Int64Type; + using OffsetArrayType = Int64Array; + using OffsetBuilderType = Int64Builder; + using OffsetScalarType = Int64Scalar; + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = MapArray; @@ -750,6 +776,13 @@ using is_list_type = template using enable_if_list_type = enable_if_t::value, R>; +template +using is_list_view_type = + std::disjunction, std::is_same>; + +template +using enable_if_list_view = enable_if_t::value, R>; + template using is_list_like_type = std::integral_constant::value || @@ -758,6 +791,14 @@ using is_list_like_type = template using enable_if_list_like = enable_if_t::value, R>; +template +using is_var_length_list_like_type = + std::disjunction, is_list_view_type>; + +template +using enable_if_var_length_list_like = + enable_if_t::value, R>; + template using is_struct_type = std::is_base_of; @@ -1303,6 +1344,39 @@ constexpr bool is_list_like(Type::type type_id) { return false; } +/// \brief Check for a var-length list or list-view like type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a var-length list or list-view like type +constexpr bool 
is_var_length_list_like(Type::type type_id) { + switch (type_id) { + case Type::LIST: + case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: + case Type::MAP: + return true; + default: + break; + } + return false; +} + +/// \brief Check for a list-view type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a list-view type one +constexpr bool is_list_view(Type::type type_id) { + switch (type_id) { + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: + return true; + default: + break; + } + return false; +} + /// \brief Check for a nested type /// /// \param[in] type_id the type-id to check @@ -1311,6 +1385,8 @@ constexpr bool is_nested(Type::type type_id) { switch (type_id) { case Type::LIST: case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: case Type::FIXED_SIZE_LIST: case Type::MAP: case Type::STRUCT: @@ -1403,12 +1479,14 @@ static inline int offset_bit_width(Type::type type_id) { case Type::STRING: case Type::BINARY: case Type::LIST: + case Type::LIST_VIEW: case Type::MAP: case Type::DENSE_UNION: return 32; case Type::LARGE_STRING: case Type::LARGE_BINARY: case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: return 64; default: break; @@ -1609,6 +1687,24 @@ static inline bool is_var_length_list(const DataType& type) { /// Convenience for checking using the type's id static inline bool is_list_like(const DataType& type) { return is_list_like(type.id()); } +/// \brief Check for a var-length list or list-view like type +/// +/// \param[in] type the type to check +/// \return whether type is a var-length list or list-view like type +/// +/// Convenience for checking using the type's id +static inline bool is_var_length_list_like(const DataType& type) { + return is_var_length_list_like(type.id()); +} + +/// \brief Check for a list-view type +/// +/// \param[in] type the type to check +/// \return whether type is a list-view type +/// +/// Convenience for checking using the type's id 
+static inline bool is_list_view(const DataType& type) { return is_list_view(type.id()); } + /// \brief Check for a nested type /// /// \param[in] type the type to check diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 2e9487dcf50c8..badf8a75078ed 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -55,6 +55,7 @@ add_arrow_test(utility-test int_util_test.cc ${IO_UTIL_TEST_SOURCES} iterator_test.cc + list_util_test.cc logging_test.cc queue_test.cc range_test.cc diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc new file mode 100644 index 0000000000000..15196ff8c12cf --- /dev/null +++ b/cpp/src/arrow/util/list_util.cc @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/list_util.h" +#include "arrow/util/logging.h" +#include "arrow/util/string.h" + +namespace arrow::list_util { + +namespace internal { + +namespace { + +using arrow::internal::checked_cast; +using arrow::internal::ReverseSetBitRunReader; +using arrow::internal::SetBitRunReader; + +/// \pre input.length() > 0 && input.null_count() != input.length() +/// \param input A LIST_VIEW or LARGE_LIST_VIEW array +template +std::optional MinViewOffset(const ArraySpan& input) { + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + const auto* sizes = input.GetValues(2); + + // Make an access to the sizes buffer only when strictly necessary. +#define MINIMIZE_MIN_VIEW_OFFSET(i) \ + auto offset = offsets[i]; \ + if (min_offset.has_value()) { \ + if (offset < *min_offset && sizes[i] > 0) { \ + if (offset == 0) { \ + return 0; \ + } \ + min_offset = offset; \ + } \ + } else { \ + if (sizes[i] > 0) { \ + if (offset == 0) { \ + return 0; \ + } \ + min_offset = offset; \ + } \ + } + + std::optional min_offset; + if (validity == nullptr) { + for (int64_t i = 0; i < input.length; i++) { + MINIMIZE_MIN_VIEW_OFFSET(i); + } + } else { + SetBitRunReader reader(validity, input.offset, input.length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + for (int64_t i = run.position; i < run.position + run.length; ++i) { + MINIMIZE_MIN_VIEW_OFFSET(i); + } + } + } + return min_offset; + +#undef MINIMIZE_MIN_VIEW_OFFSET +} + +/// \pre input.length() > 0 && input.null_count() != input.length() +/// \param input A LIST_VIEW or LARGE_LIST_VIEW array +template +int64_t MaxViewEnd(const ArraySpan& input) { + const auto values_length 
= input.child_data[0].length; + + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + const auto* sizes = input.GetValues(2); + +#define MAXIMIZE_MAX_VIEW_END(i) \ + const auto offset = static_cast(offsets[i]); \ + const offset_type size = sizes[i]; \ + if (size > 0) { \ + const int64_t end = offset + size; \ + if (end > max_end) { \ + if (end == values_length) { \ + return values_length; \ + } \ + max_end = end; \ + } \ + } + + int64_t max_end = 0; + if (validity == nullptr) { + for (int64_t i = input.length - 1; i >= 0; --i) { + MAXIMIZE_MAX_VIEW_END(i); + } + } else { + ReverseSetBitRunReader reader(validity, input.offset, input.length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + for (int64_t i = run.position + run.length - 1; i >= run.position; --i) { + MAXIMIZE_MAX_VIEW_END(i); + } + } + } + return max_end; + +#undef MAXIMIZE_MAX_VIEW_END +} + +template +std::pair RangeOfValuesUsedByListView(const ArraySpan& input) { + DCHECK(is_list_view(*input.type)); + if (input.length == 0 || input.null_count == input.length) { + return {0, 0}; + } + const auto min_offset = MinViewOffset(input); + // If all list-views are empty, min_offset will be std::nullopt. 
+ if (!min_offset.has_value()) { + return {0, 0}; + } + const int64_t max_end = MaxViewEnd(input); + return {*min_offset, max_end - *min_offset}; +} + +template +std::pair RangeOfValuesUsedByList(const ArraySpan& input) { + DCHECK(is_var_length_list(*input.type)); + if (input.length == 0) { + return {0, 0}; + } + const auto* offsets = input.buffers[1].data_as(); + const int64_t min_offset = offsets[input.offset]; + const int64_t max_end = offsets[input.offset + input.length]; + return {min_offset, max_end - min_offset}; +} + +template +int64_t SumOfListSizes(const ArraySpan& input) { + DCHECK(is_var_length_list(*input.type)); + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + int64_t sum = 0; + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, input.length, + [&sum, offsets](int64_t run_start, int64_t run_length) { + sum += offsets[run_start + run_length] - offsets[run_start]; + }); + return sum; +} + +template +int64_t SumOfListViewSizes(const ArraySpan& input) { + DCHECK(is_list_view(*input.type)); + const uint8_t* validity = input.buffers[0].data; + const auto* sizes = input.GetValues(2); + int64_t sum = 0; + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, input.length, + [&sum, sizes](int64_t run_start, int64_t run_length) { + for (int64_t i = run_start; i < run_start + run_length; ++i) { + sum += sizes[i]; + } + }); + return sum; +} + +} // namespace + +Result> RangeOfValuesUsed(const ArraySpan& input) { + switch (input.type->id()) { + case Type::LIST: + return RangeOfValuesUsedByList(input); + case Type::MAP: + return RangeOfValuesUsedByList(input); + case Type::LARGE_LIST: + return RangeOfValuesUsedByList(input); + case Type::LIST_VIEW: + return RangeOfValuesUsedByListView(input); + case Type::LARGE_LIST_VIEW: + return RangeOfValuesUsedByListView(input); + default: + break; + } + DCHECK(!is_var_length_list_like(*input.type)); + return Status::TypeError( + "RangeOfValuesUsed:
input is not a var-length list-like array"); +} + +Result SumOfLogicalListSizes(const ArraySpan& input) { + switch (input.type->id()) { + case Type::LIST: + return SumOfListSizes(input); + case Type::MAP: + return SumOfListSizes(input); + case Type::LARGE_LIST: + return SumOfListSizes(input); + case Type::LIST_VIEW: + return SumOfListViewSizes(input); + case Type::LARGE_LIST_VIEW: + return SumOfListViewSizes(input); + default: + break; + } + DCHECK(!is_var_length_list_like(*input.type)); + return Status::TypeError( + "SumOfLogicalListSizes: input is not a var-length list-like array"); +} + +} // namespace internal + +} // namespace arrow::list_util diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h new file mode 100644 index 0000000000000..467f4eb15edb7 --- /dev/null +++ b/cpp/src/arrow/util/list_util.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array/data.h" +#include "arrow/result.h" + +namespace arrow { +namespace list_util { +namespace internal { + +/// \brief Calculate the smallest continuous range of values used by the +/// var-length list-like input (list, map and list-view types). 
+/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return A pair of (offset, length) describing the range +ARROW_EXPORT Result> RangeOfValuesUsed( + const ArraySpan& input); + +/// \brief Calculate the sum of the sizes of all valid lists or list-views +/// +/// This is usually the same as the length of the RangeOfValuesUsed() range, but +/// it can be: +/// - Smaller: when the child array contains many values that are not +/// referenced by the lists or list-views in the parent array +/// - Greater: when the list-views share child array ranges +/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return The sum of all list or list-view sizes +ARROW_EXPORT Result SumOfLogicalListSizes(const ArraySpan& input); + +} // namespace internal + +} // namespace list_util +} // namespace arrow diff --git a/cpp/src/arrow/util/list_util_test.cc b/cpp/src/arrow/util/list_util_test.cc new file mode 100644 index 0000000000000..4021180b2bef3 --- /dev/null +++ b/cpp/src/arrow/util/list_util_test.cc @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#include + +#include "arrow/array/builder_nested.h" +#include "arrow/util/list_util.h" + +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +using ListAndListViewTypes = + ::testing::Types; + +template +class TestListUtils : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + + std::unique_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_.reset(checked_cast(tmp.release())); + } + + void TestRangeOfValuesUsed() { + std::shared_ptr result; + + // These list-views are built manually with the list-view builders instead + // of using something like ArrayFromJSON() because we want to test the + // RangeOfValuesUsed() function's ability to handle arrays containing + // overlapping list-views. 
+ + // Empty list-like array + ASSERT_OK(builder_->FinishInternal(&result)); + builder_->Reset(); + ASSERT_OK_AND_ASSIGN(auto range, list_util::internal::RangeOfValuesUsed(*result)); + ASSERT_EQ(range.first, 0); + ASSERT_EQ(range.second, 0); + + // List-like array with only nulls + ASSERT_OK(builder_->AppendNulls(3)); + ASSERT_OK(builder_->FinishInternal(&result)); + builder_->Reset(); + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*result)); + ASSERT_EQ(range.first, 0); + ASSERT_EQ(range.second, 0); + + // Array with nulls and non-nulls (starting at a non-zero offset) + Int16Builder* vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Append(-2)); + ASSERT_OK(vb->Append(-1)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 3)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(vb->Append(4)); + if constexpr (is_list_view_type::value) { + ASSERT_OK(vb->Append(10)); + ASSERT_OK(vb->Append(11)); + } + std::shared_ptr array; + ASSERT_OK(builder_->Finish(&array)); + builder_->Reset(); + ASSERT_OK(array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*array->data())); + ASSERT_EQ(range.first, 2); + ASSERT_EQ(range.second, 5); + + // Overlapping list-views + vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Append(-2)); + ASSERT_OK(vb->Append(-1)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + if constexpr (is_list_view_type::value) { + ASSERT_OK(builder_->Append(/*is_valid=*/true, 6)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 1)); + ASSERT_OK(vb->Append(4)); + 
ASSERT_OK(vb->Append(5)); + // -- used range ends here -- + ASSERT_OK(vb->Append(10)); + ASSERT_OK(vb->Append(11)); + } else { + ASSERT_OK(builder_->Append(/*is_valid=*/true, 6)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(vb->Append(4)); + ASSERT_OK(vb->Append(5)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 1)); + ASSERT_OK(vb->Append(4)); + } + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->Finish(&array)); + builder_->Reset(); + ASSERT_OK(array->ValidateFull()); + ASSERT_ARRAYS_EQUAL( + *array, *ArrayFromJSON( + type_, "[null, [0, 1, 2, 3, 4, 5], [1, 2], null, [4], null, null]")); + // Check the range + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*array->data())); + ASSERT_EQ(range.first, 2); + if constexpr (is_list_view_type::value) { + ASSERT_EQ(range.second, 6); + } else { + ASSERT_EQ(range.second, 9); + } + // Check the sum of logical sizes as well + ASSERT_OK_AND_ASSIGN(int64_t sum_of_logical_sizes, + list_util::internal::SumOfLogicalListSizes(*array->data())); + ASSERT_EQ(sum_of_logical_sizes, 9); + } + + protected: + MemoryPool* pool_ = default_memory_pool(); + std::shared_ptr type_; + std::shared_ptr value_type_; + std::shared_ptr builder_; +}; + +TYPED_TEST_SUITE(TestListUtils, ListAndListViewTypes); + +TYPED_TEST(TestListUtils, RangeOfValuesUsed) { this->TestRangeOfValuesUsed(); } + +} // namespace arrow diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index e057f6b12fb1b..cca99033c9350 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -63,6 +63,8 @@ ARRAY_VISITOR_DEFAULT(MonthIntervalArray) ARRAY_VISITOR_DEFAULT(DurationArray) ARRAY_VISITOR_DEFAULT(ListArray) ARRAY_VISITOR_DEFAULT(LargeListArray) +ARRAY_VISITOR_DEFAULT(ListViewArray) 
+ARRAY_VISITOR_DEFAULT(LargeListViewArray) ARRAY_VISITOR_DEFAULT(MapArray) ARRAY_VISITOR_DEFAULT(FixedSizeListArray) ARRAY_VISITOR_DEFAULT(StructArray) @@ -117,6 +119,8 @@ TYPE_VISITOR_DEFAULT(Decimal128Type) TYPE_VISITOR_DEFAULT(Decimal256Type) TYPE_VISITOR_DEFAULT(ListType) TYPE_VISITOR_DEFAULT(LargeListType) +TYPE_VISITOR_DEFAULT(ListViewType) +TYPE_VISITOR_DEFAULT(LargeListViewType) TYPE_VISITOR_DEFAULT(MapType) TYPE_VISITOR_DEFAULT(FixedSizeListType) TYPE_VISITOR_DEFAULT(StructType) @@ -170,6 +174,8 @@ SCALAR_VISITOR_DEFAULT(Decimal128Scalar) SCALAR_VISITOR_DEFAULT(Decimal256Scalar) SCALAR_VISITOR_DEFAULT(ListScalar) SCALAR_VISITOR_DEFAULT(LargeListScalar) +SCALAR_VISITOR_DEFAULT(ListViewScalar) +SCALAR_VISITOR_DEFAULT(LargeListViewScalar) SCALAR_VISITOR_DEFAULT(MapScalar) SCALAR_VISITOR_DEFAULT(FixedSizeListScalar) SCALAR_VISITOR_DEFAULT(StructScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 650b0e7ee0a30..75ef46ae4e5c3 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -64,6 +64,8 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const Decimal256Array& array); virtual Status Visit(const ListArray& array); virtual Status Visit(const LargeListArray& array); + virtual Status Visit(const ListViewArray& array); + virtual Status Visit(const LargeListViewArray& array); virtual Status Visit(const MapArray& array); virtual Status Visit(const FixedSizeListArray& array); virtual Status Visit(const StructArray& array); @@ -115,6 +117,8 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const Decimal256Type& type); virtual Status Visit(const ListType& type); virtual Status Visit(const LargeListType& type); + virtual Status Visit(const ListViewType& scalar); + virtual Status Visit(const LargeListViewType& scalar); virtual Status Visit(const MapType& type); virtual Status Visit(const FixedSizeListType& type); virtual Status Visit(const StructType& type); @@ -166,6 +170,8 @@ class ARROW_EXPORT ScalarVisitor { 
virtual Status Visit(const Decimal256Scalar& scalar); virtual Status Visit(const ListScalar& scalar); virtual Status Visit(const LargeListScalar& scalar); + virtual Status Visit(const ListViewScalar& scalar); + virtual Status Visit(const LargeListViewScalar& scalar); virtual Status Visit(const MapScalar& scalar); virtual Status Visit(const FixedSizeListScalar& scalar); virtual Status Visit(const StructScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 4b57abe53ff14..cbb081bfed311 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -59,6 +59,8 @@ namespace arrow { ACTION(Decimal256); \ ACTION(List); \ ACTION(LargeList); \ + ACTION(ListView); \ + ACTION(LargeListView); \ ACTION(Map); \ ACTION(FixedSizeList); \ ACTION(Struct); \ diff --git a/cpp/src/generated/parquet_types.cpp b/cpp/src/generated/parquet_types.cpp index 86188581e0c42..8932c4a4f8d19 100644 --- a/cpp/src/generated/parquet_types.cpp +++ b/cpp/src/generated/parquet_types.cpp @@ -1,5 +1,5 @@ /** - * Autogenerated by Thrift Compiler (0.18.1) + * Autogenerated by Thrift Compiler (0.19.0) * * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING * @generated @@ -615,6 +615,197 @@ std::string to_string(const BoundaryOrder::type& val) { } +SizeStatistics::~SizeStatistics() noexcept { +} + + +void SizeStatistics::__set_unencoded_byte_array_data_bytes(const int64_t val) { + this->unencoded_byte_array_data_bytes = val; +__isset.unencoded_byte_array_data_bytes = true; +} + +void SizeStatistics::__set_repetition_level_histogram(const std::vector & val) { + this->repetition_level_histogram = val; +__isset.repetition_level_histogram = true; +} + +void SizeStatistics::__set_definition_level_histogram(const std::vector & val) { + this->definition_level_histogram = val; +__isset.definition_level_histogram = true; +} +std::ostream& operator<<(std::ostream& out, const SizeStatistics& obj) +{ + obj.printTo(out); + return out; 
+} + + +uint32_t SizeStatistics::read(::apache::thrift::protocol::TProtocol* iprot) { + + ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot); + uint32_t xfer = 0; + std::string fname; + ::apache::thrift::protocol::TType ftype; + int16_t fid; + + xfer += iprot->readStructBegin(fname); + + using ::apache::thrift::protocol::TProtocolException; + + + while (true) + { + xfer += iprot->readFieldBegin(fname, ftype, fid); + if (ftype == ::apache::thrift::protocol::T_STOP) { + break; + } + switch (fid) + { + case 1: + if (ftype == ::apache::thrift::protocol::T_I64) { + xfer += iprot->readI64(this->unencoded_byte_array_data_bytes); + this->__isset.unencoded_byte_array_data_bytes = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 2: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->repetition_level_histogram.clear(); + uint32_t _size0; + ::apache::thrift::protocol::TType _etype3; + xfer += iprot->readListBegin(_etype3, _size0); + this->repetition_level_histogram.resize(_size0); + uint32_t _i4; + for (_i4 = 0; _i4 < _size0; ++_i4) + { + xfer += iprot->readI64(this->repetition_level_histogram[_i4]); + } + xfer += iprot->readListEnd(); + } + this->__isset.repetition_level_histogram = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 3: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->definition_level_histogram.clear(); + uint32_t _size5; + ::apache::thrift::protocol::TType _etype8; + xfer += iprot->readListBegin(_etype8, _size5); + this->definition_level_histogram.resize(_size5); + uint32_t _i9; + for (_i9 = 0; _i9 < _size5; ++_i9) + { + xfer += iprot->readI64(this->definition_level_histogram[_i9]); + } + xfer += iprot->readListEnd(); + } + this->__isset.definition_level_histogram = true; + } else { + xfer += iprot->skip(ftype); + } + break; + default: + xfer += iprot->skip(ftype); + break; + } + xfer += iprot->readFieldEnd(); + } + + xfer += iprot->readStructEnd(); + + return xfer; +} + +uint32_t 
SizeStatistics::write(::apache::thrift::protocol::TProtocol* oprot) const { + uint32_t xfer = 0; + ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot); + xfer += oprot->writeStructBegin("SizeStatistics"); + + if (this->__isset.unencoded_byte_array_data_bytes) { + xfer += oprot->writeFieldBegin("unencoded_byte_array_data_bytes", ::apache::thrift::protocol::T_I64, 1); + xfer += oprot->writeI64(this->unencoded_byte_array_data_bytes); + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.repetition_level_histogram) { + xfer += oprot->writeFieldBegin("repetition_level_histogram", ::apache::thrift::protocol::T_LIST, 2); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->repetition_level_histogram.size())); + std::vector ::const_iterator _iter10; + for (_iter10 = this->repetition_level_histogram.begin(); _iter10 != this->repetition_level_histogram.end(); ++_iter10) + { + xfer += oprot->writeI64((*_iter10)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.definition_level_histogram) { + xfer += oprot->writeFieldBegin("definition_level_histogram", ::apache::thrift::protocol::T_LIST, 3); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->definition_level_histogram.size())); + std::vector ::const_iterator _iter11; + for (_iter11 = this->definition_level_histogram.begin(); _iter11 != this->definition_level_histogram.end(); ++_iter11) + { + xfer += oprot->writeI64((*_iter11)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } + xfer += oprot->writeFieldStop(); + xfer += oprot->writeStructEnd(); + return xfer; +} + +void swap(SizeStatistics &a, SizeStatistics &b) { + using ::std::swap; + swap(a.unencoded_byte_array_data_bytes, b.unencoded_byte_array_data_bytes); + swap(a.repetition_level_histogram, b.repetition_level_histogram); + swap(a.definition_level_histogram, b.definition_level_histogram); + 
swap(a.__isset, b.__isset); +} + +SizeStatistics::SizeStatistics(const SizeStatistics& other12) { + unencoded_byte_array_data_bytes = other12.unencoded_byte_array_data_bytes; + repetition_level_histogram = other12.repetition_level_histogram; + definition_level_histogram = other12.definition_level_histogram; + __isset = other12.__isset; +} +SizeStatistics::SizeStatistics(SizeStatistics&& other13) noexcept { + unencoded_byte_array_data_bytes = other13.unencoded_byte_array_data_bytes; + repetition_level_histogram = std::move(other13.repetition_level_histogram); + definition_level_histogram = std::move(other13.definition_level_histogram); + __isset = other13.__isset; +} +SizeStatistics& SizeStatistics::operator=(const SizeStatistics& other14) { + unencoded_byte_array_data_bytes = other14.unencoded_byte_array_data_bytes; + repetition_level_histogram = other14.repetition_level_histogram; + definition_level_histogram = other14.definition_level_histogram; + __isset = other14.__isset; + return *this; +} +SizeStatistics& SizeStatistics::operator=(SizeStatistics&& other15) noexcept { + unencoded_byte_array_data_bytes = other15.unencoded_byte_array_data_bytes; + repetition_level_histogram = std::move(other15.repetition_level_histogram); + definition_level_histogram = std::move(other15.definition_level_histogram); + __isset = other15.__isset; + return *this; +} +void SizeStatistics::printTo(std::ostream& out) const { + using ::apache::thrift::to_string; + out << "SizeStatistics("; + out << "unencoded_byte_array_data_bytes="; (__isset.unencoded_byte_array_data_bytes ? (out << to_string(unencoded_byte_array_data_bytes)) : (out << "")); + out << ", " << "repetition_level_histogram="; (__isset.repetition_level_histogram ? (out << to_string(repetition_level_histogram)) : (out << "")); + out << ", " << "definition_level_histogram="; (__isset.definition_level_histogram ? 
(out << to_string(definition_level_histogram)) : (out << "")); + out << ")"; +} + + Statistics::~Statistics() noexcept { } @@ -648,6 +839,16 @@ void Statistics::__set_min_value(const std::string& val) { this->min_value = val; __isset.min_value = true; } + +void Statistics::__set_is_max_value_exact(const bool val) { + this->is_max_value_exact = val; +__isset.is_max_value_exact = true; +} + +void Statistics::__set_is_min_value_exact(const bool val) { + this->is_min_value_exact = val; +__isset.is_min_value_exact = true; +} std::ostream& operator<<(std::ostream& out, const Statistics& obj) { obj.printTo(out); @@ -724,6 +925,22 @@ uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 7: + if (ftype == ::apache::thrift::protocol::T_BOOL) { + xfer += iprot->readBool(this->is_max_value_exact); + this->__isset.is_max_value_exact = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 8: + if (ftype == ::apache::thrift::protocol::T_BOOL) { + xfer += iprot->readBool(this->is_min_value_exact); + this->__isset.is_min_value_exact = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -771,6 +988,16 @@ uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeBinary(this->min_value); xfer += oprot->writeFieldEnd(); } + if (this->__isset.is_max_value_exact) { + xfer += oprot->writeFieldBegin("is_max_value_exact", ::apache::thrift::protocol::T_BOOL, 7); + xfer += oprot->writeBool(this->is_max_value_exact); + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.is_min_value_exact) { + xfer += oprot->writeFieldBegin("is_min_value_exact", ::apache::thrift::protocol::T_BOOL, 8); + xfer += oprot->writeBool(this->is_min_value_exact); + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -784,45 +1011,55 @@ void swap(Statistics &a, Statistics &b) { 
swap(a.distinct_count, b.distinct_count); swap(a.max_value, b.max_value); swap(a.min_value, b.min_value); + swap(a.is_max_value_exact, b.is_max_value_exact); + swap(a.is_min_value_exact, b.is_min_value_exact); swap(a.__isset, b.__isset); } -Statistics::Statistics(const Statistics& other0) { - max = other0.max; - min = other0.min; - null_count = other0.null_count; - distinct_count = other0.distinct_count; - max_value = other0.max_value; - min_value = other0.min_value; - __isset = other0.__isset; -} -Statistics::Statistics(Statistics&& other1) noexcept { - max = std::move(other1.max); - min = std::move(other1.min); - null_count = other1.null_count; - distinct_count = other1.distinct_count; - max_value = std::move(other1.max_value); - min_value = std::move(other1.min_value); - __isset = other1.__isset; -} -Statistics& Statistics::operator=(const Statistics& other2) { - max = other2.max; - min = other2.min; - null_count = other2.null_count; - distinct_count = other2.distinct_count; - max_value = other2.max_value; - min_value = other2.min_value; - __isset = other2.__isset; +Statistics::Statistics(const Statistics& other16) { + max = other16.max; + min = other16.min; + null_count = other16.null_count; + distinct_count = other16.distinct_count; + max_value = other16.max_value; + min_value = other16.min_value; + is_max_value_exact = other16.is_max_value_exact; + is_min_value_exact = other16.is_min_value_exact; + __isset = other16.__isset; +} +Statistics::Statistics(Statistics&& other17) noexcept { + max = std::move(other17.max); + min = std::move(other17.min); + null_count = other17.null_count; + distinct_count = other17.distinct_count; + max_value = std::move(other17.max_value); + min_value = std::move(other17.min_value); + is_max_value_exact = other17.is_max_value_exact; + is_min_value_exact = other17.is_min_value_exact; + __isset = other17.__isset; +} +Statistics& Statistics::operator=(const Statistics& other18) { + max = other18.max; + min = other18.min; + null_count = 
other18.null_count; + distinct_count = other18.distinct_count; + max_value = other18.max_value; + min_value = other18.min_value; + is_max_value_exact = other18.is_max_value_exact; + is_min_value_exact = other18.is_min_value_exact; + __isset = other18.__isset; return *this; } -Statistics& Statistics::operator=(Statistics&& other3) noexcept { - max = std::move(other3.max); - min = std::move(other3.min); - null_count = other3.null_count; - distinct_count = other3.distinct_count; - max_value = std::move(other3.max_value); - min_value = std::move(other3.min_value); - __isset = other3.__isset; +Statistics& Statistics::operator=(Statistics&& other19) noexcept { + max = std::move(other19.max); + min = std::move(other19.min); + null_count = other19.null_count; + distinct_count = other19.distinct_count; + max_value = std::move(other19.max_value); + min_value = std::move(other19.min_value); + is_max_value_exact = other19.is_max_value_exact; + is_min_value_exact = other19.is_min_value_exact; + __isset = other19.__isset; return *this; } void Statistics::printTo(std::ostream& out) const { @@ -834,6 +1071,8 @@ void Statistics::printTo(std::ostream& out) const { out << ", " << "distinct_count="; (__isset.distinct_count ? (out << to_string(distinct_count)) : (out << "")); out << ", " << "max_value="; (__isset.max_value ? (out << to_string(max_value)) : (out << "")); out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "")); + out << ", " << "is_max_value_exact="; (__isset.is_max_value_exact ? (out << to_string(is_max_value_exact)) : (out << "")); + out << ", " << "is_min_value_exact="; (__isset.is_min_value_exact ? 
(out << to_string(is_min_value_exact)) : (out << "")); out << ")"; } @@ -892,18 +1131,18 @@ void swap(StringType &a, StringType &b) { (void) b; } -StringType::StringType(const StringType& other4) noexcept { - (void) other4; +StringType::StringType(const StringType& other20) noexcept { + (void) other20; } -StringType::StringType(StringType&& other5) noexcept { - (void) other5; +StringType::StringType(StringType&& other21) noexcept { + (void) other21; } -StringType& StringType::operator=(const StringType& other6) noexcept { - (void) other6; +StringType& StringType::operator=(const StringType& other22) noexcept { + (void) other22; return *this; } -StringType& StringType::operator=(StringType&& other7) noexcept { - (void) other7; +StringType& StringType::operator=(StringType&& other23) noexcept { + (void) other23; return *this; } void StringType::printTo(std::ostream& out) const { @@ -967,18 +1206,18 @@ void swap(UUIDType &a, UUIDType &b) { (void) b; } -UUIDType::UUIDType(const UUIDType& other8) noexcept { - (void) other8; +UUIDType::UUIDType(const UUIDType& other24) noexcept { + (void) other24; } -UUIDType::UUIDType(UUIDType&& other9) noexcept { - (void) other9; +UUIDType::UUIDType(UUIDType&& other25) noexcept { + (void) other25; } -UUIDType& UUIDType::operator=(const UUIDType& other10) noexcept { - (void) other10; +UUIDType& UUIDType::operator=(const UUIDType& other26) noexcept { + (void) other26; return *this; } -UUIDType& UUIDType::operator=(UUIDType&& other11) noexcept { - (void) other11; +UUIDType& UUIDType::operator=(UUIDType&& other27) noexcept { + (void) other27; return *this; } void UUIDType::printTo(std::ostream& out) const { @@ -1042,18 +1281,18 @@ void swap(MapType &a, MapType &b) { (void) b; } -MapType::MapType(const MapType& other12) noexcept { - (void) other12; +MapType::MapType(const MapType& other28) noexcept { + (void) other28; } -MapType::MapType(MapType&& other13) noexcept { - (void) other13; +MapType::MapType(MapType&& other29) noexcept { + (void) 
other29; } -MapType& MapType::operator=(const MapType& other14) noexcept { - (void) other14; +MapType& MapType::operator=(const MapType& other30) noexcept { + (void) other30; return *this; } -MapType& MapType::operator=(MapType&& other15) noexcept { - (void) other15; +MapType& MapType::operator=(MapType&& other31) noexcept { + (void) other31; return *this; } void MapType::printTo(std::ostream& out) const { @@ -1117,18 +1356,18 @@ void swap(ListType &a, ListType &b) { (void) b; } -ListType::ListType(const ListType& other16) noexcept { - (void) other16; +ListType::ListType(const ListType& other32) noexcept { + (void) other32; } -ListType::ListType(ListType&& other17) noexcept { - (void) other17; +ListType::ListType(ListType&& other33) noexcept { + (void) other33; } -ListType& ListType::operator=(const ListType& other18) noexcept { - (void) other18; +ListType& ListType::operator=(const ListType& other34) noexcept { + (void) other34; return *this; } -ListType& ListType::operator=(ListType&& other19) noexcept { - (void) other19; +ListType& ListType::operator=(ListType&& other35) noexcept { + (void) other35; return *this; } void ListType::printTo(std::ostream& out) const { @@ -1192,18 +1431,18 @@ void swap(EnumType &a, EnumType &b) { (void) b; } -EnumType::EnumType(const EnumType& other20) noexcept { - (void) other20; +EnumType::EnumType(const EnumType& other36) noexcept { + (void) other36; } -EnumType::EnumType(EnumType&& other21) noexcept { - (void) other21; +EnumType::EnumType(EnumType&& other37) noexcept { + (void) other37; } -EnumType& EnumType::operator=(const EnumType& other22) noexcept { - (void) other22; +EnumType& EnumType::operator=(const EnumType& other38) noexcept { + (void) other38; return *this; } -EnumType& EnumType::operator=(EnumType&& other23) noexcept { - (void) other23; +EnumType& EnumType::operator=(EnumType&& other39) noexcept { + (void) other39; return *this; } void EnumType::printTo(std::ostream& out) const { @@ -1267,18 +1506,18 @@ void 
swap(DateType &a, DateType &b) { (void) b; } -DateType::DateType(const DateType& other24) noexcept { - (void) other24; +DateType::DateType(const DateType& other40) noexcept { + (void) other40; } -DateType::DateType(DateType&& other25) noexcept { - (void) other25; +DateType::DateType(DateType&& other41) noexcept { + (void) other41; } -DateType& DateType::operator=(const DateType& other26) noexcept { - (void) other26; +DateType& DateType::operator=(const DateType& other42) noexcept { + (void) other42; return *this; } -DateType& DateType::operator=(DateType&& other27) noexcept { - (void) other27; +DateType& DateType::operator=(DateType&& other43) noexcept { + (void) other43; return *this; } void DateType::printTo(std::ostream& out) const { @@ -1342,18 +1581,18 @@ void swap(Float16Type &a, Float16Type &b) { (void) b; } -Float16Type::Float16Type(const Float16Type& other28) noexcept { - (void) other28; +Float16Type::Float16Type(const Float16Type& other44) noexcept { + (void) other44; } -Float16Type::Float16Type(Float16Type&& other29) noexcept { - (void) other29; +Float16Type::Float16Type(Float16Type&& other45) noexcept { + (void) other45; } -Float16Type& Float16Type::operator=(const Float16Type& other30) noexcept { - (void) other30; +Float16Type& Float16Type::operator=(const Float16Type& other46) noexcept { + (void) other46; return *this; } -Float16Type& Float16Type::operator=(Float16Type&& other31) noexcept { - (void) other31; +Float16Type& Float16Type::operator=(Float16Type&& other47) noexcept { + (void) other47; return *this; } void Float16Type::printTo(std::ostream& out) const { @@ -1417,18 +1656,18 @@ void swap(NullType &a, NullType &b) { (void) b; } -NullType::NullType(const NullType& other32) noexcept { - (void) other32; +NullType::NullType(const NullType& other48) noexcept { + (void) other48; } -NullType::NullType(NullType&& other33) noexcept { - (void) other33; +NullType::NullType(NullType&& other49) noexcept { + (void) other49; } -NullType& 
NullType::operator=(const NullType& other34) noexcept { - (void) other34; +NullType& NullType::operator=(const NullType& other50) noexcept { + (void) other50; return *this; } -NullType& NullType::operator=(NullType&& other35) noexcept { - (void) other35; +NullType& NullType::operator=(NullType&& other51) noexcept { + (void) other51; return *this; } void NullType::printTo(std::ostream& out) const { @@ -1535,22 +1774,22 @@ void swap(DecimalType &a, DecimalType &b) { swap(a.precision, b.precision); } -DecimalType::DecimalType(const DecimalType& other36) noexcept { - scale = other36.scale; - precision = other36.precision; +DecimalType::DecimalType(const DecimalType& other52) noexcept { + scale = other52.scale; + precision = other52.precision; } -DecimalType::DecimalType(DecimalType&& other37) noexcept { - scale = other37.scale; - precision = other37.precision; +DecimalType::DecimalType(DecimalType&& other53) noexcept { + scale = other53.scale; + precision = other53.precision; } -DecimalType& DecimalType::operator=(const DecimalType& other38) noexcept { - scale = other38.scale; - precision = other38.precision; +DecimalType& DecimalType::operator=(const DecimalType& other54) noexcept { + scale = other54.scale; + precision = other54.precision; return *this; } -DecimalType& DecimalType::operator=(DecimalType&& other39) noexcept { - scale = other39.scale; - precision = other39.precision; +DecimalType& DecimalType::operator=(DecimalType&& other55) noexcept { + scale = other55.scale; + precision = other55.precision; return *this; } void DecimalType::printTo(std::ostream& out) const { @@ -1616,18 +1855,18 @@ void swap(MilliSeconds &a, MilliSeconds &b) { (void) b; } -MilliSeconds::MilliSeconds(const MilliSeconds& other40) noexcept { - (void) other40; +MilliSeconds::MilliSeconds(const MilliSeconds& other56) noexcept { + (void) other56; } -MilliSeconds::MilliSeconds(MilliSeconds&& other41) noexcept { - (void) other41; +MilliSeconds::MilliSeconds(MilliSeconds&& other57) noexcept { 
+ (void) other57; } -MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other42) noexcept { - (void) other42; +MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other58) noexcept { + (void) other58; return *this; } -MilliSeconds& MilliSeconds::operator=(MilliSeconds&& other43) noexcept { - (void) other43; +MilliSeconds& MilliSeconds::operator=(MilliSeconds&& other59) noexcept { + (void) other59; return *this; } void MilliSeconds::printTo(std::ostream& out) const { @@ -1691,18 +1930,18 @@ void swap(MicroSeconds &a, MicroSeconds &b) { (void) b; } -MicroSeconds::MicroSeconds(const MicroSeconds& other44) noexcept { - (void) other44; +MicroSeconds::MicroSeconds(const MicroSeconds& other60) noexcept { + (void) other60; } -MicroSeconds::MicroSeconds(MicroSeconds&& other45) noexcept { - (void) other45; +MicroSeconds::MicroSeconds(MicroSeconds&& other61) noexcept { + (void) other61; } -MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other46) noexcept { - (void) other46; +MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other62) noexcept { + (void) other62; return *this; } -MicroSeconds& MicroSeconds::operator=(MicroSeconds&& other47) noexcept { - (void) other47; +MicroSeconds& MicroSeconds::operator=(MicroSeconds&& other63) noexcept { + (void) other63; return *this; } void MicroSeconds::printTo(std::ostream& out) const { @@ -1766,18 +2005,18 @@ void swap(NanoSeconds &a, NanoSeconds &b) { (void) b; } -NanoSeconds::NanoSeconds(const NanoSeconds& other48) noexcept { - (void) other48; +NanoSeconds::NanoSeconds(const NanoSeconds& other64) noexcept { + (void) other64; } -NanoSeconds::NanoSeconds(NanoSeconds&& other49) noexcept { - (void) other49; +NanoSeconds::NanoSeconds(NanoSeconds&& other65) noexcept { + (void) other65; } -NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other50) noexcept { - (void) other50; +NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other66) noexcept { + (void) other66; return *this; } -NanoSeconds& 
NanoSeconds::operator=(NanoSeconds&& other51) noexcept { - (void) other51; +NanoSeconds& NanoSeconds::operator=(NanoSeconds&& other67) noexcept { + (void) other67; return *this; } void NanoSeconds::printTo(std::ostream& out) const { @@ -1902,30 +2141,30 @@ void swap(TimeUnit &a, TimeUnit &b) { swap(a.__isset, b.__isset); } -TimeUnit::TimeUnit(const TimeUnit& other52) noexcept { - MILLIS = other52.MILLIS; - MICROS = other52.MICROS; - NANOS = other52.NANOS; - __isset = other52.__isset; +TimeUnit::TimeUnit(const TimeUnit& other68) noexcept { + MILLIS = other68.MILLIS; + MICROS = other68.MICROS; + NANOS = other68.NANOS; + __isset = other68.__isset; } -TimeUnit::TimeUnit(TimeUnit&& other53) noexcept { - MILLIS = std::move(other53.MILLIS); - MICROS = std::move(other53.MICROS); - NANOS = std::move(other53.NANOS); - __isset = other53.__isset; +TimeUnit::TimeUnit(TimeUnit&& other69) noexcept { + MILLIS = std::move(other69.MILLIS); + MICROS = std::move(other69.MICROS); + NANOS = std::move(other69.NANOS); + __isset = other69.__isset; } -TimeUnit& TimeUnit::operator=(const TimeUnit& other54) noexcept { - MILLIS = other54.MILLIS; - MICROS = other54.MICROS; - NANOS = other54.NANOS; - __isset = other54.__isset; +TimeUnit& TimeUnit::operator=(const TimeUnit& other70) noexcept { + MILLIS = other70.MILLIS; + MICROS = other70.MICROS; + NANOS = other70.NANOS; + __isset = other70.__isset; return *this; } -TimeUnit& TimeUnit::operator=(TimeUnit&& other55) noexcept { - MILLIS = std::move(other55.MILLIS); - MICROS = std::move(other55.MICROS); - NANOS = std::move(other55.NANOS); - __isset = other55.__isset; +TimeUnit& TimeUnit::operator=(TimeUnit&& other71) noexcept { + MILLIS = std::move(other71.MILLIS); + MICROS = std::move(other71.MICROS); + NANOS = std::move(other71.NANOS); + __isset = other71.__isset; return *this; } void TimeUnit::printTo(std::ostream& out) const { @@ -2035,22 +2274,22 @@ void swap(TimestampType &a, TimestampType &b) { swap(a.unit, b.unit); } 
-TimestampType::TimestampType(const TimestampType& other56) noexcept { - isAdjustedToUTC = other56.isAdjustedToUTC; - unit = other56.unit; +TimestampType::TimestampType(const TimestampType& other72) noexcept { + isAdjustedToUTC = other72.isAdjustedToUTC; + unit = other72.unit; } -TimestampType::TimestampType(TimestampType&& other57) noexcept { - isAdjustedToUTC = other57.isAdjustedToUTC; - unit = std::move(other57.unit); +TimestampType::TimestampType(TimestampType&& other73) noexcept { + isAdjustedToUTC = other73.isAdjustedToUTC; + unit = std::move(other73.unit); } -TimestampType& TimestampType::operator=(const TimestampType& other58) noexcept { - isAdjustedToUTC = other58.isAdjustedToUTC; - unit = other58.unit; +TimestampType& TimestampType::operator=(const TimestampType& other74) noexcept { + isAdjustedToUTC = other74.isAdjustedToUTC; + unit = other74.unit; return *this; } -TimestampType& TimestampType::operator=(TimestampType&& other59) noexcept { - isAdjustedToUTC = other59.isAdjustedToUTC; - unit = std::move(other59.unit); +TimestampType& TimestampType::operator=(TimestampType&& other75) noexcept { + isAdjustedToUTC = other75.isAdjustedToUTC; + unit = std::move(other75.unit); return *this; } void TimestampType::printTo(std::ostream& out) const { @@ -2159,22 +2398,22 @@ void swap(TimeType &a, TimeType &b) { swap(a.unit, b.unit); } -TimeType::TimeType(const TimeType& other60) noexcept { - isAdjustedToUTC = other60.isAdjustedToUTC; - unit = other60.unit; +TimeType::TimeType(const TimeType& other76) noexcept { + isAdjustedToUTC = other76.isAdjustedToUTC; + unit = other76.unit; } -TimeType::TimeType(TimeType&& other61) noexcept { - isAdjustedToUTC = other61.isAdjustedToUTC; - unit = std::move(other61.unit); +TimeType::TimeType(TimeType&& other77) noexcept { + isAdjustedToUTC = other77.isAdjustedToUTC; + unit = std::move(other77.unit); } -TimeType& TimeType::operator=(const TimeType& other62) noexcept { - isAdjustedToUTC = other62.isAdjustedToUTC; - unit = 
other62.unit; +TimeType& TimeType::operator=(const TimeType& other78) noexcept { + isAdjustedToUTC = other78.isAdjustedToUTC; + unit = other78.unit; return *this; } -TimeType& TimeType::operator=(TimeType&& other63) noexcept { - isAdjustedToUTC = other63.isAdjustedToUTC; - unit = std::move(other63.unit); +TimeType& TimeType::operator=(TimeType&& other79) noexcept { + isAdjustedToUTC = other79.isAdjustedToUTC; + unit = std::move(other79.unit); return *this; } void TimeType::printTo(std::ostream& out) const { @@ -2283,22 +2522,22 @@ void swap(IntType &a, IntType &b) { swap(a.isSigned, b.isSigned); } -IntType::IntType(const IntType& other64) noexcept { - bitWidth = other64.bitWidth; - isSigned = other64.isSigned; +IntType::IntType(const IntType& other80) noexcept { + bitWidth = other80.bitWidth; + isSigned = other80.isSigned; } -IntType::IntType(IntType&& other65) noexcept { - bitWidth = other65.bitWidth; - isSigned = other65.isSigned; +IntType::IntType(IntType&& other81) noexcept { + bitWidth = other81.bitWidth; + isSigned = other81.isSigned; } -IntType& IntType::operator=(const IntType& other66) noexcept { - bitWidth = other66.bitWidth; - isSigned = other66.isSigned; +IntType& IntType::operator=(const IntType& other82) noexcept { + bitWidth = other82.bitWidth; + isSigned = other82.isSigned; return *this; } -IntType& IntType::operator=(IntType&& other67) noexcept { - bitWidth = other67.bitWidth; - isSigned = other67.isSigned; +IntType& IntType::operator=(IntType&& other83) noexcept { + bitWidth = other83.bitWidth; + isSigned = other83.isSigned; return *this; } void IntType::printTo(std::ostream& out) const { @@ -2364,18 +2603,18 @@ void swap(JsonType &a, JsonType &b) { (void) b; } -JsonType::JsonType(const JsonType& other68) noexcept { - (void) other68; +JsonType::JsonType(const JsonType& other84) noexcept { + (void) other84; } -JsonType::JsonType(JsonType&& other69) noexcept { - (void) other69; +JsonType::JsonType(JsonType&& other85) noexcept { + (void) other85; } 
-JsonType& JsonType::operator=(const JsonType& other70) noexcept { - (void) other70; +JsonType& JsonType::operator=(const JsonType& other86) noexcept { + (void) other86; return *this; } -JsonType& JsonType::operator=(JsonType&& other71) noexcept { - (void) other71; +JsonType& JsonType::operator=(JsonType&& other87) noexcept { + (void) other87; return *this; } void JsonType::printTo(std::ostream& out) const { @@ -2439,18 +2678,18 @@ void swap(BsonType &a, BsonType &b) { (void) b; } -BsonType::BsonType(const BsonType& other72) noexcept { - (void) other72; +BsonType::BsonType(const BsonType& other88) noexcept { + (void) other88; } -BsonType::BsonType(BsonType&& other73) noexcept { - (void) other73; +BsonType::BsonType(BsonType&& other89) noexcept { + (void) other89; } -BsonType& BsonType::operator=(const BsonType& other74) noexcept { - (void) other74; +BsonType& BsonType::operator=(const BsonType& other90) noexcept { + (void) other90; return *this; } -BsonType& BsonType::operator=(BsonType&& other75) noexcept { - (void) other75; +BsonType& BsonType::operator=(BsonType&& other91) noexcept { + (void) other91; return *this; } void BsonType::printTo(std::ostream& out) const { @@ -2784,74 +3023,74 @@ void swap(LogicalType &a, LogicalType &b) { swap(a.__isset, b.__isset); } -LogicalType::LogicalType(const LogicalType& other76) noexcept { - STRING = other76.STRING; - MAP = other76.MAP; - LIST = other76.LIST; - ENUM = other76.ENUM; - DECIMAL = other76.DECIMAL; - DATE = other76.DATE; - TIME = other76.TIME; - TIMESTAMP = other76.TIMESTAMP; - INTEGER = other76.INTEGER; - UNKNOWN = other76.UNKNOWN; - JSON = other76.JSON; - BSON = other76.BSON; - UUID = other76.UUID; - FLOAT16 = other76.FLOAT16; - __isset = other76.__isset; -} -LogicalType::LogicalType(LogicalType&& other77) noexcept { - STRING = std::move(other77.STRING); - MAP = std::move(other77.MAP); - LIST = std::move(other77.LIST); - ENUM = std::move(other77.ENUM); - DECIMAL = std::move(other77.DECIMAL); - DATE = 
std::move(other77.DATE); - TIME = std::move(other77.TIME); - TIMESTAMP = std::move(other77.TIMESTAMP); - INTEGER = std::move(other77.INTEGER); - UNKNOWN = std::move(other77.UNKNOWN); - JSON = std::move(other77.JSON); - BSON = std::move(other77.BSON); - UUID = std::move(other77.UUID); - FLOAT16 = std::move(other77.FLOAT16); - __isset = other77.__isset; -} -LogicalType& LogicalType::operator=(const LogicalType& other78) noexcept { - STRING = other78.STRING; - MAP = other78.MAP; - LIST = other78.LIST; - ENUM = other78.ENUM; - DECIMAL = other78.DECIMAL; - DATE = other78.DATE; - TIME = other78.TIME; - TIMESTAMP = other78.TIMESTAMP; - INTEGER = other78.INTEGER; - UNKNOWN = other78.UNKNOWN; - JSON = other78.JSON; - BSON = other78.BSON; - UUID = other78.UUID; - FLOAT16 = other78.FLOAT16; - __isset = other78.__isset; +LogicalType::LogicalType(const LogicalType& other92) noexcept { + STRING = other92.STRING; + MAP = other92.MAP; + LIST = other92.LIST; + ENUM = other92.ENUM; + DECIMAL = other92.DECIMAL; + DATE = other92.DATE; + TIME = other92.TIME; + TIMESTAMP = other92.TIMESTAMP; + INTEGER = other92.INTEGER; + UNKNOWN = other92.UNKNOWN; + JSON = other92.JSON; + BSON = other92.BSON; + UUID = other92.UUID; + FLOAT16 = other92.FLOAT16; + __isset = other92.__isset; +} +LogicalType::LogicalType(LogicalType&& other93) noexcept { + STRING = std::move(other93.STRING); + MAP = std::move(other93.MAP); + LIST = std::move(other93.LIST); + ENUM = std::move(other93.ENUM); + DECIMAL = std::move(other93.DECIMAL); + DATE = std::move(other93.DATE); + TIME = std::move(other93.TIME); + TIMESTAMP = std::move(other93.TIMESTAMP); + INTEGER = std::move(other93.INTEGER); + UNKNOWN = std::move(other93.UNKNOWN); + JSON = std::move(other93.JSON); + BSON = std::move(other93.BSON); + UUID = std::move(other93.UUID); + FLOAT16 = std::move(other93.FLOAT16); + __isset = other93.__isset; +} +LogicalType& LogicalType::operator=(const LogicalType& other94) noexcept { + STRING = other94.STRING; + MAP = 
other94.MAP; + LIST = other94.LIST; + ENUM = other94.ENUM; + DECIMAL = other94.DECIMAL; + DATE = other94.DATE; + TIME = other94.TIME; + TIMESTAMP = other94.TIMESTAMP; + INTEGER = other94.INTEGER; + UNKNOWN = other94.UNKNOWN; + JSON = other94.JSON; + BSON = other94.BSON; + UUID = other94.UUID; + FLOAT16 = other94.FLOAT16; + __isset = other94.__isset; return *this; } -LogicalType& LogicalType::operator=(LogicalType&& other79) noexcept { - STRING = std::move(other79.STRING); - MAP = std::move(other79.MAP); - LIST = std::move(other79.LIST); - ENUM = std::move(other79.ENUM); - DECIMAL = std::move(other79.DECIMAL); - DATE = std::move(other79.DATE); - TIME = std::move(other79.TIME); - TIMESTAMP = std::move(other79.TIMESTAMP); - INTEGER = std::move(other79.INTEGER); - UNKNOWN = std::move(other79.UNKNOWN); - JSON = std::move(other79.JSON); - BSON = std::move(other79.BSON); - UUID = std::move(other79.UUID); - FLOAT16 = std::move(other79.FLOAT16); - __isset = other79.__isset; +LogicalType& LogicalType::operator=(LogicalType&& other95) noexcept { + STRING = std::move(other95.STRING); + MAP = std::move(other95.MAP); + LIST = std::move(other95.LIST); + ENUM = std::move(other95.ENUM); + DECIMAL = std::move(other95.DECIMAL); + DATE = std::move(other95.DATE); + TIME = std::move(other95.TIME); + TIMESTAMP = std::move(other95.TIMESTAMP); + INTEGER = std::move(other95.INTEGER); + UNKNOWN = std::move(other95.UNKNOWN); + JSON = std::move(other95.JSON); + BSON = std::move(other95.BSON); + UUID = std::move(other95.UUID); + FLOAT16 = std::move(other95.FLOAT16); + __isset = other95.__isset; return *this; } void LogicalType::printTo(std::ostream& out) const { @@ -2958,9 +3197,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast80; - xfer += iprot->readI32(ecast80); - this->type = static_cast(ecast80); + int32_t ecast96; + xfer += iprot->readI32(ecast96); + this->type = 
static_cast(ecast96); this->__isset.type = true; } else { xfer += iprot->skip(ftype); @@ -2976,9 +3215,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 3: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast81; - xfer += iprot->readI32(ecast81); - this->repetition_type = static_cast(ecast81); + int32_t ecast97; + xfer += iprot->readI32(ecast97); + this->repetition_type = static_cast(ecast97); this->__isset.repetition_type = true; } else { xfer += iprot->skip(ftype); @@ -3002,9 +3241,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 6: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast82; - xfer += iprot->readI32(ecast82); - this->converted_type = static_cast(ecast82); + int32_t ecast98; + xfer += iprot->readI32(ecast98); + this->converted_type = static_cast(ecast98); this->__isset.converted_type = true; } else { xfer += iprot->skip(ftype); @@ -3130,58 +3369,58 @@ void swap(SchemaElement &a, SchemaElement &b) { swap(a.__isset, b.__isset); } -SchemaElement::SchemaElement(const SchemaElement& other83) { - type = other83.type; - type_length = other83.type_length; - repetition_type = other83.repetition_type; - name = other83.name; - num_children = other83.num_children; - converted_type = other83.converted_type; - scale = other83.scale; - precision = other83.precision; - field_id = other83.field_id; - logicalType = other83.logicalType; - __isset = other83.__isset; -} -SchemaElement::SchemaElement(SchemaElement&& other84) noexcept { - type = other84.type; - type_length = other84.type_length; - repetition_type = other84.repetition_type; - name = std::move(other84.name); - num_children = other84.num_children; - converted_type = other84.converted_type; - scale = other84.scale; - precision = other84.precision; - field_id = other84.field_id; - logicalType = std::move(other84.logicalType); - __isset = other84.__isset; -} -SchemaElement& 
SchemaElement::operator=(const SchemaElement& other85) { - type = other85.type; - type_length = other85.type_length; - repetition_type = other85.repetition_type; - name = other85.name; - num_children = other85.num_children; - converted_type = other85.converted_type; - scale = other85.scale; - precision = other85.precision; - field_id = other85.field_id; - logicalType = other85.logicalType; - __isset = other85.__isset; +SchemaElement::SchemaElement(const SchemaElement& other99) { + type = other99.type; + type_length = other99.type_length; + repetition_type = other99.repetition_type; + name = other99.name; + num_children = other99.num_children; + converted_type = other99.converted_type; + scale = other99.scale; + precision = other99.precision; + field_id = other99.field_id; + logicalType = other99.logicalType; + __isset = other99.__isset; +} +SchemaElement::SchemaElement(SchemaElement&& other100) noexcept { + type = other100.type; + type_length = other100.type_length; + repetition_type = other100.repetition_type; + name = std::move(other100.name); + num_children = other100.num_children; + converted_type = other100.converted_type; + scale = other100.scale; + precision = other100.precision; + field_id = other100.field_id; + logicalType = std::move(other100.logicalType); + __isset = other100.__isset; +} +SchemaElement& SchemaElement::operator=(const SchemaElement& other101) { + type = other101.type; + type_length = other101.type_length; + repetition_type = other101.repetition_type; + name = other101.name; + num_children = other101.num_children; + converted_type = other101.converted_type; + scale = other101.scale; + precision = other101.precision; + field_id = other101.field_id; + logicalType = other101.logicalType; + __isset = other101.__isset; return *this; } -SchemaElement& SchemaElement::operator=(SchemaElement&& other86) noexcept { - type = other86.type; - type_length = other86.type_length; - repetition_type = other86.repetition_type; - name = 
std::move(other86.name); - num_children = other86.num_children; - converted_type = other86.converted_type; - scale = other86.scale; - precision = other86.precision; - field_id = other86.field_id; - logicalType = std::move(other86.logicalType); - __isset = other86.__isset; +SchemaElement& SchemaElement::operator=(SchemaElement&& other102) noexcept { + type = other102.type; + type_length = other102.type_length; + repetition_type = other102.repetition_type; + name = std::move(other102.name); + num_children = other102.num_children; + converted_type = other102.converted_type; + scale = other102.scale; + precision = other102.precision; + field_id = other102.field_id; + logicalType = std::move(other102.logicalType); + __isset = other102.__isset; return *this; } void SchemaElement::printTo(std::ostream& out) const { @@ -3267,9 +3506,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast87; - xfer += iprot->readI32(ecast87); - this->encoding = static_cast(ecast87); + int32_t ecast103; + xfer += iprot->readI32(ecast103); + this->encoding = static_cast(ecast103); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3277,9 +3516,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 3: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast88; - xfer += iprot->readI32(ecast88); - this->definition_level_encoding = static_cast(ecast88); + int32_t ecast104; + xfer += iprot->readI32(ecast104); + this->definition_level_encoding = static_cast(ecast104); isset_definition_level_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3287,9 +3526,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast89; - xfer += iprot->readI32(ecast89); - this->repetition_level_encoding = static_cast(ecast89); + int32_t 
ecast105; + xfer += iprot->readI32(ecast105); + this->repetition_level_encoding = static_cast(ecast105); isset_repetition_level_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3364,38 +3603,38 @@ void swap(DataPageHeader &a, DataPageHeader &b) { swap(a.__isset, b.__isset); } -DataPageHeader::DataPageHeader(const DataPageHeader& other90) { - num_values = other90.num_values; - encoding = other90.encoding; - definition_level_encoding = other90.definition_level_encoding; - repetition_level_encoding = other90.repetition_level_encoding; - statistics = other90.statistics; - __isset = other90.__isset; -} -DataPageHeader::DataPageHeader(DataPageHeader&& other91) noexcept { - num_values = other91.num_values; - encoding = other91.encoding; - definition_level_encoding = other91.definition_level_encoding; - repetition_level_encoding = other91.repetition_level_encoding; - statistics = std::move(other91.statistics); - __isset = other91.__isset; -} -DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other92) { - num_values = other92.num_values; - encoding = other92.encoding; - definition_level_encoding = other92.definition_level_encoding; - repetition_level_encoding = other92.repetition_level_encoding; - statistics = other92.statistics; - __isset = other92.__isset; +DataPageHeader::DataPageHeader(const DataPageHeader& other106) { + num_values = other106.num_values; + encoding = other106.encoding; + definition_level_encoding = other106.definition_level_encoding; + repetition_level_encoding = other106.repetition_level_encoding; + statistics = other106.statistics; + __isset = other106.__isset; +} +DataPageHeader::DataPageHeader(DataPageHeader&& other107) noexcept { + num_values = other107.num_values; + encoding = other107.encoding; + definition_level_encoding = other107.definition_level_encoding; + repetition_level_encoding = other107.repetition_level_encoding; + statistics = std::move(other107.statistics); + __isset = other107.__isset; +} +DataPageHeader& 
DataPageHeader::operator=(const DataPageHeader& other108) { + num_values = other108.num_values; + encoding = other108.encoding; + definition_level_encoding = other108.definition_level_encoding; + repetition_level_encoding = other108.repetition_level_encoding; + statistics = other108.statistics; + __isset = other108.__isset; return *this; } -DataPageHeader& DataPageHeader::operator=(DataPageHeader&& other93) noexcept { - num_values = other93.num_values; - encoding = other93.encoding; - definition_level_encoding = other93.definition_level_encoding; - repetition_level_encoding = other93.repetition_level_encoding; - statistics = std::move(other93.statistics); - __isset = other93.__isset; +DataPageHeader& DataPageHeader::operator=(DataPageHeader&& other109) noexcept { + num_values = other109.num_values; + encoding = other109.encoding; + definition_level_encoding = other109.definition_level_encoding; + repetition_level_encoding = other109.repetition_level_encoding; + statistics = std::move(other109.statistics); + __isset = other109.__isset; return *this; } void DataPageHeader::printTo(std::ostream& out) const { @@ -3464,18 +3703,18 @@ void swap(IndexPageHeader &a, IndexPageHeader &b) { (void) b; } -IndexPageHeader::IndexPageHeader(const IndexPageHeader& other94) noexcept { - (void) other94; +IndexPageHeader::IndexPageHeader(const IndexPageHeader& other110) noexcept { + (void) other110; } -IndexPageHeader::IndexPageHeader(IndexPageHeader&& other95) noexcept { - (void) other95; +IndexPageHeader::IndexPageHeader(IndexPageHeader&& other111) noexcept { + (void) other111; } -IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other96) noexcept { - (void) other96; +IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other112) noexcept { + (void) other112; return *this; } -IndexPageHeader& IndexPageHeader::operator=(IndexPageHeader&& other97) noexcept { - (void) other97; +IndexPageHeader& IndexPageHeader::operator=(IndexPageHeader&& other113) 
noexcept { + (void) other113; return *this; } void IndexPageHeader::printTo(std::ostream& out) const { @@ -3541,9 +3780,9 @@ uint32_t DictionaryPageHeader::read(::apache::thrift::protocol::TProtocol* iprot break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast98; - xfer += iprot->readI32(ecast98); - this->encoding = static_cast(ecast98); + int32_t ecast114; + xfer += iprot->readI32(ecast114); + this->encoding = static_cast(ecast114); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3604,30 +3843,30 @@ void swap(DictionaryPageHeader &a, DictionaryPageHeader &b) { swap(a.__isset, b.__isset); } -DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other99) noexcept { - num_values = other99.num_values; - encoding = other99.encoding; - is_sorted = other99.is_sorted; - __isset = other99.__isset; -} -DictionaryPageHeader::DictionaryPageHeader(DictionaryPageHeader&& other100) noexcept { - num_values = other100.num_values; - encoding = other100.encoding; - is_sorted = other100.is_sorted; - __isset = other100.__isset; +DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other115) noexcept { + num_values = other115.num_values; + encoding = other115.encoding; + is_sorted = other115.is_sorted; + __isset = other115.__isset; } -DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other101) noexcept { - num_values = other101.num_values; - encoding = other101.encoding; - is_sorted = other101.is_sorted; - __isset = other101.__isset; +DictionaryPageHeader::DictionaryPageHeader(DictionaryPageHeader&& other116) noexcept { + num_values = other116.num_values; + encoding = other116.encoding; + is_sorted = other116.is_sorted; + __isset = other116.__isset; +} +DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other117) noexcept { + num_values = other117.num_values; + encoding = other117.encoding; + is_sorted = other117.is_sorted; + __isset = 
other117.__isset; return *this; } -DictionaryPageHeader& DictionaryPageHeader::operator=(DictionaryPageHeader&& other102) noexcept { - num_values = other102.num_values; - encoding = other102.encoding; - is_sorted = other102.is_sorted; - __isset = other102.__isset; +DictionaryPageHeader& DictionaryPageHeader::operator=(DictionaryPageHeader&& other118) noexcept { + num_values = other118.num_values; + encoding = other118.encoding; + is_sorted = other118.is_sorted; + __isset = other118.__isset; return *this; } void DictionaryPageHeader::printTo(std::ostream& out) const { @@ -3737,9 +3976,9 @@ uint32_t DataPageHeaderV2::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast103; - xfer += iprot->readI32(ecast103); - this->encoding = static_cast(ecast103); + int32_t ecast119; + xfer += iprot->readI32(ecast119); + this->encoding = static_cast(ecast119); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3858,50 +4097,50 @@ void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b) { swap(a.__isset, b.__isset); } -DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other104) { - num_values = other104.num_values; - num_nulls = other104.num_nulls; - num_rows = other104.num_rows; - encoding = other104.encoding; - definition_levels_byte_length = other104.definition_levels_byte_length; - repetition_levels_byte_length = other104.repetition_levels_byte_length; - is_compressed = other104.is_compressed; - statistics = other104.statistics; - __isset = other104.__isset; -} -DataPageHeaderV2::DataPageHeaderV2(DataPageHeaderV2&& other105) noexcept { - num_values = other105.num_values; - num_nulls = other105.num_nulls; - num_rows = other105.num_rows; - encoding = other105.encoding; - definition_levels_byte_length = other105.definition_levels_byte_length; - repetition_levels_byte_length = other105.repetition_levels_byte_length; - is_compressed = other105.is_compressed; - statistics = 
std::move(other105.statistics); - __isset = other105.__isset; -} -DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other106) { - num_values = other106.num_values; - num_nulls = other106.num_nulls; - num_rows = other106.num_rows; - encoding = other106.encoding; - definition_levels_byte_length = other106.definition_levels_byte_length; - repetition_levels_byte_length = other106.repetition_levels_byte_length; - is_compressed = other106.is_compressed; - statistics = other106.statistics; - __isset = other106.__isset; +DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other120) { + num_values = other120.num_values; + num_nulls = other120.num_nulls; + num_rows = other120.num_rows; + encoding = other120.encoding; + definition_levels_byte_length = other120.definition_levels_byte_length; + repetition_levels_byte_length = other120.repetition_levels_byte_length; + is_compressed = other120.is_compressed; + statistics = other120.statistics; + __isset = other120.__isset; +} +DataPageHeaderV2::DataPageHeaderV2(DataPageHeaderV2&& other121) noexcept { + num_values = other121.num_values; + num_nulls = other121.num_nulls; + num_rows = other121.num_rows; + encoding = other121.encoding; + definition_levels_byte_length = other121.definition_levels_byte_length; + repetition_levels_byte_length = other121.repetition_levels_byte_length; + is_compressed = other121.is_compressed; + statistics = std::move(other121.statistics); + __isset = other121.__isset; +} +DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other122) { + num_values = other122.num_values; + num_nulls = other122.num_nulls; + num_rows = other122.num_rows; + encoding = other122.encoding; + definition_levels_byte_length = other122.definition_levels_byte_length; + repetition_levels_byte_length = other122.repetition_levels_byte_length; + is_compressed = other122.is_compressed; + statistics = other122.statistics; + __isset = other122.__isset; return *this; } -DataPageHeaderV2& 
DataPageHeaderV2::operator=(DataPageHeaderV2&& other107) noexcept { - num_values = other107.num_values; - num_nulls = other107.num_nulls; - num_rows = other107.num_rows; - encoding = other107.encoding; - definition_levels_byte_length = other107.definition_levels_byte_length; - repetition_levels_byte_length = other107.repetition_levels_byte_length; - is_compressed = other107.is_compressed; - statistics = std::move(other107.statistics); - __isset = other107.__isset; +DataPageHeaderV2& DataPageHeaderV2::operator=(DataPageHeaderV2&& other123) noexcept { + num_values = other123.num_values; + num_nulls = other123.num_nulls; + num_rows = other123.num_rows; + encoding = other123.encoding; + definition_levels_byte_length = other123.definition_levels_byte_length; + repetition_levels_byte_length = other123.repetition_levels_byte_length; + is_compressed = other123.is_compressed; + statistics = std::move(other123.statistics); + __isset = other123.__isset; return *this; } void DataPageHeaderV2::printTo(std::ostream& out) const { @@ -3973,18 +4212,18 @@ void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b) { (void) b; } -SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other108) noexcept { - (void) other108; +SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other124) noexcept { + (void) other124; } -SplitBlockAlgorithm::SplitBlockAlgorithm(SplitBlockAlgorithm&& other109) noexcept { - (void) other109; +SplitBlockAlgorithm::SplitBlockAlgorithm(SplitBlockAlgorithm&& other125) noexcept { + (void) other125; } -SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other110) noexcept { - (void) other110; +SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other126) noexcept { + (void) other126; return *this; } -SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(SplitBlockAlgorithm&& other111) noexcept { - (void) other111; +SplitBlockAlgorithm& 
SplitBlockAlgorithm::operator=(SplitBlockAlgorithm&& other127) noexcept { + (void) other127; return *this; } void SplitBlockAlgorithm::printTo(std::ostream& out) const { @@ -4071,22 +4310,22 @@ void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b) { swap(a.__isset, b.__isset); } -BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other112) noexcept { - BLOCK = other112.BLOCK; - __isset = other112.__isset; +BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other128) noexcept { + BLOCK = other128.BLOCK; + __isset = other128.__isset; } -BloomFilterAlgorithm::BloomFilterAlgorithm(BloomFilterAlgorithm&& other113) noexcept { - BLOCK = std::move(other113.BLOCK); - __isset = other113.__isset; +BloomFilterAlgorithm::BloomFilterAlgorithm(BloomFilterAlgorithm&& other129) noexcept { + BLOCK = std::move(other129.BLOCK); + __isset = other129.__isset; } -BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other114) noexcept { - BLOCK = other114.BLOCK; - __isset = other114.__isset; +BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other130) noexcept { + BLOCK = other130.BLOCK; + __isset = other130.__isset; return *this; } -BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(BloomFilterAlgorithm&& other115) noexcept { - BLOCK = std::move(other115.BLOCK); - __isset = other115.__isset; +BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(BloomFilterAlgorithm&& other131) noexcept { + BLOCK = std::move(other131.BLOCK); + __isset = other131.__isset; return *this; } void BloomFilterAlgorithm::printTo(std::ostream& out) const { @@ -4151,18 +4390,18 @@ void swap(XxHash &a, XxHash &b) { (void) b; } -XxHash::XxHash(const XxHash& other116) noexcept { - (void) other116; +XxHash::XxHash(const XxHash& other132) noexcept { + (void) other132; } -XxHash::XxHash(XxHash&& other117) noexcept { - (void) other117; +XxHash::XxHash(XxHash&& other133) noexcept { + (void) other133; } -XxHash& 
XxHash::operator=(const XxHash& other118) noexcept { - (void) other118; +XxHash& XxHash::operator=(const XxHash& other134) noexcept { + (void) other134; return *this; } -XxHash& XxHash::operator=(XxHash&& other119) noexcept { - (void) other119; +XxHash& XxHash::operator=(XxHash&& other135) noexcept { + (void) other135; return *this; } void XxHash::printTo(std::ostream& out) const { @@ -4249,22 +4488,22 @@ void swap(BloomFilterHash &a, BloomFilterHash &b) { swap(a.__isset, b.__isset); } -BloomFilterHash::BloomFilterHash(const BloomFilterHash& other120) noexcept { - XXHASH = other120.XXHASH; - __isset = other120.__isset; +BloomFilterHash::BloomFilterHash(const BloomFilterHash& other136) noexcept { + XXHASH = other136.XXHASH; + __isset = other136.__isset; } -BloomFilterHash::BloomFilterHash(BloomFilterHash&& other121) noexcept { - XXHASH = std::move(other121.XXHASH); - __isset = other121.__isset; +BloomFilterHash::BloomFilterHash(BloomFilterHash&& other137) noexcept { + XXHASH = std::move(other137.XXHASH); + __isset = other137.__isset; } -BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other122) noexcept { - XXHASH = other122.XXHASH; - __isset = other122.__isset; +BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other138) noexcept { + XXHASH = other138.XXHASH; + __isset = other138.__isset; return *this; } -BloomFilterHash& BloomFilterHash::operator=(BloomFilterHash&& other123) noexcept { - XXHASH = std::move(other123.XXHASH); - __isset = other123.__isset; +BloomFilterHash& BloomFilterHash::operator=(BloomFilterHash&& other139) noexcept { + XXHASH = std::move(other139.XXHASH); + __isset = other139.__isset; return *this; } void BloomFilterHash::printTo(std::ostream& out) const { @@ -4329,18 +4568,18 @@ void swap(Uncompressed &a, Uncompressed &b) { (void) b; } -Uncompressed::Uncompressed(const Uncompressed& other124) noexcept { - (void) other124; +Uncompressed::Uncompressed(const Uncompressed& other140) noexcept { + (void) other140; } 
-Uncompressed::Uncompressed(Uncompressed&& other125) noexcept { - (void) other125; +Uncompressed::Uncompressed(Uncompressed&& other141) noexcept { + (void) other141; } -Uncompressed& Uncompressed::operator=(const Uncompressed& other126) noexcept { - (void) other126; +Uncompressed& Uncompressed::operator=(const Uncompressed& other142) noexcept { + (void) other142; return *this; } -Uncompressed& Uncompressed::operator=(Uncompressed&& other127) noexcept { - (void) other127; +Uncompressed& Uncompressed::operator=(Uncompressed&& other143) noexcept { + (void) other143; return *this; } void Uncompressed::printTo(std::ostream& out) const { @@ -4427,22 +4666,22 @@ void swap(BloomFilterCompression &a, BloomFilterCompression &b) { swap(a.__isset, b.__isset); } -BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other128) noexcept { - UNCOMPRESSED = other128.UNCOMPRESSED; - __isset = other128.__isset; +BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other144) noexcept { + UNCOMPRESSED = other144.UNCOMPRESSED; + __isset = other144.__isset; } -BloomFilterCompression::BloomFilterCompression(BloomFilterCompression&& other129) noexcept { - UNCOMPRESSED = std::move(other129.UNCOMPRESSED); - __isset = other129.__isset; +BloomFilterCompression::BloomFilterCompression(BloomFilterCompression&& other145) noexcept { + UNCOMPRESSED = std::move(other145.UNCOMPRESSED); + __isset = other145.__isset; } -BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other130) noexcept { - UNCOMPRESSED = other130.UNCOMPRESSED; - __isset = other130.__isset; +BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other146) noexcept { + UNCOMPRESSED = other146.UNCOMPRESSED; + __isset = other146.__isset; return *this; } -BloomFilterCompression& BloomFilterCompression::operator=(BloomFilterCompression&& other131) noexcept { - UNCOMPRESSED = std::move(other131.UNCOMPRESSED); - __isset = 
other131.__isset; +BloomFilterCompression& BloomFilterCompression::operator=(BloomFilterCompression&& other147) noexcept { + UNCOMPRESSED = std::move(other147.UNCOMPRESSED); + __isset = other147.__isset; return *this; } void BloomFilterCompression::printTo(std::ostream& out) const { @@ -4590,30 +4829,30 @@ void swap(BloomFilterHeader &a, BloomFilterHeader &b) { swap(a.compression, b.compression); } -BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other132) noexcept { - numBytes = other132.numBytes; - algorithm = other132.algorithm; - hash = other132.hash; - compression = other132.compression; +BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other148) noexcept { + numBytes = other148.numBytes; + algorithm = other148.algorithm; + hash = other148.hash; + compression = other148.compression; } -BloomFilterHeader::BloomFilterHeader(BloomFilterHeader&& other133) noexcept { - numBytes = other133.numBytes; - algorithm = std::move(other133.algorithm); - hash = std::move(other133.hash); - compression = std::move(other133.compression); +BloomFilterHeader::BloomFilterHeader(BloomFilterHeader&& other149) noexcept { + numBytes = other149.numBytes; + algorithm = std::move(other149.algorithm); + hash = std::move(other149.hash); + compression = std::move(other149.compression); } -BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other134) noexcept { - numBytes = other134.numBytes; - algorithm = other134.algorithm; - hash = other134.hash; - compression = other134.compression; +BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other150) noexcept { + numBytes = other150.numBytes; + algorithm = other150.algorithm; + hash = other150.hash; + compression = other150.compression; return *this; } -BloomFilterHeader& BloomFilterHeader::operator=(BloomFilterHeader&& other135) noexcept { - numBytes = other135.numBytes; - algorithm = std::move(other135.algorithm); - hash = std::move(other135.hash); - compression = 
std::move(other135.compression); +BloomFilterHeader& BloomFilterHeader::operator=(BloomFilterHeader&& other151) noexcept { + numBytes = other151.numBytes; + algorithm = std::move(other151.algorithm); + hash = std::move(other151.hash); + compression = std::move(other151.compression); return *this; } void BloomFilterHeader::printTo(std::ostream& out) const { @@ -4700,9 +4939,9 @@ uint32_t PageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast136; - xfer += iprot->readI32(ecast136); - this->type = static_cast(ecast136); + int32_t ecast152; + xfer += iprot->readI32(ecast152); + this->type = static_cast(ecast152); isset_type = true; } else { xfer += iprot->skip(ftype); @@ -4842,50 +5081,50 @@ void swap(PageHeader &a, PageHeader &b) { swap(a.__isset, b.__isset); } -PageHeader::PageHeader(const PageHeader& other137) { - type = other137.type; - uncompressed_page_size = other137.uncompressed_page_size; - compressed_page_size = other137.compressed_page_size; - crc = other137.crc; - data_page_header = other137.data_page_header; - index_page_header = other137.index_page_header; - dictionary_page_header = other137.dictionary_page_header; - data_page_header_v2 = other137.data_page_header_v2; - __isset = other137.__isset; -} -PageHeader::PageHeader(PageHeader&& other138) noexcept { - type = other138.type; - uncompressed_page_size = other138.uncompressed_page_size; - compressed_page_size = other138.compressed_page_size; - crc = other138.crc; - data_page_header = std::move(other138.data_page_header); - index_page_header = std::move(other138.index_page_header); - dictionary_page_header = std::move(other138.dictionary_page_header); - data_page_header_v2 = std::move(other138.data_page_header_v2); - __isset = other138.__isset; -} -PageHeader& PageHeader::operator=(const PageHeader& other139) { - type = other139.type; - uncompressed_page_size = other139.uncompressed_page_size; - compressed_page_size = 
other139.compressed_page_size; - crc = other139.crc; - data_page_header = other139.data_page_header; - index_page_header = other139.index_page_header; - dictionary_page_header = other139.dictionary_page_header; - data_page_header_v2 = other139.data_page_header_v2; - __isset = other139.__isset; +PageHeader::PageHeader(const PageHeader& other153) { + type = other153.type; + uncompressed_page_size = other153.uncompressed_page_size; + compressed_page_size = other153.compressed_page_size; + crc = other153.crc; + data_page_header = other153.data_page_header; + index_page_header = other153.index_page_header; + dictionary_page_header = other153.dictionary_page_header; + data_page_header_v2 = other153.data_page_header_v2; + __isset = other153.__isset; +} +PageHeader::PageHeader(PageHeader&& other154) noexcept { + type = other154.type; + uncompressed_page_size = other154.uncompressed_page_size; + compressed_page_size = other154.compressed_page_size; + crc = other154.crc; + data_page_header = std::move(other154.data_page_header); + index_page_header = std::move(other154.index_page_header); + dictionary_page_header = std::move(other154.dictionary_page_header); + data_page_header_v2 = std::move(other154.data_page_header_v2); + __isset = other154.__isset; +} +PageHeader& PageHeader::operator=(const PageHeader& other155) { + type = other155.type; + uncompressed_page_size = other155.uncompressed_page_size; + compressed_page_size = other155.compressed_page_size; + crc = other155.crc; + data_page_header = other155.data_page_header; + index_page_header = other155.index_page_header; + dictionary_page_header = other155.dictionary_page_header; + data_page_header_v2 = other155.data_page_header_v2; + __isset = other155.__isset; return *this; } -PageHeader& PageHeader::operator=(PageHeader&& other140) noexcept { - type = other140.type; - uncompressed_page_size = other140.uncompressed_page_size; - compressed_page_size = other140.compressed_page_size; - crc = other140.crc; - data_page_header 
= std::move(other140.data_page_header); - index_page_header = std::move(other140.index_page_header); - dictionary_page_header = std::move(other140.dictionary_page_header); - data_page_header_v2 = std::move(other140.data_page_header_v2); - __isset = other140.__isset; +PageHeader& PageHeader::operator=(PageHeader&& other156) noexcept { + type = other156.type; + uncompressed_page_size = other156.uncompressed_page_size; + compressed_page_size = other156.compressed_page_size; + crc = other156.crc; + data_page_header = std::move(other156.data_page_header); + index_page_header = std::move(other156.index_page_header); + dictionary_page_header = std::move(other156.dictionary_page_header); + data_page_header_v2 = std::move(other156.data_page_header_v2); + __isset = other156.__isset; return *this; } void PageHeader::printTo(std::ostream& out) const { @@ -5000,26 +5239,26 @@ void swap(KeyValue &a, KeyValue &b) { swap(a.__isset, b.__isset); } -KeyValue::KeyValue(const KeyValue& other141) { - key = other141.key; - value = other141.value; - __isset = other141.__isset; +KeyValue::KeyValue(const KeyValue& other157) { + key = other157.key; + value = other157.value; + __isset = other157.__isset; } -KeyValue::KeyValue(KeyValue&& other142) noexcept { - key = std::move(other142.key); - value = std::move(other142.value); - __isset = other142.__isset; +KeyValue::KeyValue(KeyValue&& other158) noexcept { + key = std::move(other158.key); + value = std::move(other158.value); + __isset = other158.__isset; } -KeyValue& KeyValue::operator=(const KeyValue& other143) { - key = other143.key; - value = other143.value; - __isset = other143.__isset; +KeyValue& KeyValue::operator=(const KeyValue& other159) { + key = other159.key; + value = other159.value; + __isset = other159.__isset; return *this; } -KeyValue& KeyValue::operator=(KeyValue&& other144) noexcept { - key = std::move(other144.key); - value = std::move(other144.value); - __isset = other144.__isset; +KeyValue& KeyValue::operator=(KeyValue&& 
other160) noexcept { + key = std::move(other160.key); + value = std::move(other160.value); + __isset = other160.__isset; return *this; } void KeyValue::printTo(std::ostream& out) const { @@ -5148,26 +5387,26 @@ void swap(SortingColumn &a, SortingColumn &b) { swap(a.nulls_first, b.nulls_first); } -SortingColumn::SortingColumn(const SortingColumn& other145) noexcept { - column_idx = other145.column_idx; - descending = other145.descending; - nulls_first = other145.nulls_first; +SortingColumn::SortingColumn(const SortingColumn& other161) noexcept { + column_idx = other161.column_idx; + descending = other161.descending; + nulls_first = other161.nulls_first; } -SortingColumn::SortingColumn(SortingColumn&& other146) noexcept { - column_idx = other146.column_idx; - descending = other146.descending; - nulls_first = other146.nulls_first; +SortingColumn::SortingColumn(SortingColumn&& other162) noexcept { + column_idx = other162.column_idx; + descending = other162.descending; + nulls_first = other162.nulls_first; } -SortingColumn& SortingColumn::operator=(const SortingColumn& other147) noexcept { - column_idx = other147.column_idx; - descending = other147.descending; - nulls_first = other147.nulls_first; +SortingColumn& SortingColumn::operator=(const SortingColumn& other163) noexcept { + column_idx = other163.column_idx; + descending = other163.descending; + nulls_first = other163.nulls_first; return *this; } -SortingColumn& SortingColumn::operator=(SortingColumn&& other148) noexcept { - column_idx = other148.column_idx; - descending = other148.descending; - nulls_first = other148.nulls_first; +SortingColumn& SortingColumn::operator=(SortingColumn&& other164) noexcept { + column_idx = other164.column_idx; + descending = other164.descending; + nulls_first = other164.nulls_first; return *this; } void SortingColumn::printTo(std::ostream& out) const { @@ -5228,9 +5467,9 @@ uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == 
::apache::thrift::protocol::T_I32) { - int32_t ecast149; - xfer += iprot->readI32(ecast149); - this->page_type = static_cast(ecast149); + int32_t ecast165; + xfer += iprot->readI32(ecast165); + this->page_type = static_cast(ecast165); isset_page_type = true; } else { xfer += iprot->skip(ftype); @@ -5238,9 +5477,9 @@ uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast150; - xfer += iprot->readI32(ecast150); - this->encoding = static_cast(ecast150); + int32_t ecast166; + xfer += iprot->readI32(ecast166); + this->encoding = static_cast(ecast166); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -5301,26 +5540,26 @@ void swap(PageEncodingStats &a, PageEncodingStats &b) { swap(a.count, b.count); } -PageEncodingStats::PageEncodingStats(const PageEncodingStats& other151) noexcept { - page_type = other151.page_type; - encoding = other151.encoding; - count = other151.count; +PageEncodingStats::PageEncodingStats(const PageEncodingStats& other167) noexcept { + page_type = other167.page_type; + encoding = other167.encoding; + count = other167.count; } -PageEncodingStats::PageEncodingStats(PageEncodingStats&& other152) noexcept { - page_type = other152.page_type; - encoding = other152.encoding; - count = other152.count; +PageEncodingStats::PageEncodingStats(PageEncodingStats&& other168) noexcept { + page_type = other168.page_type; + encoding = other168.encoding; + count = other168.count; } -PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other153) noexcept { - page_type = other153.page_type; - encoding = other153.encoding; - count = other153.count; +PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other169) noexcept { + page_type = other169.page_type; + encoding = other169.encoding; + count = other169.count; return *this; } -PageEncodingStats& PageEncodingStats::operator=(PageEncodingStats&& other154) 
noexcept { - page_type = other154.page_type; - encoding = other154.encoding; - count = other154.count; +PageEncodingStats& PageEncodingStats::operator=(PageEncodingStats&& other170) noexcept { + page_type = other170.page_type; + encoding = other170.encoding; + count = other170.count; return *this; } void PageEncodingStats::printTo(std::ostream& out) const { @@ -5398,6 +5637,16 @@ void ColumnMetaData::__set_bloom_filter_offset(const int64_t val) { this->bloom_filter_offset = val; __isset.bloom_filter_offset = true; } + +void ColumnMetaData::__set_bloom_filter_length(const int32_t val) { + this->bloom_filter_length = val; +__isset.bloom_filter_length = true; +} + +void ColumnMetaData::__set_size_statistics(const SizeStatistics& val) { + this->size_statistics = val; +__isset.size_statistics = true; +} std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj) { obj.printTo(out); @@ -5436,9 +5685,9 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast155; - xfer += iprot->readI32(ecast155); - this->type = static_cast(ecast155); + int32_t ecast171; + xfer += iprot->readI32(ecast171); + this->type = static_cast(ecast171); isset_type = true; } else { xfer += iprot->skip(ftype); @@ -5448,16 +5697,16 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->encodings.clear(); - uint32_t _size156; - ::apache::thrift::protocol::TType _etype159; - xfer += iprot->readListBegin(_etype159, _size156); - this->encodings.resize(_size156); - uint32_t _i160; - for (_i160 = 0; _i160 < _size156; ++_i160) + uint32_t _size172; + ::apache::thrift::protocol::TType _etype175; + xfer += iprot->readListBegin(_etype175, _size172); + this->encodings.resize(_size172); + uint32_t _i176; + for (_i176 = 0; _i176 < _size172; ++_i176) { - int32_t ecast161; - xfer += iprot->readI32(ecast161); - 
this->encodings[_i160] = static_cast(ecast161); + int32_t ecast177; + xfer += iprot->readI32(ecast177); + this->encodings[_i176] = static_cast(ecast177); } xfer += iprot->readListEnd(); } @@ -5470,14 +5719,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->path_in_schema.clear(); - uint32_t _size162; - ::apache::thrift::protocol::TType _etype165; - xfer += iprot->readListBegin(_etype165, _size162); - this->path_in_schema.resize(_size162); - uint32_t _i166; - for (_i166 = 0; _i166 < _size162; ++_i166) + uint32_t _size178; + ::apache::thrift::protocol::TType _etype181; + xfer += iprot->readListBegin(_etype181, _size178); + this->path_in_schema.resize(_size178); + uint32_t _i182; + for (_i182 = 0; _i182 < _size178; ++_i182) { - xfer += iprot->readString(this->path_in_schema[_i166]); + xfer += iprot->readString(this->path_in_schema[_i182]); } xfer += iprot->readListEnd(); } @@ -5488,9 +5737,9 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast167; - xfer += iprot->readI32(ecast167); - this->codec = static_cast(ecast167); + int32_t ecast183; + xfer += iprot->readI32(ecast183); + this->codec = static_cast(ecast183); isset_codec = true; } else { xfer += iprot->skip(ftype); @@ -5524,14 +5773,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->key_value_metadata.clear(); - uint32_t _size168; - ::apache::thrift::protocol::TType _etype171; - xfer += iprot->readListBegin(_etype171, _size168); - this->key_value_metadata.resize(_size168); - uint32_t _i172; - for (_i172 = 0; _i172 < _size168; ++_i172) + uint32_t _size184; + ::apache::thrift::protocol::TType _etype187; + xfer += iprot->readListBegin(_etype187, _size184); + this->key_value_metadata.resize(_size184); + uint32_t _i188; + for 
(_i188 = 0; _i188 < _size184; ++_i188) { - xfer += this->key_value_metadata[_i172].read(iprot); + xfer += this->key_value_metadata[_i188].read(iprot); } xfer += iprot->readListEnd(); } @@ -5576,14 +5825,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->encoding_stats.clear(); - uint32_t _size173; - ::apache::thrift::protocol::TType _etype176; - xfer += iprot->readListBegin(_etype176, _size173); - this->encoding_stats.resize(_size173); - uint32_t _i177; - for (_i177 = 0; _i177 < _size173; ++_i177) + uint32_t _size189; + ::apache::thrift::protocol::TType _etype192; + xfer += iprot->readListBegin(_etype192, _size189); + this->encoding_stats.resize(_size189); + uint32_t _i193; + for (_i193 = 0; _i193 < _size189; ++_i193) { - xfer += this->encoding_stats[_i177].read(iprot); + xfer += this->encoding_stats[_i193].read(iprot); } xfer += iprot->readListEnd(); } @@ -5600,6 +5849,22 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 15: + if (ftype == ::apache::thrift::protocol::T_I32) { + xfer += iprot->readI32(this->bloom_filter_length); + this->__isset.bloom_filter_length = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 16: + if (ftype == ::apache::thrift::protocol::T_STRUCT) { + xfer += this->size_statistics.read(iprot); + this->__isset.size_statistics = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -5640,10 +5905,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("encodings", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I32, static_cast(this->encodings.size())); - std::vector ::const_iterator _iter178; - for (_iter178 = this->encodings.begin(); _iter178 != this->encodings.end(); ++_iter178) + std::vector 
::const_iterator _iter194; + for (_iter194 = this->encodings.begin(); _iter194 != this->encodings.end(); ++_iter194) { - xfer += oprot->writeI32(static_cast((*_iter178))); + xfer += oprot->writeI32(static_cast((*_iter194))); } xfer += oprot->writeListEnd(); } @@ -5652,10 +5917,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 3); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->path_in_schema.size())); - std::vector ::const_iterator _iter179; - for (_iter179 = this->path_in_schema.begin(); _iter179 != this->path_in_schema.end(); ++_iter179) + std::vector ::const_iterator _iter195; + for (_iter195 = this->path_in_schema.begin(); _iter195 != this->path_in_schema.end(); ++_iter195) { - xfer += oprot->writeString((*_iter179)); + xfer += oprot->writeString((*_iter195)); } xfer += oprot->writeListEnd(); } @@ -5681,10 +5946,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 8); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->key_value_metadata.size())); - std::vector ::const_iterator _iter180; - for (_iter180 = this->key_value_metadata.begin(); _iter180 != this->key_value_metadata.end(); ++_iter180) + std::vector ::const_iterator _iter196; + for (_iter196 = this->key_value_metadata.begin(); _iter196 != this->key_value_metadata.end(); ++_iter196) { - xfer += (*_iter180).write(oprot); + xfer += (*_iter196).write(oprot); } xfer += oprot->writeListEnd(); } @@ -5713,10 +5978,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("encoding_stats", ::apache::thrift::protocol::T_LIST, 13); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, 
static_cast(this->encoding_stats.size())); - std::vector ::const_iterator _iter181; - for (_iter181 = this->encoding_stats.begin(); _iter181 != this->encoding_stats.end(); ++_iter181) + std::vector ::const_iterator _iter197; + for (_iter197 = this->encoding_stats.begin(); _iter197 != this->encoding_stats.end(); ++_iter197) { - xfer += (*_iter181).write(oprot); + xfer += (*_iter197).write(oprot); } xfer += oprot->writeListEnd(); } @@ -5727,6 +5992,16 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeI64(this->bloom_filter_offset); xfer += oprot->writeFieldEnd(); } + if (this->__isset.bloom_filter_length) { + xfer += oprot->writeFieldBegin("bloom_filter_length", ::apache::thrift::protocol::T_I32, 15); + xfer += oprot->writeI32(this->bloom_filter_length); + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.size_statistics) { + xfer += oprot->writeFieldBegin("size_statistics", ::apache::thrift::protocol::T_STRUCT, 16); + xfer += this->size_statistics.write(oprot); + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -5748,77 +6023,87 @@ void swap(ColumnMetaData &a, ColumnMetaData &b) { swap(a.statistics, b.statistics); swap(a.encoding_stats, b.encoding_stats); swap(a.bloom_filter_offset, b.bloom_filter_offset); + swap(a.bloom_filter_length, b.bloom_filter_length); + swap(a.size_statistics, b.size_statistics); swap(a.__isset, b.__isset); } -ColumnMetaData::ColumnMetaData(const ColumnMetaData& other182) { - type = other182.type; - encodings = other182.encodings; - path_in_schema = other182.path_in_schema; - codec = other182.codec; - num_values = other182.num_values; - total_uncompressed_size = other182.total_uncompressed_size; - total_compressed_size = other182.total_compressed_size; - key_value_metadata = other182.key_value_metadata; - data_page_offset = other182.data_page_offset; - index_page_offset = other182.index_page_offset; - 
dictionary_page_offset = other182.dictionary_page_offset; - statistics = other182.statistics; - encoding_stats = other182.encoding_stats; - bloom_filter_offset = other182.bloom_filter_offset; - __isset = other182.__isset; -} -ColumnMetaData::ColumnMetaData(ColumnMetaData&& other183) noexcept { - type = other183.type; - encodings = std::move(other183.encodings); - path_in_schema = std::move(other183.path_in_schema); - codec = other183.codec; - num_values = other183.num_values; - total_uncompressed_size = other183.total_uncompressed_size; - total_compressed_size = other183.total_compressed_size; - key_value_metadata = std::move(other183.key_value_metadata); - data_page_offset = other183.data_page_offset; - index_page_offset = other183.index_page_offset; - dictionary_page_offset = other183.dictionary_page_offset; - statistics = std::move(other183.statistics); - encoding_stats = std::move(other183.encoding_stats); - bloom_filter_offset = other183.bloom_filter_offset; - __isset = other183.__isset; -} -ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other184) { - type = other184.type; - encodings = other184.encodings; - path_in_schema = other184.path_in_schema; - codec = other184.codec; - num_values = other184.num_values; - total_uncompressed_size = other184.total_uncompressed_size; - total_compressed_size = other184.total_compressed_size; - key_value_metadata = other184.key_value_metadata; - data_page_offset = other184.data_page_offset; - index_page_offset = other184.index_page_offset; - dictionary_page_offset = other184.dictionary_page_offset; - statistics = other184.statistics; - encoding_stats = other184.encoding_stats; - bloom_filter_offset = other184.bloom_filter_offset; - __isset = other184.__isset; +ColumnMetaData::ColumnMetaData(const ColumnMetaData& other198) { + type = other198.type; + encodings = other198.encodings; + path_in_schema = other198.path_in_schema; + codec = other198.codec; + num_values = other198.num_values; + 
total_uncompressed_size = other198.total_uncompressed_size; + total_compressed_size = other198.total_compressed_size; + key_value_metadata = other198.key_value_metadata; + data_page_offset = other198.data_page_offset; + index_page_offset = other198.index_page_offset; + dictionary_page_offset = other198.dictionary_page_offset; + statistics = other198.statistics; + encoding_stats = other198.encoding_stats; + bloom_filter_offset = other198.bloom_filter_offset; + bloom_filter_length = other198.bloom_filter_length; + size_statistics = other198.size_statistics; + __isset = other198.__isset; +} +ColumnMetaData::ColumnMetaData(ColumnMetaData&& other199) noexcept { + type = other199.type; + encodings = std::move(other199.encodings); + path_in_schema = std::move(other199.path_in_schema); + codec = other199.codec; + num_values = other199.num_values; + total_uncompressed_size = other199.total_uncompressed_size; + total_compressed_size = other199.total_compressed_size; + key_value_metadata = std::move(other199.key_value_metadata); + data_page_offset = other199.data_page_offset; + index_page_offset = other199.index_page_offset; + dictionary_page_offset = other199.dictionary_page_offset; + statistics = std::move(other199.statistics); + encoding_stats = std::move(other199.encoding_stats); + bloom_filter_offset = other199.bloom_filter_offset; + bloom_filter_length = other199.bloom_filter_length; + size_statistics = std::move(other199.size_statistics); + __isset = other199.__isset; +} +ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other200) { + type = other200.type; + encodings = other200.encodings; + path_in_schema = other200.path_in_schema; + codec = other200.codec; + num_values = other200.num_values; + total_uncompressed_size = other200.total_uncompressed_size; + total_compressed_size = other200.total_compressed_size; + key_value_metadata = other200.key_value_metadata; + data_page_offset = other200.data_page_offset; + index_page_offset = 
other200.index_page_offset; + dictionary_page_offset = other200.dictionary_page_offset; + statistics = other200.statistics; + encoding_stats = other200.encoding_stats; + bloom_filter_offset = other200.bloom_filter_offset; + bloom_filter_length = other200.bloom_filter_length; + size_statistics = other200.size_statistics; + __isset = other200.__isset; return *this; } -ColumnMetaData& ColumnMetaData::operator=(ColumnMetaData&& other185) noexcept { - type = other185.type; - encodings = std::move(other185.encodings); - path_in_schema = std::move(other185.path_in_schema); - codec = other185.codec; - num_values = other185.num_values; - total_uncompressed_size = other185.total_uncompressed_size; - total_compressed_size = other185.total_compressed_size; - key_value_metadata = std::move(other185.key_value_metadata); - data_page_offset = other185.data_page_offset; - index_page_offset = other185.index_page_offset; - dictionary_page_offset = other185.dictionary_page_offset; - statistics = std::move(other185.statistics); - encoding_stats = std::move(other185.encoding_stats); - bloom_filter_offset = other185.bloom_filter_offset; - __isset = other185.__isset; +ColumnMetaData& ColumnMetaData::operator=(ColumnMetaData&& other201) noexcept { + type = other201.type; + encodings = std::move(other201.encodings); + path_in_schema = std::move(other201.path_in_schema); + codec = other201.codec; + num_values = other201.num_values; + total_uncompressed_size = other201.total_uncompressed_size; + total_compressed_size = other201.total_compressed_size; + key_value_metadata = std::move(other201.key_value_metadata); + data_page_offset = other201.data_page_offset; + index_page_offset = other201.index_page_offset; + dictionary_page_offset = other201.dictionary_page_offset; + statistics = std::move(other201.statistics); + encoding_stats = std::move(other201.encoding_stats); + bloom_filter_offset = other201.bloom_filter_offset; + bloom_filter_length = other201.bloom_filter_length; + size_statistics = 
std::move(other201.size_statistics); + __isset = other201.__isset; return *this; } void ColumnMetaData::printTo(std::ostream& out) const { @@ -5838,6 +6123,8 @@ void ColumnMetaData::printTo(std::ostream& out) const { out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "")); out << ", " << "encoding_stats="; (__isset.encoding_stats ? (out << to_string(encoding_stats)) : (out << "")); out << ", " << "bloom_filter_offset="; (__isset.bloom_filter_offset ? (out << to_string(bloom_filter_offset)) : (out << "")); + out << ", " << "bloom_filter_length="; (__isset.bloom_filter_length ? (out << to_string(bloom_filter_length)) : (out << "")); + out << ", " << "size_statistics="; (__isset.size_statistics ? (out << to_string(size_statistics)) : (out << "")); out << ")"; } @@ -5896,18 +6183,18 @@ void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b) { (void) b; } -EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other186) noexcept { - (void) other186; +EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other202) noexcept { + (void) other202; } -EncryptionWithFooterKey::EncryptionWithFooterKey(EncryptionWithFooterKey&& other187) noexcept { - (void) other187; +EncryptionWithFooterKey::EncryptionWithFooterKey(EncryptionWithFooterKey&& other203) noexcept { + (void) other203; } -EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other188) noexcept { - (void) other188; +EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other204) noexcept { + (void) other204; return *this; } -EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(EncryptionWithFooterKey&& other189) noexcept { - (void) other189; +EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(EncryptionWithFooterKey&& other205) noexcept { + (void) other205; return *this; } void EncryptionWithFooterKey::printTo(std::ostream& out) 
const { @@ -5962,14 +6249,14 @@ uint32_t EncryptionWithColumnKey::read(::apache::thrift::protocol::TProtocol* ip if (ftype == ::apache::thrift::protocol::T_LIST) { { this->path_in_schema.clear(); - uint32_t _size190; - ::apache::thrift::protocol::TType _etype193; - xfer += iprot->readListBegin(_etype193, _size190); - this->path_in_schema.resize(_size190); - uint32_t _i194; - for (_i194 = 0; _i194 < _size190; ++_i194) + uint32_t _size206; + ::apache::thrift::protocol::TType _etype209; + xfer += iprot->readListBegin(_etype209, _size206); + this->path_in_schema.resize(_size206); + uint32_t _i210; + for (_i210 = 0; _i210 < _size206; ++_i210) { - xfer += iprot->readString(this->path_in_schema[_i194]); + xfer += iprot->readString(this->path_in_schema[_i210]); } xfer += iprot->readListEnd(); } @@ -6008,10 +6295,10 @@ uint32_t EncryptionWithColumnKey::write(::apache::thrift::protocol::TProtocol* o xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->path_in_schema.size())); - std::vector ::const_iterator _iter195; - for (_iter195 = this->path_in_schema.begin(); _iter195 != this->path_in_schema.end(); ++_iter195) + std::vector ::const_iterator _iter211; + for (_iter211 = this->path_in_schema.begin(); _iter211 != this->path_in_schema.end(); ++_iter211) { - xfer += oprot->writeString((*_iter195)); + xfer += oprot->writeString((*_iter211)); } xfer += oprot->writeListEnd(); } @@ -6034,26 +6321,26 @@ void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b) { swap(a.__isset, b.__isset); } -EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other196) { - path_in_schema = other196.path_in_schema; - key_metadata = other196.key_metadata; - __isset = other196.__isset; +EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other212) { + path_in_schema = other212.path_in_schema; + key_metadata = 
other212.key_metadata; + __isset = other212.__isset; } -EncryptionWithColumnKey::EncryptionWithColumnKey(EncryptionWithColumnKey&& other197) noexcept { - path_in_schema = std::move(other197.path_in_schema); - key_metadata = std::move(other197.key_metadata); - __isset = other197.__isset; +EncryptionWithColumnKey::EncryptionWithColumnKey(EncryptionWithColumnKey&& other213) noexcept { + path_in_schema = std::move(other213.path_in_schema); + key_metadata = std::move(other213.key_metadata); + __isset = other213.__isset; } -EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other198) { - path_in_schema = other198.path_in_schema; - key_metadata = other198.key_metadata; - __isset = other198.__isset; +EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other214) { + path_in_schema = other214.path_in_schema; + key_metadata = other214.key_metadata; + __isset = other214.__isset; return *this; } -EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(EncryptionWithColumnKey&& other199) noexcept { - path_in_schema = std::move(other199.path_in_schema); - key_metadata = std::move(other199.key_metadata); - __isset = other199.__isset; +EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(EncryptionWithColumnKey&& other215) noexcept { + path_in_schema = std::move(other215.path_in_schema); + key_metadata = std::move(other215.key_metadata); + __isset = other215.__isset; return *this; } void EncryptionWithColumnKey::printTo(std::ostream& out) const { @@ -6161,26 +6448,26 @@ void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b) { swap(a.__isset, b.__isset); } -ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other200) { - ENCRYPTION_WITH_FOOTER_KEY = other200.ENCRYPTION_WITH_FOOTER_KEY; - ENCRYPTION_WITH_COLUMN_KEY = other200.ENCRYPTION_WITH_COLUMN_KEY; - __isset = other200.__isset; +ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other216) { + 
ENCRYPTION_WITH_FOOTER_KEY = other216.ENCRYPTION_WITH_FOOTER_KEY; + ENCRYPTION_WITH_COLUMN_KEY = other216.ENCRYPTION_WITH_COLUMN_KEY; + __isset = other216.__isset; } -ColumnCryptoMetaData::ColumnCryptoMetaData(ColumnCryptoMetaData&& other201) noexcept { - ENCRYPTION_WITH_FOOTER_KEY = std::move(other201.ENCRYPTION_WITH_FOOTER_KEY); - ENCRYPTION_WITH_COLUMN_KEY = std::move(other201.ENCRYPTION_WITH_COLUMN_KEY); - __isset = other201.__isset; +ColumnCryptoMetaData::ColumnCryptoMetaData(ColumnCryptoMetaData&& other217) noexcept { + ENCRYPTION_WITH_FOOTER_KEY = std::move(other217.ENCRYPTION_WITH_FOOTER_KEY); + ENCRYPTION_WITH_COLUMN_KEY = std::move(other217.ENCRYPTION_WITH_COLUMN_KEY); + __isset = other217.__isset; } -ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other202) { - ENCRYPTION_WITH_FOOTER_KEY = other202.ENCRYPTION_WITH_FOOTER_KEY; - ENCRYPTION_WITH_COLUMN_KEY = other202.ENCRYPTION_WITH_COLUMN_KEY; - __isset = other202.__isset; +ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other218) { + ENCRYPTION_WITH_FOOTER_KEY = other218.ENCRYPTION_WITH_FOOTER_KEY; + ENCRYPTION_WITH_COLUMN_KEY = other218.ENCRYPTION_WITH_COLUMN_KEY; + __isset = other218.__isset; return *this; } -ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(ColumnCryptoMetaData&& other203) noexcept { - ENCRYPTION_WITH_FOOTER_KEY = std::move(other203.ENCRYPTION_WITH_FOOTER_KEY); - ENCRYPTION_WITH_COLUMN_KEY = std::move(other203.ENCRYPTION_WITH_COLUMN_KEY); - __isset = other203.__isset; +ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(ColumnCryptoMetaData&& other219) noexcept { + ENCRYPTION_WITH_FOOTER_KEY = std::move(other219.ENCRYPTION_WITH_FOOTER_KEY); + ENCRYPTION_WITH_COLUMN_KEY = std::move(other219.ENCRYPTION_WITH_COLUMN_KEY); + __isset = other219.__isset; return *this; } void ColumnCryptoMetaData::printTo(std::ostream& out) const { @@ -6422,54 +6709,54 @@ void swap(ColumnChunk &a, ColumnChunk &b) { swap(a.__isset, 
b.__isset); } -ColumnChunk::ColumnChunk(const ColumnChunk& other204) { - file_path = other204.file_path; - file_offset = other204.file_offset; - meta_data = other204.meta_data; - offset_index_offset = other204.offset_index_offset; - offset_index_length = other204.offset_index_length; - column_index_offset = other204.column_index_offset; - column_index_length = other204.column_index_length; - crypto_metadata = other204.crypto_metadata; - encrypted_column_metadata = other204.encrypted_column_metadata; - __isset = other204.__isset; -} -ColumnChunk::ColumnChunk(ColumnChunk&& other205) noexcept { - file_path = std::move(other205.file_path); - file_offset = other205.file_offset; - meta_data = std::move(other205.meta_data); - offset_index_offset = other205.offset_index_offset; - offset_index_length = other205.offset_index_length; - column_index_offset = other205.column_index_offset; - column_index_length = other205.column_index_length; - crypto_metadata = std::move(other205.crypto_metadata); - encrypted_column_metadata = std::move(other205.encrypted_column_metadata); - __isset = other205.__isset; -} -ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other206) { - file_path = other206.file_path; - file_offset = other206.file_offset; - meta_data = other206.meta_data; - offset_index_offset = other206.offset_index_offset; - offset_index_length = other206.offset_index_length; - column_index_offset = other206.column_index_offset; - column_index_length = other206.column_index_length; - crypto_metadata = other206.crypto_metadata; - encrypted_column_metadata = other206.encrypted_column_metadata; - __isset = other206.__isset; +ColumnChunk::ColumnChunk(const ColumnChunk& other220) { + file_path = other220.file_path; + file_offset = other220.file_offset; + meta_data = other220.meta_data; + offset_index_offset = other220.offset_index_offset; + offset_index_length = other220.offset_index_length; + column_index_offset = other220.column_index_offset; + column_index_length = 
other220.column_index_length; + crypto_metadata = other220.crypto_metadata; + encrypted_column_metadata = other220.encrypted_column_metadata; + __isset = other220.__isset; +} +ColumnChunk::ColumnChunk(ColumnChunk&& other221) noexcept { + file_path = std::move(other221.file_path); + file_offset = other221.file_offset; + meta_data = std::move(other221.meta_data); + offset_index_offset = other221.offset_index_offset; + offset_index_length = other221.offset_index_length; + column_index_offset = other221.column_index_offset; + column_index_length = other221.column_index_length; + crypto_metadata = std::move(other221.crypto_metadata); + encrypted_column_metadata = std::move(other221.encrypted_column_metadata); + __isset = other221.__isset; +} +ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other222) { + file_path = other222.file_path; + file_offset = other222.file_offset; + meta_data = other222.meta_data; + offset_index_offset = other222.offset_index_offset; + offset_index_length = other222.offset_index_length; + column_index_offset = other222.column_index_offset; + column_index_length = other222.column_index_length; + crypto_metadata = other222.crypto_metadata; + encrypted_column_metadata = other222.encrypted_column_metadata; + __isset = other222.__isset; return *this; } -ColumnChunk& ColumnChunk::operator=(ColumnChunk&& other207) noexcept { - file_path = std::move(other207.file_path); - file_offset = other207.file_offset; - meta_data = std::move(other207.meta_data); - offset_index_offset = other207.offset_index_offset; - offset_index_length = other207.offset_index_length; - column_index_offset = other207.column_index_offset; - column_index_length = other207.column_index_length; - crypto_metadata = std::move(other207.crypto_metadata); - encrypted_column_metadata = std::move(other207.encrypted_column_metadata); - __isset = other207.__isset; +ColumnChunk& ColumnChunk::operator=(ColumnChunk&& other223) noexcept { + file_path = std::move(other223.file_path); + 
file_offset = other223.file_offset; + meta_data = std::move(other223.meta_data); + offset_index_offset = other223.offset_index_offset; + offset_index_length = other223.offset_index_length; + column_index_offset = other223.column_index_offset; + column_index_length = other223.column_index_length; + crypto_metadata = std::move(other223.crypto_metadata); + encrypted_column_metadata = std::move(other223.encrypted_column_metadata); + __isset = other223.__isset; return *this; } void ColumnChunk::printTo(std::ostream& out) const { @@ -6558,14 +6845,14 @@ uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->columns.clear(); - uint32_t _size208; - ::apache::thrift::protocol::TType _etype211; - xfer += iprot->readListBegin(_etype211, _size208); - this->columns.resize(_size208); - uint32_t _i212; - for (_i212 = 0; _i212 < _size208; ++_i212) + uint32_t _size224; + ::apache::thrift::protocol::TType _etype227; + xfer += iprot->readListBegin(_etype227, _size224); + this->columns.resize(_size224); + uint32_t _i228; + for (_i228 = 0; _i228 < _size224; ++_i228) { - xfer += this->columns[_i212].read(iprot); + xfer += this->columns[_i228].read(iprot); } xfer += iprot->readListEnd(); } @@ -6594,14 +6881,14 @@ uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->sorting_columns.clear(); - uint32_t _size213; - ::apache::thrift::protocol::TType _etype216; - xfer += iprot->readListBegin(_etype216, _size213); - this->sorting_columns.resize(_size213); - uint32_t _i217; - for (_i217 = 0; _i217 < _size213; ++_i217) + uint32_t _size229; + ::apache::thrift::protocol::TType _etype232; + xfer += iprot->readListBegin(_etype232, _size229); + this->sorting_columns.resize(_size229); + uint32_t _i233; + for (_i233 = 0; _i233 < _size229; ++_i233) { - xfer += this->sorting_columns[_i217].read(iprot); + xfer += this->sorting_columns[_i233].read(iprot); 
} xfer += iprot->readListEnd(); } @@ -6660,10 +6947,10 @@ uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeFieldBegin("columns", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->columns.size())); - std::vector ::const_iterator _iter218; - for (_iter218 = this->columns.begin(); _iter218 != this->columns.end(); ++_iter218) + std::vector ::const_iterator _iter234; + for (_iter234 = this->columns.begin(); _iter234 != this->columns.end(); ++_iter234) { - xfer += (*_iter218).write(oprot); + xfer += (*_iter234).write(oprot); } xfer += oprot->writeListEnd(); } @@ -6681,10 +6968,10 @@ uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeFieldBegin("sorting_columns", ::apache::thrift::protocol::T_LIST, 4); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->sorting_columns.size())); - std::vector ::const_iterator _iter219; - for (_iter219 = this->sorting_columns.begin(); _iter219 != this->sorting_columns.end(); ++_iter219) + std::vector ::const_iterator _iter235; + for (_iter235 = this->sorting_columns.begin(); _iter235 != this->sorting_columns.end(); ++_iter235) { - xfer += (*_iter219).write(oprot); + xfer += (*_iter235).write(oprot); } xfer += oprot->writeListEnd(); } @@ -6722,46 +7009,46 @@ void swap(RowGroup &a, RowGroup &b) { swap(a.__isset, b.__isset); } -RowGroup::RowGroup(const RowGroup& other220) { - columns = other220.columns; - total_byte_size = other220.total_byte_size; - num_rows = other220.num_rows; - sorting_columns = other220.sorting_columns; - file_offset = other220.file_offset; - total_compressed_size = other220.total_compressed_size; - ordinal = other220.ordinal; - __isset = other220.__isset; -} -RowGroup::RowGroup(RowGroup&& other221) noexcept { - columns = std::move(other221.columns); - total_byte_size = other221.total_byte_size; - num_rows = 
other221.num_rows; - sorting_columns = std::move(other221.sorting_columns); - file_offset = other221.file_offset; - total_compressed_size = other221.total_compressed_size; - ordinal = other221.ordinal; - __isset = other221.__isset; -} -RowGroup& RowGroup::operator=(const RowGroup& other222) { - columns = other222.columns; - total_byte_size = other222.total_byte_size; - num_rows = other222.num_rows; - sorting_columns = other222.sorting_columns; - file_offset = other222.file_offset; - total_compressed_size = other222.total_compressed_size; - ordinal = other222.ordinal; - __isset = other222.__isset; +RowGroup::RowGroup(const RowGroup& other236) { + columns = other236.columns; + total_byte_size = other236.total_byte_size; + num_rows = other236.num_rows; + sorting_columns = other236.sorting_columns; + file_offset = other236.file_offset; + total_compressed_size = other236.total_compressed_size; + ordinal = other236.ordinal; + __isset = other236.__isset; +} +RowGroup::RowGroup(RowGroup&& other237) noexcept { + columns = std::move(other237.columns); + total_byte_size = other237.total_byte_size; + num_rows = other237.num_rows; + sorting_columns = std::move(other237.sorting_columns); + file_offset = other237.file_offset; + total_compressed_size = other237.total_compressed_size; + ordinal = other237.ordinal; + __isset = other237.__isset; +} +RowGroup& RowGroup::operator=(const RowGroup& other238) { + columns = other238.columns; + total_byte_size = other238.total_byte_size; + num_rows = other238.num_rows; + sorting_columns = other238.sorting_columns; + file_offset = other238.file_offset; + total_compressed_size = other238.total_compressed_size; + ordinal = other238.ordinal; + __isset = other238.__isset; return *this; } -RowGroup& RowGroup::operator=(RowGroup&& other223) noexcept { - columns = std::move(other223.columns); - total_byte_size = other223.total_byte_size; - num_rows = other223.num_rows; - sorting_columns = std::move(other223.sorting_columns); - file_offset = 
other223.file_offset; - total_compressed_size = other223.total_compressed_size; - ordinal = other223.ordinal; - __isset = other223.__isset; +RowGroup& RowGroup::operator=(RowGroup&& other239) noexcept { + columns = std::move(other239.columns); + total_byte_size = other239.total_byte_size; + num_rows = other239.num_rows; + sorting_columns = std::move(other239.sorting_columns); + file_offset = other239.file_offset; + total_compressed_size = other239.total_compressed_size; + ordinal = other239.ordinal; + __isset = other239.__isset; return *this; } void RowGroup::printTo(std::ostream& out) const { @@ -6832,18 +7119,18 @@ void swap(TypeDefinedOrder &a, TypeDefinedOrder &b) { (void) b; } -TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other224) noexcept { - (void) other224; +TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other240) noexcept { + (void) other240; } -TypeDefinedOrder::TypeDefinedOrder(TypeDefinedOrder&& other225) noexcept { - (void) other225; +TypeDefinedOrder::TypeDefinedOrder(TypeDefinedOrder&& other241) noexcept { + (void) other241; } -TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other226) noexcept { - (void) other226; +TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other242) noexcept { + (void) other242; return *this; } -TypeDefinedOrder& TypeDefinedOrder::operator=(TypeDefinedOrder&& other227) noexcept { - (void) other227; +TypeDefinedOrder& TypeDefinedOrder::operator=(TypeDefinedOrder&& other243) noexcept { + (void) other243; return *this; } void TypeDefinedOrder::printTo(std::ostream& out) const { @@ -6930,22 +7217,22 @@ void swap(ColumnOrder &a, ColumnOrder &b) { swap(a.__isset, b.__isset); } -ColumnOrder::ColumnOrder(const ColumnOrder& other228) noexcept { - TYPE_ORDER = other228.TYPE_ORDER; - __isset = other228.__isset; +ColumnOrder::ColumnOrder(const ColumnOrder& other244) noexcept { + TYPE_ORDER = other244.TYPE_ORDER; + __isset = other244.__isset; } 
-ColumnOrder::ColumnOrder(ColumnOrder&& other229) noexcept { - TYPE_ORDER = std::move(other229.TYPE_ORDER); - __isset = other229.__isset; +ColumnOrder::ColumnOrder(ColumnOrder&& other245) noexcept { + TYPE_ORDER = std::move(other245.TYPE_ORDER); + __isset = other245.__isset; } -ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other230) noexcept { - TYPE_ORDER = other230.TYPE_ORDER; - __isset = other230.__isset; +ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other246) noexcept { + TYPE_ORDER = other246.TYPE_ORDER; + __isset = other246.__isset; return *this; } -ColumnOrder& ColumnOrder::operator=(ColumnOrder&& other231) noexcept { - TYPE_ORDER = std::move(other231.TYPE_ORDER); - __isset = other231.__isset; +ColumnOrder& ColumnOrder::operator=(ColumnOrder&& other247) noexcept { + TYPE_ORDER = std::move(other247.TYPE_ORDER); + __isset = other247.__isset; return *this; } void ColumnOrder::printTo(std::ostream& out) const { @@ -7073,26 +7360,26 @@ void swap(PageLocation &a, PageLocation &b) { swap(a.first_row_index, b.first_row_index); } -PageLocation::PageLocation(const PageLocation& other232) noexcept { - offset = other232.offset; - compressed_page_size = other232.compressed_page_size; - first_row_index = other232.first_row_index; +PageLocation::PageLocation(const PageLocation& other248) noexcept { + offset = other248.offset; + compressed_page_size = other248.compressed_page_size; + first_row_index = other248.first_row_index; } -PageLocation::PageLocation(PageLocation&& other233) noexcept { - offset = other233.offset; - compressed_page_size = other233.compressed_page_size; - first_row_index = other233.first_row_index; +PageLocation::PageLocation(PageLocation&& other249) noexcept { + offset = other249.offset; + compressed_page_size = other249.compressed_page_size; + first_row_index = other249.first_row_index; } -PageLocation& PageLocation::operator=(const PageLocation& other234) noexcept { - offset = other234.offset; - compressed_page_size = 
other234.compressed_page_size; - first_row_index = other234.first_row_index; +PageLocation& PageLocation::operator=(const PageLocation& other250) noexcept { + offset = other250.offset; + compressed_page_size = other250.compressed_page_size; + first_row_index = other250.first_row_index; return *this; } -PageLocation& PageLocation::operator=(PageLocation&& other235) noexcept { - offset = other235.offset; - compressed_page_size = other235.compressed_page_size; - first_row_index = other235.first_row_index; +PageLocation& PageLocation::operator=(PageLocation&& other251) noexcept { + offset = other251.offset; + compressed_page_size = other251.compressed_page_size; + first_row_index = other251.first_row_index; return *this; } void PageLocation::printTo(std::ostream& out) const { @@ -7112,6 +7399,11 @@ OffsetIndex::~OffsetIndex() noexcept { void OffsetIndex::__set_page_locations(const std::vector & val) { this->page_locations = val; } + +void OffsetIndex::__set_unencoded_byte_array_data_bytes(const std::vector & val) { + this->unencoded_byte_array_data_bytes = val; +__isset.unencoded_byte_array_data_bytes = true; +} std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj) { obj.printTo(out); @@ -7145,14 +7437,14 @@ uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->page_locations.clear(); - uint32_t _size236; - ::apache::thrift::protocol::TType _etype239; - xfer += iprot->readListBegin(_etype239, _size236); - this->page_locations.resize(_size236); - uint32_t _i240; - for (_i240 = 0; _i240 < _size236; ++_i240) + uint32_t _size252; + ::apache::thrift::protocol::TType _etype255; + xfer += iprot->readListBegin(_etype255, _size252); + this->page_locations.resize(_size252); + uint32_t _i256; + for (_i256 = 0; _i256 < _size252; ++_i256) { - xfer += this->page_locations[_i240].read(iprot); + xfer += this->page_locations[_i256].read(iprot); } xfer += iprot->readListEnd(); } @@ -7161,6 
+7453,26 @@ uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 2: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->unencoded_byte_array_data_bytes.clear(); + uint32_t _size257; + ::apache::thrift::protocol::TType _etype260; + xfer += iprot->readListBegin(_etype260, _size257); + this->unencoded_byte_array_data_bytes.resize(_size257); + uint32_t _i261; + for (_i261 = 0; _i261 < _size257; ++_i261) + { + xfer += iprot->readI64(this->unencoded_byte_array_data_bytes[_i261]); + } + xfer += iprot->readListEnd(); + } + this->__isset.unencoded_byte_array_data_bytes = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -7183,15 +7495,28 @@ uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("page_locations", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->page_locations.size())); - std::vector ::const_iterator _iter241; - for (_iter241 = this->page_locations.begin(); _iter241 != this->page_locations.end(); ++_iter241) + std::vector ::const_iterator _iter262; + for (_iter262 = this->page_locations.begin(); _iter262 != this->page_locations.end(); ++_iter262) { - xfer += (*_iter241).write(oprot); + xfer += (*_iter262).write(oprot); } xfer += oprot->writeListEnd(); } xfer += oprot->writeFieldEnd(); + if (this->__isset.unencoded_byte_array_data_bytes) { + xfer += oprot->writeFieldBegin("unencoded_byte_array_data_bytes", ::apache::thrift::protocol::T_LIST, 2); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->unencoded_byte_array_data_bytes.size())); + std::vector ::const_iterator _iter263; + for (_iter263 = this->unencoded_byte_array_data_bytes.begin(); _iter263 != this->unencoded_byte_array_data_bytes.end(); ++_iter263) + { + xfer += oprot->writeI64((*_iter263)); + } + 
xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -7200,26 +7525,37 @@ uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const void swap(OffsetIndex &a, OffsetIndex &b) { using ::std::swap; swap(a.page_locations, b.page_locations); + swap(a.unencoded_byte_array_data_bytes, b.unencoded_byte_array_data_bytes); + swap(a.__isset, b.__isset); } -OffsetIndex::OffsetIndex(const OffsetIndex& other242) { - page_locations = other242.page_locations; +OffsetIndex::OffsetIndex(const OffsetIndex& other264) { + page_locations = other264.page_locations; + unencoded_byte_array_data_bytes = other264.unencoded_byte_array_data_bytes; + __isset = other264.__isset; } -OffsetIndex::OffsetIndex(OffsetIndex&& other243) noexcept { - page_locations = std::move(other243.page_locations); +OffsetIndex::OffsetIndex(OffsetIndex&& other265) noexcept { + page_locations = std::move(other265.page_locations); + unencoded_byte_array_data_bytes = std::move(other265.unencoded_byte_array_data_bytes); + __isset = other265.__isset; } -OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other244) { - page_locations = other244.page_locations; +OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other266) { + page_locations = other266.page_locations; + unencoded_byte_array_data_bytes = other266.unencoded_byte_array_data_bytes; + __isset = other266.__isset; return *this; } -OffsetIndex& OffsetIndex::operator=(OffsetIndex&& other245) noexcept { - page_locations = std::move(other245.page_locations); +OffsetIndex& OffsetIndex::operator=(OffsetIndex&& other267) noexcept { + page_locations = std::move(other267.page_locations); + unencoded_byte_array_data_bytes = std::move(other267.unencoded_byte_array_data_bytes); + __isset = other267.__isset; return *this; } void OffsetIndex::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "OffsetIndex("; out << 
"page_locations=" << to_string(page_locations); + out << ", " << "unencoded_byte_array_data_bytes="; (__isset.unencoded_byte_array_data_bytes ? (out << to_string(unencoded_byte_array_data_bytes)) : (out << "")); out << ")"; } @@ -7248,6 +7584,16 @@ void ColumnIndex::__set_null_counts(const std::vector & val) { this->null_counts = val; __isset.null_counts = true; } + +void ColumnIndex::__set_repetition_level_histograms(const std::vector & val) { + this->repetition_level_histograms = val; +__isset.repetition_level_histograms = true; +} + +void ColumnIndex::__set_definition_level_histograms(const std::vector & val) { + this->definition_level_histograms = val; +__isset.definition_level_histograms = true; +} std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj) { obj.printTo(out); @@ -7284,14 +7630,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->null_pages.clear(); - uint32_t _size246; - ::apache::thrift::protocol::TType _etype249; - xfer += iprot->readListBegin(_etype249, _size246); - this->null_pages.resize(_size246); - uint32_t _i250; - for (_i250 = 0; _i250 < _size246; ++_i250) + uint32_t _size268; + ::apache::thrift::protocol::TType _etype271; + xfer += iprot->readListBegin(_etype271, _size268); + this->null_pages.resize(_size268); + uint32_t _i272; + for (_i272 = 0; _i272 < _size268; ++_i272) { - xfer += iprot->readBool(this->null_pages[_i250]); + xfer += iprot->readBool(this->null_pages[_i272]); } xfer += iprot->readListEnd(); } @@ -7304,14 +7650,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->min_values.clear(); - uint32_t _size251; - ::apache::thrift::protocol::TType _etype254; - xfer += iprot->readListBegin(_etype254, _size251); - this->min_values.resize(_size251); - uint32_t _i255; - for (_i255 = 0; _i255 < _size251; ++_i255) + uint32_t _size273; + 
::apache::thrift::protocol::TType _etype276; + xfer += iprot->readListBegin(_etype276, _size273); + this->min_values.resize(_size273); + uint32_t _i277; + for (_i277 = 0; _i277 < _size273; ++_i277) { - xfer += iprot->readBinary(this->min_values[_i255]); + xfer += iprot->readBinary(this->min_values[_i277]); } xfer += iprot->readListEnd(); } @@ -7324,14 +7670,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->max_values.clear(); - uint32_t _size256; - ::apache::thrift::protocol::TType _etype259; - xfer += iprot->readListBegin(_etype259, _size256); - this->max_values.resize(_size256); - uint32_t _i260; - for (_i260 = 0; _i260 < _size256; ++_i260) + uint32_t _size278; + ::apache::thrift::protocol::TType _etype281; + xfer += iprot->readListBegin(_etype281, _size278); + this->max_values.resize(_size278); + uint32_t _i282; + for (_i282 = 0; _i282 < _size278; ++_i282) { - xfer += iprot->readBinary(this->max_values[_i260]); + xfer += iprot->readBinary(this->max_values[_i282]); } xfer += iprot->readListEnd(); } @@ -7342,9 +7688,9 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast261; - xfer += iprot->readI32(ecast261); - this->boundary_order = static_cast(ecast261); + int32_t ecast283; + xfer += iprot->readI32(ecast283); + this->boundary_order = static_cast(ecast283); isset_boundary_order = true; } else { xfer += iprot->skip(ftype); @@ -7354,14 +7700,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->null_counts.clear(); - uint32_t _size262; - ::apache::thrift::protocol::TType _etype265; - xfer += iprot->readListBegin(_etype265, _size262); - this->null_counts.resize(_size262); - uint32_t _i266; - for (_i266 = 0; _i266 < _size262; ++_i266) + uint32_t _size284; + ::apache::thrift::protocol::TType 
_etype287; + xfer += iprot->readListBegin(_etype287, _size284); + this->null_counts.resize(_size284); + uint32_t _i288; + for (_i288 = 0; _i288 < _size284; ++_i288) { - xfer += iprot->readI64(this->null_counts[_i266]); + xfer += iprot->readI64(this->null_counts[_i288]); } xfer += iprot->readListEnd(); } @@ -7370,6 +7716,46 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 6: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->repetition_level_histograms.clear(); + uint32_t _size289; + ::apache::thrift::protocol::TType _etype292; + xfer += iprot->readListBegin(_etype292, _size289); + this->repetition_level_histograms.resize(_size289); + uint32_t _i293; + for (_i293 = 0; _i293 < _size289; ++_i293) + { + xfer += iprot->readI64(this->repetition_level_histograms[_i293]); + } + xfer += iprot->readListEnd(); + } + this->__isset.repetition_level_histograms = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 7: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->definition_level_histograms.clear(); + uint32_t _size294; + ::apache::thrift::protocol::TType _etype297; + xfer += iprot->readListBegin(_etype297, _size294); + this->definition_level_histograms.resize(_size294); + uint32_t _i298; + for (_i298 = 0; _i298 < _size294; ++_i298) + { + xfer += iprot->readI64(this->definition_level_histograms[_i298]); + } + xfer += iprot->readListEnd(); + } + this->__isset.definition_level_histograms = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -7398,10 +7784,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast(this->null_pages.size())); - std::vector ::const_iterator _iter267; - for (_iter267 = 
this->null_pages.begin(); _iter267 != this->null_pages.end(); ++_iter267) + std::vector ::const_iterator _iter299; + for (_iter299 = this->null_pages.begin(); _iter299 != this->null_pages.end(); ++_iter299) { - xfer += oprot->writeBool((*_iter267)); + xfer += oprot->writeBool((*_iter299)); } xfer += oprot->writeListEnd(); } @@ -7410,10 +7796,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->min_values.size())); - std::vector ::const_iterator _iter268; - for (_iter268 = this->min_values.begin(); _iter268 != this->min_values.end(); ++_iter268) + std::vector ::const_iterator _iter300; + for (_iter300 = this->min_values.begin(); _iter300 != this->min_values.end(); ++_iter300) { - xfer += oprot->writeBinary((*_iter268)); + xfer += oprot->writeBinary((*_iter300)); } xfer += oprot->writeListEnd(); } @@ -7422,10 +7808,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->max_values.size())); - std::vector ::const_iterator _iter269; - for (_iter269 = this->max_values.begin(); _iter269 != this->max_values.end(); ++_iter269) + std::vector ::const_iterator _iter301; + for (_iter301 = this->max_values.begin(); _iter301 != this->max_values.end(); ++_iter301) { - xfer += oprot->writeBinary((*_iter269)); + xfer += oprot->writeBinary((*_iter301)); } xfer += oprot->writeListEnd(); } @@ -7439,10 +7825,36 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->null_counts.size())); - 
std::vector ::const_iterator _iter270; - for (_iter270 = this->null_counts.begin(); _iter270 != this->null_counts.end(); ++_iter270) + std::vector ::const_iterator _iter302; + for (_iter302 = this->null_counts.begin(); _iter302 != this->null_counts.end(); ++_iter302) + { + xfer += oprot->writeI64((*_iter302)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.repetition_level_histograms) { + xfer += oprot->writeFieldBegin("repetition_level_histograms", ::apache::thrift::protocol::T_LIST, 6); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->repetition_level_histograms.size())); + std::vector ::const_iterator _iter303; + for (_iter303 = this->repetition_level_histograms.begin(); _iter303 != this->repetition_level_histograms.end(); ++_iter303) { - xfer += oprot->writeI64((*_iter270)); + xfer += oprot->writeI64((*_iter303)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.definition_level_histograms) { + xfer += oprot->writeFieldBegin("definition_level_histograms", ::apache::thrift::protocol::T_LIST, 7); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->definition_level_histograms.size())); + std::vector ::const_iterator _iter304; + for (_iter304 = this->definition_level_histograms.begin(); _iter304 != this->definition_level_histograms.end(); ++_iter304) + { + xfer += oprot->writeI64((*_iter304)); } xfer += oprot->writeListEnd(); } @@ -7460,41 +7872,51 @@ void swap(ColumnIndex &a, ColumnIndex &b) { swap(a.max_values, b.max_values); swap(a.boundary_order, b.boundary_order); swap(a.null_counts, b.null_counts); + swap(a.repetition_level_histograms, b.repetition_level_histograms); + swap(a.definition_level_histograms, b.definition_level_histograms); swap(a.__isset, b.__isset); } -ColumnIndex::ColumnIndex(const ColumnIndex& other271) { - null_pages = other271.null_pages; - min_values = 
other271.min_values; - max_values = other271.max_values; - boundary_order = other271.boundary_order; - null_counts = other271.null_counts; - __isset = other271.__isset; -} -ColumnIndex::ColumnIndex(ColumnIndex&& other272) noexcept { - null_pages = std::move(other272.null_pages); - min_values = std::move(other272.min_values); - max_values = std::move(other272.max_values); - boundary_order = other272.boundary_order; - null_counts = std::move(other272.null_counts); - __isset = other272.__isset; -} -ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other273) { - null_pages = other273.null_pages; - min_values = other273.min_values; - max_values = other273.max_values; - boundary_order = other273.boundary_order; - null_counts = other273.null_counts; - __isset = other273.__isset; +ColumnIndex::ColumnIndex(const ColumnIndex& other305) { + null_pages = other305.null_pages; + min_values = other305.min_values; + max_values = other305.max_values; + boundary_order = other305.boundary_order; + null_counts = other305.null_counts; + repetition_level_histograms = other305.repetition_level_histograms; + definition_level_histograms = other305.definition_level_histograms; + __isset = other305.__isset; +} +ColumnIndex::ColumnIndex(ColumnIndex&& other306) noexcept { + null_pages = std::move(other306.null_pages); + min_values = std::move(other306.min_values); + max_values = std::move(other306.max_values); + boundary_order = other306.boundary_order; + null_counts = std::move(other306.null_counts); + repetition_level_histograms = std::move(other306.repetition_level_histograms); + definition_level_histograms = std::move(other306.definition_level_histograms); + __isset = other306.__isset; +} +ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other307) { + null_pages = other307.null_pages; + min_values = other307.min_values; + max_values = other307.max_values; + boundary_order = other307.boundary_order; + null_counts = other307.null_counts; + repetition_level_histograms = 
other307.repetition_level_histograms; + definition_level_histograms = other307.definition_level_histograms; + __isset = other307.__isset; return *this; } -ColumnIndex& ColumnIndex::operator=(ColumnIndex&& other274) noexcept { - null_pages = std::move(other274.null_pages); - min_values = std::move(other274.min_values); - max_values = std::move(other274.max_values); - boundary_order = other274.boundary_order; - null_counts = std::move(other274.null_counts); - __isset = other274.__isset; +ColumnIndex& ColumnIndex::operator=(ColumnIndex&& other308) noexcept { + null_pages = std::move(other308.null_pages); + min_values = std::move(other308.min_values); + max_values = std::move(other308.max_values); + boundary_order = other308.boundary_order; + null_counts = std::move(other308.null_counts); + repetition_level_histograms = std::move(other308.repetition_level_histograms); + definition_level_histograms = std::move(other308.definition_level_histograms); + __isset = other308.__isset; return *this; } void ColumnIndex::printTo(std::ostream& out) const { @@ -7505,6 +7927,8 @@ void ColumnIndex::printTo(std::ostream& out) const { out << ", " << "max_values=" << to_string(max_values); out << ", " << "boundary_order=" << to_string(boundary_order); out << ", " << "null_counts="; (__isset.null_counts ? (out << to_string(null_counts)) : (out << "")); + out << ", " << "repetition_level_histograms="; (__isset.repetition_level_histograms ? (out << to_string(repetition_level_histograms)) : (out << "")); + out << ", " << "definition_level_histograms="; (__isset.definition_level_histograms ? 
(out << to_string(definition_level_histograms)) : (out << "")); out << ")"; } @@ -7624,30 +8048,30 @@ void swap(AesGcmV1 &a, AesGcmV1 &b) { swap(a.__isset, b.__isset); } -AesGcmV1::AesGcmV1(const AesGcmV1& other275) { - aad_prefix = other275.aad_prefix; - aad_file_unique = other275.aad_file_unique; - supply_aad_prefix = other275.supply_aad_prefix; - __isset = other275.__isset; +AesGcmV1::AesGcmV1(const AesGcmV1& other309) { + aad_prefix = other309.aad_prefix; + aad_file_unique = other309.aad_file_unique; + supply_aad_prefix = other309.supply_aad_prefix; + __isset = other309.__isset; } -AesGcmV1::AesGcmV1(AesGcmV1&& other276) noexcept { - aad_prefix = std::move(other276.aad_prefix); - aad_file_unique = std::move(other276.aad_file_unique); - supply_aad_prefix = other276.supply_aad_prefix; - __isset = other276.__isset; +AesGcmV1::AesGcmV1(AesGcmV1&& other310) noexcept { + aad_prefix = std::move(other310.aad_prefix); + aad_file_unique = std::move(other310.aad_file_unique); + supply_aad_prefix = other310.supply_aad_prefix; + __isset = other310.__isset; } -AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other277) { - aad_prefix = other277.aad_prefix; - aad_file_unique = other277.aad_file_unique; - supply_aad_prefix = other277.supply_aad_prefix; - __isset = other277.__isset; +AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other311) { + aad_prefix = other311.aad_prefix; + aad_file_unique = other311.aad_file_unique; + supply_aad_prefix = other311.supply_aad_prefix; + __isset = other311.__isset; return *this; } -AesGcmV1& AesGcmV1::operator=(AesGcmV1&& other278) noexcept { - aad_prefix = std::move(other278.aad_prefix); - aad_file_unique = std::move(other278.aad_file_unique); - supply_aad_prefix = other278.supply_aad_prefix; - __isset = other278.__isset; +AesGcmV1& AesGcmV1::operator=(AesGcmV1&& other312) noexcept { + aad_prefix = std::move(other312.aad_prefix); + aad_file_unique = std::move(other312.aad_file_unique); + supply_aad_prefix = other312.supply_aad_prefix; + __isset = 
other312.__isset; return *this; } void AesGcmV1::printTo(std::ostream& out) const { @@ -7775,30 +8199,30 @@ void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b) { swap(a.__isset, b.__isset); } -AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other279) { - aad_prefix = other279.aad_prefix; - aad_file_unique = other279.aad_file_unique; - supply_aad_prefix = other279.supply_aad_prefix; - __isset = other279.__isset; +AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other313) { + aad_prefix = other313.aad_prefix; + aad_file_unique = other313.aad_file_unique; + supply_aad_prefix = other313.supply_aad_prefix; + __isset = other313.__isset; } -AesGcmCtrV1::AesGcmCtrV1(AesGcmCtrV1&& other280) noexcept { - aad_prefix = std::move(other280.aad_prefix); - aad_file_unique = std::move(other280.aad_file_unique); - supply_aad_prefix = other280.supply_aad_prefix; - __isset = other280.__isset; +AesGcmCtrV1::AesGcmCtrV1(AesGcmCtrV1&& other314) noexcept { + aad_prefix = std::move(other314.aad_prefix); + aad_file_unique = std::move(other314.aad_file_unique); + supply_aad_prefix = other314.supply_aad_prefix; + __isset = other314.__isset; } -AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other281) { - aad_prefix = other281.aad_prefix; - aad_file_unique = other281.aad_file_unique; - supply_aad_prefix = other281.supply_aad_prefix; - __isset = other281.__isset; +AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other315) { + aad_prefix = other315.aad_prefix; + aad_file_unique = other315.aad_file_unique; + supply_aad_prefix = other315.supply_aad_prefix; + __isset = other315.__isset; return *this; } -AesGcmCtrV1& AesGcmCtrV1::operator=(AesGcmCtrV1&& other282) noexcept { - aad_prefix = std::move(other282.aad_prefix); - aad_file_unique = std::move(other282.aad_file_unique); - supply_aad_prefix = other282.supply_aad_prefix; - __isset = other282.__isset; +AesGcmCtrV1& AesGcmCtrV1::operator=(AesGcmCtrV1&& other316) noexcept { + aad_prefix = std::move(other316.aad_prefix); + aad_file_unique = 
std::move(other316.aad_file_unique); + supply_aad_prefix = other316.supply_aad_prefix; + __isset = other316.__isset; return *this; } void AesGcmCtrV1::printTo(std::ostream& out) const { @@ -7907,26 +8331,26 @@ void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b) { swap(a.__isset, b.__isset); } -EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other283) { - AES_GCM_V1 = other283.AES_GCM_V1; - AES_GCM_CTR_V1 = other283.AES_GCM_CTR_V1; - __isset = other283.__isset; +EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other317) { + AES_GCM_V1 = other317.AES_GCM_V1; + AES_GCM_CTR_V1 = other317.AES_GCM_CTR_V1; + __isset = other317.__isset; } -EncryptionAlgorithm::EncryptionAlgorithm(EncryptionAlgorithm&& other284) noexcept { - AES_GCM_V1 = std::move(other284.AES_GCM_V1); - AES_GCM_CTR_V1 = std::move(other284.AES_GCM_CTR_V1); - __isset = other284.__isset; +EncryptionAlgorithm::EncryptionAlgorithm(EncryptionAlgorithm&& other318) noexcept { + AES_GCM_V1 = std::move(other318.AES_GCM_V1); + AES_GCM_CTR_V1 = std::move(other318.AES_GCM_CTR_V1); + __isset = other318.__isset; } -EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other285) { - AES_GCM_V1 = other285.AES_GCM_V1; - AES_GCM_CTR_V1 = other285.AES_GCM_CTR_V1; - __isset = other285.__isset; +EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other319) { + AES_GCM_V1 = other319.AES_GCM_V1; + AES_GCM_CTR_V1 = other319.AES_GCM_CTR_V1; + __isset = other319.__isset; return *this; } -EncryptionAlgorithm& EncryptionAlgorithm::operator=(EncryptionAlgorithm&& other286) noexcept { - AES_GCM_V1 = std::move(other286.AES_GCM_V1); - AES_GCM_CTR_V1 = std::move(other286.AES_GCM_CTR_V1); - __isset = other286.__isset; +EncryptionAlgorithm& EncryptionAlgorithm::operator=(EncryptionAlgorithm&& other320) noexcept { + AES_GCM_V1 = std::move(other320.AES_GCM_V1); + AES_GCM_CTR_V1 = std::move(other320.AES_GCM_CTR_V1); + __isset = other320.__isset; 
return *this; } void EncryptionAlgorithm::printTo(std::ostream& out) const { @@ -8026,14 +8450,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->schema.clear(); - uint32_t _size287; - ::apache::thrift::protocol::TType _etype290; - xfer += iprot->readListBegin(_etype290, _size287); - this->schema.resize(_size287); - uint32_t _i291; - for (_i291 = 0; _i291 < _size287; ++_i291) + uint32_t _size321; + ::apache::thrift::protocol::TType _etype324; + xfer += iprot->readListBegin(_etype324, _size321); + this->schema.resize(_size321); + uint32_t _i325; + for (_i325 = 0; _i325 < _size321; ++_i325) { - xfer += this->schema[_i291].read(iprot); + xfer += this->schema[_i325].read(iprot); } xfer += iprot->readListEnd(); } @@ -8054,14 +8478,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->row_groups.clear(); - uint32_t _size292; - ::apache::thrift::protocol::TType _etype295; - xfer += iprot->readListBegin(_etype295, _size292); - this->row_groups.resize(_size292); - uint32_t _i296; - for (_i296 = 0; _i296 < _size292; ++_i296) + uint32_t _size326; + ::apache::thrift::protocol::TType _etype329; + xfer += iprot->readListBegin(_etype329, _size326); + this->row_groups.resize(_size326); + uint32_t _i330; + for (_i330 = 0; _i330 < _size326; ++_i330) { - xfer += this->row_groups[_i296].read(iprot); + xfer += this->row_groups[_i330].read(iprot); } xfer += iprot->readListEnd(); } @@ -8074,14 +8498,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->key_value_metadata.clear(); - uint32_t _size297; - ::apache::thrift::protocol::TType _etype300; - xfer += iprot->readListBegin(_etype300, _size297); - this->key_value_metadata.resize(_size297); - uint32_t _i301; - for (_i301 = 0; _i301 < _size297; ++_i301) + uint32_t _size331; + 
::apache::thrift::protocol::TType _etype334; + xfer += iprot->readListBegin(_etype334, _size331); + this->key_value_metadata.resize(_size331); + uint32_t _i335; + for (_i335 = 0; _i335 < _size331; ++_i335) { - xfer += this->key_value_metadata[_i301].read(iprot); + xfer += this->key_value_metadata[_i335].read(iprot); } xfer += iprot->readListEnd(); } @@ -8102,14 +8526,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->column_orders.clear(); - uint32_t _size302; - ::apache::thrift::protocol::TType _etype305; - xfer += iprot->readListBegin(_etype305, _size302); - this->column_orders.resize(_size302); - uint32_t _i306; - for (_i306 = 0; _i306 < _size302; ++_i306) + uint32_t _size336; + ::apache::thrift::protocol::TType _etype339; + xfer += iprot->readListBegin(_etype339, _size336); + this->column_orders.resize(_size336); + uint32_t _i340; + for (_i340 = 0; _i340 < _size336; ++_i340) { - xfer += this->column_orders[_i306].read(iprot); + xfer += this->column_orders[_i340].read(iprot); } xfer += iprot->readListEnd(); } @@ -8166,10 +8590,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->schema.size())); - std::vector ::const_iterator _iter307; - for (_iter307 = this->schema.begin(); _iter307 != this->schema.end(); ++_iter307) + std::vector ::const_iterator _iter341; + for (_iter341 = this->schema.begin(); _iter341 != this->schema.end(); ++_iter341) { - xfer += (*_iter307).write(oprot); + xfer += (*_iter341).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8182,10 +8606,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4); { xfer += 
oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->row_groups.size())); - std::vector ::const_iterator _iter308; - for (_iter308 = this->row_groups.begin(); _iter308 != this->row_groups.end(); ++_iter308) + std::vector ::const_iterator _iter342; + for (_iter342 = this->row_groups.begin(); _iter342 != this->row_groups.end(); ++_iter342) { - xfer += (*_iter308).write(oprot); + xfer += (*_iter342).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8195,10 +8619,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->key_value_metadata.size())); - std::vector ::const_iterator _iter309; - for (_iter309 = this->key_value_metadata.begin(); _iter309 != this->key_value_metadata.end(); ++_iter309) + std::vector ::const_iterator _iter343; + for (_iter343 = this->key_value_metadata.begin(); _iter343 != this->key_value_metadata.end(); ++_iter343) { - xfer += (*_iter309).write(oprot); + xfer += (*_iter343).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8213,10 +8637,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->column_orders.size())); - std::vector ::const_iterator _iter310; - for (_iter310 = this->column_orders.begin(); _iter310 != this->column_orders.end(); ++_iter310) + std::vector ::const_iterator _iter344; + for (_iter344 = this->column_orders.begin(); _iter344 != this->column_orders.end(); ++_iter344) { - xfer += (*_iter310).write(oprot); + xfer += (*_iter344).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8251,54 +8675,54 @@ void swap(FileMetaData &a, FileMetaData &b) { swap(a.__isset, b.__isset); } 
-FileMetaData::FileMetaData(const FileMetaData& other311) { - version = other311.version; - schema = other311.schema; - num_rows = other311.num_rows; - row_groups = other311.row_groups; - key_value_metadata = other311.key_value_metadata; - created_by = other311.created_by; - column_orders = other311.column_orders; - encryption_algorithm = other311.encryption_algorithm; - footer_signing_key_metadata = other311.footer_signing_key_metadata; - __isset = other311.__isset; -} -FileMetaData::FileMetaData(FileMetaData&& other312) noexcept { - version = other312.version; - schema = std::move(other312.schema); - num_rows = other312.num_rows; - row_groups = std::move(other312.row_groups); - key_value_metadata = std::move(other312.key_value_metadata); - created_by = std::move(other312.created_by); - column_orders = std::move(other312.column_orders); - encryption_algorithm = std::move(other312.encryption_algorithm); - footer_signing_key_metadata = std::move(other312.footer_signing_key_metadata); - __isset = other312.__isset; -} -FileMetaData& FileMetaData::operator=(const FileMetaData& other313) { - version = other313.version; - schema = other313.schema; - num_rows = other313.num_rows; - row_groups = other313.row_groups; - key_value_metadata = other313.key_value_metadata; - created_by = other313.created_by; - column_orders = other313.column_orders; - encryption_algorithm = other313.encryption_algorithm; - footer_signing_key_metadata = other313.footer_signing_key_metadata; - __isset = other313.__isset; +FileMetaData::FileMetaData(const FileMetaData& other345) { + version = other345.version; + schema = other345.schema; + num_rows = other345.num_rows; + row_groups = other345.row_groups; + key_value_metadata = other345.key_value_metadata; + created_by = other345.created_by; + column_orders = other345.column_orders; + encryption_algorithm = other345.encryption_algorithm; + footer_signing_key_metadata = other345.footer_signing_key_metadata; + __isset = other345.__isset; +} 
+FileMetaData::FileMetaData(FileMetaData&& other346) noexcept { + version = other346.version; + schema = std::move(other346.schema); + num_rows = other346.num_rows; + row_groups = std::move(other346.row_groups); + key_value_metadata = std::move(other346.key_value_metadata); + created_by = std::move(other346.created_by); + column_orders = std::move(other346.column_orders); + encryption_algorithm = std::move(other346.encryption_algorithm); + footer_signing_key_metadata = std::move(other346.footer_signing_key_metadata); + __isset = other346.__isset; +} +FileMetaData& FileMetaData::operator=(const FileMetaData& other347) { + version = other347.version; + schema = other347.schema; + num_rows = other347.num_rows; + row_groups = other347.row_groups; + key_value_metadata = other347.key_value_metadata; + created_by = other347.created_by; + column_orders = other347.column_orders; + encryption_algorithm = other347.encryption_algorithm; + footer_signing_key_metadata = other347.footer_signing_key_metadata; + __isset = other347.__isset; return *this; } -FileMetaData& FileMetaData::operator=(FileMetaData&& other314) noexcept { - version = other314.version; - schema = std::move(other314.schema); - num_rows = other314.num_rows; - row_groups = std::move(other314.row_groups); - key_value_metadata = std::move(other314.key_value_metadata); - created_by = std::move(other314.created_by); - column_orders = std::move(other314.column_orders); - encryption_algorithm = std::move(other314.encryption_algorithm); - footer_signing_key_metadata = std::move(other314.footer_signing_key_metadata); - __isset = other314.__isset; +FileMetaData& FileMetaData::operator=(FileMetaData&& other348) noexcept { + version = other348.version; + schema = std::move(other348.schema); + num_rows = other348.num_rows; + row_groups = std::move(other348.row_groups); + key_value_metadata = std::move(other348.key_value_metadata); + created_by = std::move(other348.created_by); + column_orders = 
std::move(other348.column_orders); + encryption_algorithm = std::move(other348.encryption_algorithm); + footer_signing_key_metadata = std::move(other348.footer_signing_key_metadata); + __isset = other348.__isset; return *this; } void FileMetaData::printTo(std::ostream& out) const { @@ -8414,26 +8838,26 @@ void swap(FileCryptoMetaData &a, FileCryptoMetaData &b) { swap(a.__isset, b.__isset); } -FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other315) { - encryption_algorithm = other315.encryption_algorithm; - key_metadata = other315.key_metadata; - __isset = other315.__isset; +FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other349) { + encryption_algorithm = other349.encryption_algorithm; + key_metadata = other349.key_metadata; + __isset = other349.__isset; } -FileCryptoMetaData::FileCryptoMetaData(FileCryptoMetaData&& other316) noexcept { - encryption_algorithm = std::move(other316.encryption_algorithm); - key_metadata = std::move(other316.key_metadata); - __isset = other316.__isset; +FileCryptoMetaData::FileCryptoMetaData(FileCryptoMetaData&& other350) noexcept { + encryption_algorithm = std::move(other350.encryption_algorithm); + key_metadata = std::move(other350.key_metadata); + __isset = other350.__isset; } -FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other317) { - encryption_algorithm = other317.encryption_algorithm; - key_metadata = other317.key_metadata; - __isset = other317.__isset; +FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other351) { + encryption_algorithm = other351.encryption_algorithm; + key_metadata = other351.key_metadata; + __isset = other351.__isset; return *this; } -FileCryptoMetaData& FileCryptoMetaData::operator=(FileCryptoMetaData&& other318) noexcept { - encryption_algorithm = std::move(other318.encryption_algorithm); - key_metadata = std::move(other318.key_metadata); - __isset = other318.__isset; +FileCryptoMetaData& 
FileCryptoMetaData::operator=(FileCryptoMetaData&& other352) noexcept { + encryption_algorithm = std::move(other352.encryption_algorithm); + key_metadata = std::move(other352.key_metadata); + __isset = other352.__isset; return *this; } void FileCryptoMetaData::printTo(std::ostream& out) const { diff --git a/cpp/src/generated/parquet_types.h b/cpp/src/generated/parquet_types.h index 199b4ae747667..9dc6794c4030b 100644 --- a/cpp/src/generated/parquet_types.h +++ b/cpp/src/generated/parquet_types.h @@ -1,5 +1,5 @@ /** - * Autogenerated by Thrift Compiler (0.18.1) + * Autogenerated by Thrift Compiler (0.19.0) * * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING * @generated @@ -345,6 +345,8 @@ std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val); std::string to_string(const BoundaryOrder::type& val); +class SizeStatistics; + class Statistics; class StringType; @@ -449,14 +451,121 @@ class FileMetaData; class FileCryptoMetaData; +typedef struct _SizeStatistics__isset { + _SizeStatistics__isset() : unencoded_byte_array_data_bytes(false), repetition_level_histogram(false), definition_level_histogram(false) {} + bool unencoded_byte_array_data_bytes :1; + bool repetition_level_histogram :1; + bool definition_level_histogram :1; +} _SizeStatistics__isset; + +/** + * A structure for capturing metadata for estimating the unencoded, + * uncompressed size of data written. This is useful for readers to estimate + * how much memory is needed to reconstruct data in their memory model and for + * fine grained filter pushdown on nested structures (the histograms contained + * in this structure can help determine the number of nulls at a particular + * nesting level and maximum length of lists). 
+ */ +class SizeStatistics : public virtual ::apache::thrift::TBase { + public: + + SizeStatistics(const SizeStatistics&); + SizeStatistics(SizeStatistics&&) noexcept; + SizeStatistics& operator=(const SizeStatistics&); + SizeStatistics& operator=(SizeStatistics&&) noexcept; + SizeStatistics() noexcept + : unencoded_byte_array_data_bytes(0) { + } + + virtual ~SizeStatistics() noexcept; + /** + * The number of physical bytes stored for BYTE_ARRAY data values assuming + * no encoding. This is exclusive of the bytes needed to store the length of + * each byte array. In other words, this field is equivalent to the `(size + * of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + * written)`. To determine unencoded sizes of other types readers can use + * schema information multiplied by the number of non-null and null values. + * The number of null/non-null values can be inferred from the histograms + * below. + * + * For example, if a column chunk is dictionary-encoded with dictionary + * ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + * then this value for that data page should be 7 (1 + 1 + 2 + 3). + * + * This field should only be set for types that use BYTE_ARRAY as their + * physical type. + */ + int64_t unencoded_byte_array_data_bytes; + /** + * When present, there is expected to be one element corresponding to each + * repetition (i.e. size=max repetition_level+1) where each element + * represents the number of times the repetition level was observed in the + * data. + * + * This field may be omitted if max_repetition_level is 0 without loss + * of information. + * + */ + std::vector repetition_level_histogram; + /** + * Same as repetition_level_histogram except for definition levels. + * + * This field may be omitted if max_definition_level is 0 or 1 without + * loss of information. 
+ * + */ + std::vector definition_level_histogram; + + _SizeStatistics__isset __isset; + + void __set_unencoded_byte_array_data_bytes(const int64_t val); + + void __set_repetition_level_histogram(const std::vector & val); + + void __set_definition_level_histogram(const std::vector & val); + + bool operator == (const SizeStatistics & rhs) const + { + if (__isset.unencoded_byte_array_data_bytes != rhs.__isset.unencoded_byte_array_data_bytes) + return false; + else if (__isset.unencoded_byte_array_data_bytes && !(unencoded_byte_array_data_bytes == rhs.unencoded_byte_array_data_bytes)) + return false; + if (__isset.repetition_level_histogram != rhs.__isset.repetition_level_histogram) + return false; + else if (__isset.repetition_level_histogram && !(repetition_level_histogram == rhs.repetition_level_histogram)) + return false; + if (__isset.definition_level_histogram != rhs.__isset.definition_level_histogram) + return false; + else if (__isset.definition_level_histogram && !(definition_level_histogram == rhs.definition_level_histogram)) + return false; + return true; + } + bool operator != (const SizeStatistics &rhs) const { + return !(*this == rhs); + } + + bool operator < (const SizeStatistics & ) const; + + uint32_t read(::apache::thrift::protocol::TProtocol* iprot) override; + uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const override; + + virtual void printTo(std::ostream& out) const; +}; + +void swap(SizeStatistics &a, SizeStatistics &b); + +std::ostream& operator<<(std::ostream& out, const SizeStatistics& obj); + typedef struct _Statistics__isset { - _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false) {} + _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false), is_max_value_exact(false), is_min_value_exact(false) {} bool max :1; bool min :1; bool null_count :1; bool distinct_count :1; bool max_value :1; 
bool min_value :1; + bool is_max_value_exact :1; + bool is_min_value_exact :1; } _Statistics__isset; /** @@ -476,7 +585,9 @@ class Statistics : public virtual ::apache::thrift::TBase { null_count(0), distinct_count(0), max_value(), - min_value() { + min_value(), + is_max_value_exact(0), + is_min_value_exact(0) { } virtual ~Statistics() noexcept; @@ -504,13 +615,27 @@ class Statistics : public virtual ::apache::thrift::TBase { */ int64_t distinct_count; /** - * Min and max values for the column, determined by its ColumnOrder. + * Lower and upper bound values for the column, determined by its ColumnOrder. + * + * These may be the actual minimum and maximum values found on a page or column + * chunk, but can also be (more compact) values that do not exist on a page or + * column chunk. For example, instead of storing "Blart Versenwald III", a writer + * may set min_value="B", max_value="C". Such more compact values must still be + * valid values within the column's logical type. * * Values are encoded using PLAIN encoding, except that variable-length byte * arrays do not include a length prefix. 
*/ std::string max_value; std::string min_value; + /** + * If true, max_value is the actual maximum value for a column + */ + bool is_max_value_exact; + /** + * If true, min_value is the actual minimum value for a column + */ + bool is_min_value_exact; _Statistics__isset __isset; @@ -526,6 +651,10 @@ class Statistics : public virtual ::apache::thrift::TBase { void __set_min_value(const std::string& val); + void __set_is_max_value_exact(const bool val); + + void __set_is_min_value_exact(const bool val); + bool operator == (const Statistics & rhs) const { if (__isset.max != rhs.__isset.max) @@ -552,6 +681,14 @@ class Statistics : public virtual ::apache::thrift::TBase { return false; else if (__isset.min_value && !(min_value == rhs.min_value)) return false; + if (__isset.is_max_value_exact != rhs.__isset.is_max_value_exact) + return false; + else if (__isset.is_max_value_exact && !(is_max_value_exact == rhs.is_max_value_exact)) + return false; + if (__isset.is_min_value_exact != rhs.__isset.is_min_value_exact) + return false; + else if (__isset.is_min_value_exact && !(is_min_value_exact == rhs.is_min_value_exact)) + return false; return true; } bool operator != (const Statistics &rhs) const { @@ -848,6 +985,9 @@ std::ostream& operator<<(std::ostream& out, const NullType& obj); /** * Decimal logical type annotation * + * Scale must be zero or a positive integer less than or equal to the precision. + * Precision must be a non-zero positive integer. + * * To maintain forward-compatibility in v1, implementations using this logical * type must also set scale and precision on the annotated SchemaElement. 
* @@ -1670,7 +1810,7 @@ class DataPageHeader : public virtual ::apache::thrift::TBase { */ Encoding::type repetition_level_encoding; /** - * Optional statistics for the data in this page* + * Optional statistics for the data in this page * */ Statistics statistics; @@ -1877,15 +2017,15 @@ class DataPageHeaderV2 : public virtual ::apache::thrift::TBase { */ Encoding::type encoding; /** - * length of the definition levels + * Length of the definition levels */ int32_t definition_levels_byte_length; /** - * length of the repetition levels + * Length of the repetition levels */ int32_t repetition_levels_byte_length; /** - * whether the values are compressed. + * Whether the values are compressed. * Which means the section of the page between * definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) * is compressed with the compression_codec. @@ -1893,7 +2033,7 @@ class DataPageHeaderV2 : public virtual ::apache::thrift::TBase { */ bool is_compressed; /** - * optional statistics for the data in this page * + * Optional statistics for the data in this page * */ Statistics statistics; @@ -2603,13 +2743,15 @@ void swap(PageEncodingStats &a, PageEncodingStats &b); std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj); typedef struct _ColumnMetaData__isset { - _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false) {} + _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false), bloom_filter_length(false), size_statistics(false) {} bool key_value_metadata :1; bool index_page_offset :1; bool dictionary_page_offset :1; bool statistics :1; bool encoding_stats :1; bool bloom_filter_offset :1; + bool bloom_filter_length :1; + bool size_statistics :1; } 
_ColumnMetaData__isset; /** @@ -2631,7 +2773,8 @@ class ColumnMetaData : public virtual ::apache::thrift::TBase { data_page_offset(0), index_page_offset(0), dictionary_page_offset(0), - bloom_filter_offset(0) { + bloom_filter_offset(0), + bloom_filter_length(0) { } virtual ~ColumnMetaData() noexcept; @@ -2699,6 +2842,21 @@ class ColumnMetaData : public virtual ::apache::thrift::TBase { * Byte offset from beginning of file to Bloom filter data. * */ int64_t bloom_filter_offset; + /** + * Size of Bloom filter data including the serialized header, in bytes. + * Added in 2.10 so readers may not read this field from old files and + * it can be obtained after the BloomFilterHeader has been deserialized. + * Writers should write this field so readers can read the bloom filter + * in a single I/O. + */ + int32_t bloom_filter_length; + /** + * Optional statistics to help estimate total memory when converted to in-memory + * representations. The histograms contained in these statistics can + * also be useful in some cases for more fine-grained nullability/list length + * filter pushdown. 
+ */ + SizeStatistics size_statistics; _ColumnMetaData__isset __isset; @@ -2730,6 +2888,10 @@ class ColumnMetaData : public virtual ::apache::thrift::TBase { void __set_bloom_filter_offset(const int64_t val); + void __set_bloom_filter_length(const int32_t val); + + void __set_size_statistics(const SizeStatistics& val); + bool operator == (const ColumnMetaData & rhs) const { if (!(type == rhs.type)) @@ -2772,6 +2934,14 @@ class ColumnMetaData : public virtual ::apache::thrift::TBase { return false; else if (__isset.bloom_filter_offset && !(bloom_filter_offset == rhs.bloom_filter_offset)) return false; + if (__isset.bloom_filter_length != rhs.__isset.bloom_filter_length) + return false; + else if (__isset.bloom_filter_length && !(bloom_filter_length == rhs.bloom_filter_length)) + return false; + if (__isset.size_statistics != rhs.__isset.size_statistics) + return false; + else if (__isset.size_statistics && !(size_statistics == rhs.size_statistics)) + return false; return true; } bool operator != (const ColumnMetaData &rhs) const { @@ -3403,6 +3573,10 @@ void swap(PageLocation &a, PageLocation &b); std::ostream& operator<<(std::ostream& out, const PageLocation& obj); +typedef struct _OffsetIndex__isset { + _OffsetIndex__isset() : unencoded_byte_array_data_bytes(false) {} + bool unencoded_byte_array_data_bytes :1; +} _OffsetIndex__isset; class OffsetIndex : public virtual ::apache::thrift::TBase { public: @@ -3420,13 +3594,28 @@ class OffsetIndex : public virtual ::apache::thrift::TBase { * that page_locations[i].first_row_index < page_locations[i+1].first_row_index. */ std::vector page_locations; + /** + * Unencoded/uncompressed size for BYTE_ARRAY types. + * + * See documention for unencoded_byte_array_data_bytes in SizeStatistics for + * more details on this field. 
+ */ + std::vector unencoded_byte_array_data_bytes; + + _OffsetIndex__isset __isset; void __set_page_locations(const std::vector & val); + void __set_unencoded_byte_array_data_bytes(const std::vector & val); + bool operator == (const OffsetIndex & rhs) const { if (!(page_locations == rhs.page_locations)) return false; + if (__isset.unencoded_byte_array_data_bytes != rhs.__isset.unencoded_byte_array_data_bytes) + return false; + else if (__isset.unencoded_byte_array_data_bytes && !(unencoded_byte_array_data_bytes == rhs.unencoded_byte_array_data_bytes)) + return false; return true; } bool operator != (const OffsetIndex &rhs) const { @@ -3446,8 +3635,10 @@ void swap(OffsetIndex &a, OffsetIndex &b); std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj); typedef struct _ColumnIndex__isset { - _ColumnIndex__isset() : null_counts(false) {} + _ColumnIndex__isset() : null_counts(false), repetition_level_histograms(false), definition_level_histograms(false) {} bool null_counts :1; + bool repetition_level_histograms :1; + bool definition_level_histograms :1; } _ColumnIndex__isset; /** @@ -3499,6 +3690,25 @@ class ColumnIndex : public virtual ::apache::thrift::TBase { * A list containing the number of null values for each page * */ std::vector null_counts; + /** + * Contains repetition level histograms for each page + * concatenated together. The repetition_level_histogram field on + * SizeStatistics contains more details. + * + * When present the length should always be (number of pages * + * (max_repetition_level + 1)) elements. + * + * Element 0 is the first element of the histogram for the first page. + * Element (max_repetition_level + 1) is the first element of the histogram + * for the second page. + * + */ + std::vector repetition_level_histograms; + /** + * Same as repetition_level_histograms except for definitions levels. 
+ * + */ + std::vector definition_level_histograms; _ColumnIndex__isset __isset; @@ -3512,6 +3722,10 @@ class ColumnIndex : public virtual ::apache::thrift::TBase { void __set_null_counts(const std::vector & val); + void __set_repetition_level_histograms(const std::vector & val); + + void __set_definition_level_histograms(const std::vector & val); + bool operator == (const ColumnIndex & rhs) const { if (!(null_pages == rhs.null_pages)) @@ -3526,6 +3740,14 @@ class ColumnIndex : public virtual ::apache::thrift::TBase { return false; else if (__isset.null_counts && !(null_counts == rhs.null_counts)) return false; + if (__isset.repetition_level_histograms != rhs.__isset.repetition_level_histograms) + return false; + else if (__isset.repetition_level_histograms && !(repetition_level_histograms == rhs.repetition_level_histograms)) + return false; + if (__isset.definition_level_histograms != rhs.__isset.definition_level_histograms) + return false; + else if (__isset.definition_level_histograms && !(definition_level_histograms == rhs.definition_level_histograms)) + return false; return true; } bool operator != (const ColumnIndex &rhs) const { diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 919c97f4323b6..2d20403eac075 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -830,6 +830,8 @@ class PathBuilder { // Types not yet supported in Parquet. 
NOT_IMPLEMENTED_VISIT(Union) NOT_IMPLEMENTED_VISIT(RunEndEncoded); + NOT_IMPLEMENTED_VISIT(ListView); + NOT_IMPLEMENTED_VISIT(LargeListView); #undef NOT_IMPLEMENTED_VISIT std::vector& paths() { return paths_; } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index a7e7b2f93e174..8b4f4d33d9699 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -131,6 +131,8 @@ struct ValueBufferSlicer { NOT_IMPLEMENTED_VISIT(Union); NOT_IMPLEMENTED_VISIT(List); NOT_IMPLEMENTED_VISIT(LargeList); + NOT_IMPLEMENTED_VISIT(ListView); + NOT_IMPLEMENTED_VISIT(LargeListView); NOT_IMPLEMENTED_VISIT(Struct); NOT_IMPLEMENTED_VISIT(FixedSizeList); NOT_IMPLEMENTED_VISIT(Dictionary); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 0d354f5c1ac0c..59fc848d7fd57 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -308,8 +308,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { ColumnProperties column_properties(encoding, compression, enable_dictionary, enable_statistics); column_properties.set_codec_options(codec_options); - std::shared_ptr> writer = this->BuildWriter( - num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum); + std::shared_ptr> writer = + this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0, + ParquetDataPageVersion::V1, enable_checksum); writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); // The behaviour should be independent from the number of Close() calls writer->Close(); @@ -557,7 +558,7 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndBrotliCompression) { #endif -#ifdef ARROW_WITH_GZIP +#ifdef ARROW_WITH_ZLIB TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithGzipCompression) { this->TestRequiredWithSettings(Encoding::PLAIN, Compression::GZIP, false, false, LARGE_SIZE); diff --git a/cpp/src/parquet/file_deserialize_test.cc 
b/cpp/src/parquet/file_deserialize_test.cc index 4377e714a240b..6b3c7062fcc4a 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -91,7 +91,7 @@ static std::vector GetSupportedCodecTypes() { codec_types.push_back(Compression::BROTLI); #endif -#ifdef ARROW_WITH_GZIP +#ifdef ARROW_WITH_ZLIB codec_types.push_back(Compression::GZIP); #endif diff --git a/cpp/src/parquet/file_serialize_test.cc b/cpp/src/parquet/file_serialize_test.cc index 85bfd1c5147a8..62e1965418076 100644 --- a/cpp/src/parquet/file_serialize_test.cc +++ b/cpp/src/parquet/file_serialize_test.cc @@ -334,7 +334,7 @@ TYPED_TEST(TestSerialize, SmallFileBrotli) { } #endif -#ifdef ARROW_WITH_GZIP +#ifdef ARROW_WITH_ZLIB TYPED_TEST(TestSerialize, SmallFileGzip) { ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::GZIP)); } diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index d802166be66e8..a1883d335aa23 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -20,7 +20,6 @@ /** * File format description for the parquet file format */ - cpp_include "parquet/windows_compatibility.h" namespace cpp parquet.format namespace java org.apache.parquet.format @@ -193,6 +192,52 @@ enum FieldRepetitionType { REPEATED = 2; } +/** + * A structure for capturing metadata for estimating the unencoded, + * uncompressed size of data written. This is useful for readers to estimate + * how much memory is needed to reconstruct data in their memory model and for + * fine grained filter pushdown on nested structures (the histograms contained + * in this structure can help determine the number of nulls at a particular + * nesting level and maximum length of lists). + */ +struct SizeStatistics { + /** + * The number of physical bytes stored for BYTE_ARRAY data values assuming + * no encoding. This is exclusive of the bytes needed to store the length of + * each byte array. 
In other words, this field is equivalent to the `(size + * of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + * written)`. To determine unencoded sizes of other types readers can use + * schema information multiplied by the number of non-null and null values. + * The number of null/non-null values can be inferred from the histograms + * below. + * + * For example, if a column chunk is dictionary-encoded with dictionary + * ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + * then this value for that data page should be 7 (1 + 1 + 2 + 3). + * + * This field should only be set for types that use BYTE_ARRAY as their + * physical type. + */ + 1: optional i64 unencoded_byte_array_data_bytes; + /** + * When present, there is expected to be one element corresponding to each + * repetition (i.e. size=max repetition_level+1) where each element + * represents the number of times the repetition level was observed in the + * data. + * + * This field may be omitted if max_repetition_level is 0 without loss + * of information. + **/ + 2: optional list repetition_level_histogram; + /** + * Same as repetition_level_histogram except for definition levels. + * + * This field may be omitted if max_definition_level is 0 or 1 without + * loss of information. + **/ + 3: optional list definition_level_histogram; +} + /** * Statistics per row group and per page * All fields are optional. @@ -218,13 +263,23 @@ struct Statistics { /** count of distinct values occurring */ 4: optional i64 distinct_count; /** - * Min and max values for the column, determined by its ColumnOrder. + * Lower and upper bound values for the column, determined by its ColumnOrder. + * + * These may be the actual minimum and maximum values found on a page or column + * chunk, but can also be (more compact) values that do not exist on a page or + * column chunk. For example, instead of storing "Blart Versenwald III", a writer + * may set min_value="B", max_value="C". 
Such more compact values must still be + * valid values within the column's logical type. * * Values are encoded using PLAIN encoding, except that variable-length byte * arrays do not include a length prefix. */ 5: optional binary max_value; 6: optional binary min_value; + /** If true, max_value is the actual maximum value for a column */ + 7: optional bool is_max_value_exact; + /** If true, min_value is the actual minimum value for a column */ + 8: optional bool is_min_value_exact; } /** Empty structs to use as logical type annotations */ @@ -234,7 +289,7 @@ struct MapType {} // see LogicalTypes.md struct ListType {} // see LogicalTypes.md struct EnumType {} // allowed for BINARY, must be encoded with UTF-8 struct DateType {} // allowed for INT32 -struct Float16Type{} // allowed for FIXED[2], must encode raw FLOAT16 bytes +struct Float16Type {} // allowed for FIXED[2], must encoded raw FLOAT16 bytes /** * Logical type to annotate a column that is always null. @@ -248,6 +303,9 @@ struct NullType {} // allowed for any physical type, only null values stored /** * Decimal logical type annotation * + * Scale must be zero or a positive integer less than or equal to the precision. + * Precision must be a non-zero positive integer. + * * To maintain forward-compatibility in v1, implementations using this logical * type must also set scale and precision on the annotated SchemaElement. 
* @@ -530,7 +588,7 @@ struct DataPageHeader { /** Encoding used for repetition levels **/ 4: required Encoding repetition_level_encoding; - /** Optional statistics for the data in this page**/ + /** Optional statistics for the data in this page **/ 5: optional Statistics statistics; } @@ -572,19 +630,19 @@ struct DataPageHeaderV2 { // repetition levels and definition levels are always using RLE (without size in it) - /** length of the definition levels */ + /** Length of the definition levels */ 5: required i32 definition_levels_byte_length; - /** length of the repetition levels */ + /** Length of the repetition levels */ 6: required i32 repetition_levels_byte_length; - /** whether the values are compressed. + /** Whether the values are compressed. Which means the section of the page between definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) is compressed with the compression_codec. If missing it is considered compressed */ - 7: optional bool is_compressed = 1; + 7: optional bool is_compressed = true; - /** optional statistics for the data in this page **/ + /** Optional statistics for the data in this page **/ 8: optional Statistics statistics; } @@ -597,11 +655,11 @@ union BloomFilterAlgorithm { } /** Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash - * algorithm. It uses 64 bits version of xxHash. + * algorithm. It uses 64 bits version of xxHash. **/ struct XxHash {} -/** +/** * The hash function used in Bloom filter. This function takes the hash of a column value * using plain encoding. **/ @@ -757,6 +815,22 @@ struct ColumnMetaData { /** Byte offset from beginning of file to Bloom filter data. **/ 14: optional i64 bloom_filter_offset; + + /** Size of Bloom filter data including the serialized header, in bytes. + * Added in 2.10 so readers may not read this field from old files and + * it can be obtained after the BloomFilterHeader has been deserialized. 
+ * Writers should write this field so readers can read the bloom filter + * in a single I/O. + */ + 15: optional i32 bloom_filter_length; + + /** + * Optional statistics to help estimate total memory when converted to in-memory + * representations. The histograms contained in these statistics can + * also be useful in some cases for more fine-grained nullability/list length + * filter pushdown. + */ + 16: optional SizeStatistics size_statistics; } struct EncryptionWithFooterKey { @@ -765,7 +839,7 @@ struct EncryptionWithFooterKey { struct EncryptionWithColumnKey { /** Column path in schema **/ 1: required list path_in_schema - + /** Retrieval metadata of column encryption key **/ 2: optional binary key_metadata } @@ -804,7 +878,7 @@ struct ColumnChunk { /** Crypto metadata of encrypted columns **/ 8: optional ColumnCryptoMetaData crypto_metadata - + /** Encrypted column metadata for this chunk **/ 9: optional binary encrypted_column_metadata } @@ -897,7 +971,7 @@ union ColumnOrder { * - If the min is +0, the row group may contain -0 values as well. * - If the max is -0, the row group may contain +0 values as well. * - When looking for NaN values, min and max should be ignored. - * + * * When writing statistics the following rules should be followed: * - NaNs should not be written to min or max statistics fields. * - If the computed max value is zero (whether negative or positive), @@ -931,6 +1005,13 @@ struct OffsetIndex { * that page_locations[i].first_row_index < page_locations[i+1].first_row_index. */ 1: required list page_locations + /** + * Unencoded/uncompressed size for BYTE_ARRAY types. + * + * See documention for unencoded_byte_array_data_bytes in SizeStatistics for + * more details on this field. 
+ */ + 2: optional list unencoded_byte_array_data_bytes } /** @@ -970,6 +1051,25 @@ struct ColumnIndex { /** A list containing the number of null values for each page **/ 5: optional list null_counts + + /** + * Contains repetition level histograms for each page + * concatenated together. The repetition_level_histogram field on + * SizeStatistics contains more details. + * + * When present the length should always be (number of pages * + * (max_repetition_level + 1)) elements. + * + * Element 0 is the first element of the histogram for the first page. + * Element (max_repetition_level + 1) is the first element of the histogram + * for the second page. + **/ + 6: optional list repetition_level_histograms; + /** + * Same as repetition_level_histograms except for definitions levels. + **/ + 7: optional list definition_level_histograms; + } struct AesGcmV1 { @@ -978,7 +1078,7 @@ struct AesGcmV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix @@ -990,7 +1090,7 @@ struct AesGcmCtrV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 89b685a64c311..da467dac2f095 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 89b685a64c3117b3023d8684af1f41400841db71 +Subproject commit da467dac2f095b979af37bcf40fa0d1dee5ff652 diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 67b37e49c7dc5..1849bf11b7439 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ 
b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index 1eec449077479..62d5858fadeb2 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -16,7 +16,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs index a7ddb14af6a10..1bd4035d5b9da 100644 --- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs @@ -18,10 +18,11 @@ using System.Collections.Generic; using System.Runtime.CompilerServices; using Apache.Arrow.Memory; +using System.Collections; namespace Apache.Arrow { - public class BinaryArray : Array + public class BinaryArray : Array, IReadOnlyList { public class Builder : BuilderBase { @@ -366,5 +367,18 @@ public ReadOnlySpan GetBytes(int index, out bool isNull) return ValueBuffer.Span.Slice(ValueOffsets[index], GetValueLength(index)); } + + int IReadOnlyCollection.Count => Length; + byte[] IReadOnlyList.this[int index] => GetBytes(index).ToArray(); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetBytes(index).ToArray(); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs index 0915338fe6a91..e9c5f8979e48f 100644 --- a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs @@ -16,11 +16,12 @@ using Apache.Arrow.Memory; using Apache.Arrow.Types; using System; +using System.Collections; using System.Collections.Generic; namespace Apache.Arrow { - public class BooleanArray: Array + public class BooleanArray: Array, IReadOnlyList { public class Builder : IArrowArrayBuilder { @@ -190,5 +191,19 @@ public bool 
GetBoolean(int index) ? (bool?)null : BitUtility.GetBit(ValueBuffer.Span, index + Offset); } + + int IReadOnlyCollection.Count => Length; + + bool? IReadOnlyList.this[int index] => GetValue(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetValue(index); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs index 23ad7356eb322..6ab4986f573e2 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs @@ -15,6 +15,7 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; namespace Apache.Arrow { @@ -22,7 +23,10 @@ namespace Apache.Arrow /// The class holds an array of dates in the Date32 format, where each date is /// stored as the number of days since the dawn of (UNIX) time. /// - public class Date32Array : PrimitiveArray + public class Date32Array : PrimitiveArray, IReadOnlyList +#if NET6_0_OR_GREATER + , IReadOnlyList +#endif { private static readonly DateTime _epochDate = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Unspecified); #if NET6_0_OR_GREATER @@ -133,6 +137,30 @@ public Date32Array(ArrayData data) ? DateOnly.FromDayNumber(_epochDayNumber + value.Value) : default(DateOnly?); } + + int IReadOnlyCollection.Count => Length; + + DateOnly? IReadOnlyList.this[int index] => GetDateOnly(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetDateOnly(index); + }; + } #endif + + int IReadOnlyCollection.Count => Length; + + DateTime? 
IReadOnlyList.this[int index] => GetDateTime(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetDateTime(index); + }; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs index b0d42e27bbd23..43e698e10b25c 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs @@ -15,6 +15,7 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; namespace Apache.Arrow { @@ -23,7 +24,10 @@ namespace Apache.Arrow /// stored as the number of milliseconds since the dawn of (UNIX) time, excluding leap seconds, in multiples of /// 86400000. /// - public class Date64Array: PrimitiveArray + public class Date64Array : PrimitiveArray, IReadOnlyList +#if NET6_0_OR_GREATER + , IReadOnlyList +#endif { private const long MillisecondsPerDay = 86400000; @@ -39,7 +43,7 @@ public Date64Array( /// public class Builder : DateArrayBuilder { - private class DateBuilder: PrimitiveArrayBuilder + private class DateBuilder : PrimitiveArrayBuilder { protected override Date64Array Build( ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, @@ -135,6 +139,30 @@ public Date64Array(ArrayData data) ? DateOnly.FromDateTime(DateTimeOffset.FromUnixTimeMilliseconds(value.Value).UtcDateTime) : default(DateOnly?); } + + int IReadOnlyCollection.Count => Length; + + DateOnly? IReadOnlyList.this[int index] => GetDateOnly(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetDateOnly(index); + }; + } #endif + + int IReadOnlyCollection.Count => Length; + + DateTime? 
IReadOnlyList.this[int index] => GetDateTime(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetDateTime(index); + }; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs index 7365a77b6329e..0456c5cc65ba4 100644 --- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs @@ -14,12 +14,13 @@ // limitations under the License. using System; +using System.Collections; using System.Collections.Generic; using System.Runtime.CompilerServices; namespace Apache.Arrow { - public abstract class PrimitiveArray : Array + public abstract class PrimitiveArray : Array, IReadOnlyList where T : struct { protected PrimitiveArray(ArrayData data) @@ -66,5 +67,24 @@ protected PrimitiveArray(ArrayData data) return list; } + + int IReadOnlyCollection.Count => Length; + T? IReadOnlyList.this[int index] => GetValue(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return IsValid(index) ? Values[index] : null; + } + } + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return IsValid(index) ? 
Values[index] : null; + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs b/csharp/src/Apache.Arrow/Arrays/StringArray.cs index 42104b27175a9..af77fe1b1a83d 100644 --- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs @@ -15,13 +15,14 @@ using Apache.Arrow.Types; using System; +using System.Collections; using System.Collections.Generic; using System.Runtime.InteropServices; using System.Text; namespace Apache.Arrow { - public class StringArray: BinaryArray + public class StringArray: BinaryArray, IReadOnlyList { public static readonly Encoding DefaultEncoding = Encoding.UTF8; @@ -91,5 +92,19 @@ public string GetString(int index, Encoding encoding = default) return encoding.GetString(data, bytes.Length); } } + + int IReadOnlyCollection.Count => Length; + + string IReadOnlyList.this[int index] => GetString(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetString(index); + }; + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/StructArray.cs b/csharp/src/Apache.Arrow/Arrays/StructArray.cs index 11d40e6d4e886..5b827c7b85e85 100644 --- a/csharp/src/Apache.Arrow/Arrays/StructArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StructArray.cs @@ -72,11 +72,11 @@ private IReadOnlyList InitializeFields() IRecordType IArrowRecord.Schema => (StructType)Data.DataType; - int IArrowRecord.ColumnCount => _fields.Count; + int IArrowRecord.ColumnCount => Fields.Count; IArrowArray IArrowRecord.Column(string columnName, IEqualityComparer comparer) => - _fields[((StructType)Data.DataType).GetFieldIndex(columnName, comparer)]; + Fields[((StructType)Data.DataType).GetFieldIndex(columnName, comparer)]; - IArrowArray IArrowRecord.Column(int columnIndex) => _fields[columnIndex]; + IArrowArray IArrowRecord.Column(int columnIndex) => Fields[columnIndex]; } } diff --git 
a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs index 824694cd6d04b..e9c2d7a4d9b28 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs @@ -15,6 +15,7 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; using System.IO; namespace Apache.Arrow @@ -24,6 +25,9 @@ namespace Apache.Arrow /// stored as the number of seconds/ milliseconds (depending on the Time32Type) since midnight. /// public class Time32Array : PrimitiveArray +#if NET6_0_OR_GREATER + , IReadOnlyList +#endif { /// /// The class can be used to fluently build objects. @@ -155,6 +159,18 @@ public Time32Array(ArrayData data) _ => throw new InvalidDataException($"Unsupported time unit for Time32Type: {unit}") }; } + + int IReadOnlyCollection.Count => Length; + + TimeOnly? IReadOnlyList.this[int index] => GetTime(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetTime(index); + }; + } #endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs index 3369893304414..fc18dfb8bf726 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs @@ -15,6 +15,7 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; using System.IO; namespace Apache.Arrow @@ -24,6 +25,9 @@ namespace Apache.Arrow /// stored as the number of microseconds/nanoseconds (depending on the Time64Type) since midnight. /// public class Time64Array : PrimitiveArray +#if NET6_0_OR_GREATER + , IReadOnlyList +#endif { /// /// The class can be used to fluently build objects. @@ -146,6 +150,18 @@ public Time64Array(ArrayData data) return new TimeOnly(((Time64Type)Data.DataType).Unit.ConvertToTicks(value.Value)); } + + int IReadOnlyCollection.Count => Length; + + TimeOnly? 
IReadOnlyList.this[int index] => GetTime(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetTime(index); + }; + } #endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs index 0dc5726d01734..ccb656854a5df 100644 --- a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs @@ -15,12 +15,13 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; using System.Diagnostics; using System.IO; namespace Apache.Arrow { - public class TimestampArray: PrimitiveArray + public class TimestampArray : PrimitiveArray, IReadOnlyList { private static readonly DateTimeOffset s_epoch = new DateTimeOffset(1970, 1, 1, 0, 0, 0, 0, TimeSpan.Zero); @@ -145,5 +146,16 @@ public DateTimeOffset GetTimestampUnchecked(int index) return GetTimestampUnchecked(index); } + int IReadOnlyCollection.Count => Length; + + DateTimeOffset? 
IReadOnlyList.this[int index] => GetTimestamp(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetTimestamp(index); + }; + } } } diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 475d7ccc3ef28..c222dc0bca08b 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,8 +8,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 656ee6a2470e4..0de93b470a201 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 53fdd6d62dbcb..c227abbed4c5d 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 66becb84c5b66..5b36e369b1961 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -15,8 +15,8 @@ - - + + all runtime; build; native; contentfiles; analyzers diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index 96918ff091639..269c2390a70fa 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ 
b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -14,6 +14,8 @@ // limitations under the License. using System; +using System.Collections; +using System.Collections.Generic; using System.Numerics; using Xunit; @@ -93,6 +95,33 @@ void TestIsValid(ArrowBuffer valueBuf, ArrowBuffer nullBitmapBuf, int length, in } } + [Fact] + public void EnumerateArray() + { + var array = new Int64Array.Builder().Append(1).Append(2).Build(); + + foreach(long? foo in (IEnumerable)array) + { + Assert.InRange(foo.Value, 1, 2); + } + + foreach (object foo in (IEnumerable)array) + { + Assert.InRange((long)foo, 1, 2); + } + } + + [Fact] + public void ArrayAsReadOnlyList() + { + Int64Array array = new Int64Array.Builder().Append(1).Append(2).Build(); + var readOnlyList = (IReadOnlyList)array; + + Assert.Equal(array.Length, readOnlyList.Count); + Assert.Equal(readOnlyList[0], 1); + Assert.Equal(readOnlyList[1], 2); + } + #if NET5_0_OR_GREATER [Fact] public void SliceArray() diff --git a/csharp/test/Apache.Arrow.Tests/RecordTests.cs b/csharp/test/Apache.Arrow.Tests/RecordTests.cs index 09b0d2c6655ba..cfca4556b63a6 100644 --- a/csharp/test/Apache.Arrow.Tests/RecordTests.cs +++ b/csharp/test/Apache.Arrow.Tests/RecordTests.cs @@ -74,7 +74,25 @@ public void VisitStructAndBatch() StructArray level2Array = new StructArray(level2, stringArray.Length, new[] { level1Array }, nulls); RecordBatch batch = new RecordBatch(schema, new IArrowArray[] { level2Array }, stringArray.Length); + var visitor3 = new TestArrayVisitor1(); + visitor3.Visit(batch); + Assert.Equal("111utf8", visitor3.stringBuilder.ToString()); + var visitor4 = new TestArrayVisitor2(); + visitor4.Visit(batch); + Assert.Equal("322utf8", visitor4.stringBuilder.ToString()); + } + + [Fact] + public void LazyStructInitialization() + { + StringArray stringArray = new StringArray.Builder().Append("one").AppendNull().AppendNull().Append("four").Build(); + Field stringField = new Field("column1", StringType.Default, true); + StructType 
structType = new StructType(new[] { stringField }); + ArrayData structData = new ArrayData(structType, stringArray.Length, 0, 0, new[] { ArrowBuffer.Empty }, new[] { stringArray.Data }); + IArrowRecord structArray = new StructArray(structData); + Assert.Equal(1, structArray.ColumnCount); + Assert.Equal(structArray.Length, structArray.Column(0).Length); } private class TestTypeVisitor1 : IArrowTypeVisitor, IArrowTypeVisitor diff --git a/dev/archery/archery/integration/cdata.py b/dev/archery/archery/integration/cdata.py index 8e5550fcdb9c5..a5dbbe29d8aba 100644 --- a/dev/archery/archery/integration/cdata.py +++ b/dev/archery/archery/integration/cdata.py @@ -18,10 +18,19 @@ import cffi from contextlib import contextmanager import functools +import os +import sys from .tester import CDataExporter, CDataImporter +if sys.platform == "darwin": + dll_suffix = ".dylib" +elif os.name == "nt": + dll_suffix = ".dll" +else: + dll_suffix = ".so" + _c_data_decls = """ struct ArrowSchema { // Array type description diff --git a/dev/archery/archery/integration/tester_cpp.py b/dev/archery/archery/integration/tester_cpp.py index 658e71330155e..2a47bc830886a 100644 --- a/dev/archery/archery/integration/tester_cpp.py +++ b/dev/archery/archery/integration/tester_cpp.py @@ -18,7 +18,6 @@ import contextlib import functools import os -import sys import subprocess from . 
import cdata @@ -42,15 +41,8 @@ "localhost", ] -if sys.platform == "darwin": - _dll_suffix = ".dylib" -elif os.name == "nt": - _dll_suffix = ".dll" -else: - _dll_suffix = ".so" - _DLL_PATH = _EXE_PATH -_ARROW_DLL = os.path.join(_DLL_PATH, "libarrow" + _dll_suffix) +_ARROW_DLL = os.path.join(_DLL_PATH, "libarrow" + cdata.dll_suffix) class CppTester(Tester): @@ -175,6 +167,7 @@ def make_c_data_importer(self): @functools.lru_cache def _load_ffi(ffi, lib_path=_ARROW_DLL): + os.environ['ARROW_DEBUG_MEMORY_POOL'] = 'trap' ffi.cdef(_cpp_c_data_entrypoints) dll = ffi.dlopen(lib_path) dll.ArrowCpp_CDataIntegration_ExportSchemaFromJson diff --git a/dev/archery/archery/integration/tester_csharp.py b/dev/archery/archery/integration/tester_csharp.py index 7dca525673ba6..4f7765641130d 100644 --- a/dev/archery/archery/integration/tester_csharp.py +++ b/dev/archery/archery/integration/tester_csharp.py @@ -38,6 +38,7 @@ def _load_clr(): global _clr_loaded if not _clr_loaded: _clr_loaded = True + os.environ['DOTNET_GCHeapHardLimit'] = '0xC800000' # 200 MiB import pythonnet pythonnet.load("coreclr") import clr diff --git a/dev/archery/archery/integration/tester_go.py b/dev/archery/archery/integration/tester_go.py index 2b3dc3a1be336..b59cd9d113291 100644 --- a/dev/archery/archery/integration/tester_go.py +++ b/dev/archery/archery/integration/tester_go.py @@ -18,7 +18,6 @@ import contextlib import functools import os -import sys import subprocess from . 
import cdata @@ -43,17 +42,10 @@ "localhost", ] -if sys.platform == "darwin": - _dll_suffix = ".dylib" -elif os.name == "nt": - _dll_suffix = ".dll" -else: - _dll_suffix = ".so" - _DLL_PATH = os.path.join( ARROW_ROOT_DEFAULT, "go/arrow/internal/cdata_integration") -_INTEGRATION_DLL = os.path.join(_DLL_PATH, "arrow_go_integration" + _dll_suffix) +_INTEGRATION_DLL = os.path.join(_DLL_PATH, "arrow_go_integration" + cdata.dll_suffix) class GoTester(Tester): @@ -167,6 +159,9 @@ def make_c_data_importer(self): @functools.lru_cache def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + # NOTE that setting Go environment variables here (such as GODEBUG) + # would be ignored by the Go runtime. The environment variables need + # to be set from the process calling Archery. ffi.cdef(_go_c_data_entrypoints) dll = ffi.dlopen(lib_path) return dll diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index 5684798d794ad..d71479986c1da 100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -34,11 +34,13 @@ def load_version_from_pom(): return version_tag.text -# XXX Should we add "-Darrow.memory.debug.allocator=true"? It adds a couple -# minutes to total CPU usage of the integration test suite. +# NOTE: we don't add "-Darrow.memory.debug.allocator=true" here as it adds a +# couple minutes to total CPU usage of the integration test suite +# (see setup_jpype() below). 
_JAVA_OPTS = [ "-Dio.netty.tryReflectionSetAccessible=true", "-Darrow.struct.conflict.policy=CONFLICT_APPEND", + "--add-opens=java.base/java.nio=ALL-UNNAMED", ] _arrow_version = load_version_from_pom() @@ -80,7 +82,12 @@ def setup_jpype(): jar_path = f"{_ARROW_TOOLS_JAR}:{_ARROW_C_DATA_JAR}" # XXX Didn't manage to tone down the logging level here (DEBUG -> INFO) jpype.startJVM(jpype.getDefaultJVMPath(), - "-Djava.class.path=" + jar_path, *_JAVA_OPTS) + "-Djava.class.path=" + jar_path, + # This flag is too heavy for IPC and Flight tests + "-Darrow.memory.debug.allocator=true", + # Reduce internal use of signals by the JVM + "-Xrs", + *_JAVA_OPTS) class _CDataBase: diff --git a/dev/archery/archery/integration/tester_rust.py b/dev/archery/archery/integration/tester_rust.py index c7a94de2197bd..56b07859dc82a 100644 --- a/dev/archery/archery/integration/tester_rust.py +++ b/dev/archery/archery/integration/tester_rust.py @@ -16,15 +16,19 @@ # under the License. import contextlib +import functools import os import subprocess -from .tester import Tester +from . 
import cdata +from .tester import Tester, CDataExporter, CDataImporter from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT -_EXE_PATH = os.path.join(ARROW_ROOT_DEFAULT, "rust/target/debug") +_EXE_PATH = os.environ.get( + "ARROW_RUST_EXE_PATH", os.path.join(ARROW_ROOT_DEFAULT, "rust/target/debug") +) _INTEGRATION_EXE = os.path.join(_EXE_PATH, "arrow-json-integration-test") _STREAM_TO_FILE = os.path.join(_EXE_PATH, "arrow-stream-to-file") _FILE_TO_STREAM = os.path.join(_EXE_PATH, "arrow-file-to-stream") @@ -37,12 +41,19 @@ "localhost", ] +_INTEGRATION_DLL = os.path.join(_EXE_PATH, + "libarrow_integration_testing" + cdata.dll_suffix) + class RustTester(Tester): PRODUCER = True CONSUMER = True FLIGHT_SERVER = True FLIGHT_CLIENT = True + C_DATA_SCHEMA_EXPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_IMPORTER = True name = 'Rust' @@ -117,3 +128,102 @@ def flight_request(self, port, json_path=None, scenario_name=None): if self.debug: log(' '.join(cmd)) run_cmd(cmd) + + def make_c_data_exporter(self): + return RustCDataExporter(self.debug, self.args) + + def make_c_data_importer(self): + return RustCDataImporter(self.debug, self.args) + + +_rust_c_data_entrypoints = """ + const char* arrow_rs_cdata_integration_export_schema_from_json( + const char* json_path, uintptr_t out); + const char* arrow_rs_cdata_integration_import_schema_and_compare_to_json( + const char* json_path, uintptr_t c_schema); + + const char* arrow_rs_cdata_integration_export_batch_from_json( + const char* json_path, int num_batch, uintptr_t out); + const char* arrow_rs_cdata_integration_import_batch_and_compare_to_json( + const char* json_path, int num_batch, uintptr_t c_array); + + void arrow_rs_free_error(const char*); + """ + + +@functools.lru_cache +def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + ffi.cdef(_rust_c_data_entrypoints) + dll = ffi.dlopen(lib_path) + return dll + + +class _CDataBase: + + def __init__(self, debug, args): 
+ self.debug = debug + self.args = args + self.ffi = cdata.ffi() + self.dll = _load_ffi(self.ffi) + + def _pointer_to_int(self, c_ptr): + return self.ffi.cast('uintptr_t', c_ptr) + + def _check_rust_error(self, rs_error): + """ + Check a `const char*` error return from an integration entrypoint. + + A null means success, a non-empty string is an error message. + The string is dynamically allocated on the Rust side. + """ + assert self.ffi.typeof(rs_error) is self.ffi.typeof("const char*") + if rs_error != self.ffi.NULL: + try: + error = self.ffi.string(rs_error).decode( + 'utf8', errors='replace') + raise RuntimeError( + f"Rust C Data Integration call failed: {error}") + finally: + self.dll.arrow_rs_free_error(rs_error) + + +class RustCDataExporter(CDataExporter, _CDataBase): + + def export_schema_from_json(self, json_path, c_schema_ptr): + rs_error = self.dll.arrow_rs_cdata_integration_export_schema_from_json( + str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + self._check_rust_error(rs_error) + + def export_batch_from_json(self, json_path, num_batch, c_array_ptr): + rs_error = self.dll.arrow_rs_cdata_integration_export_batch_from_json( + str(json_path).encode(), num_batch, + self._pointer_to_int(c_array_ptr)) + self._check_rust_error(rs_error) + + @property + def supports_releasing_memory(self): + return True + + def record_allocation_state(self): + # FIXME we should track the amount of Rust-allocated memory (GH-38822) + return 0 + + +class RustCDataImporter(CDataImporter, _CDataBase): + + def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): + rs_error = \ + self.dll.arrow_rs_cdata_integration_import_schema_and_compare_to_json( + str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + self._check_rust_error(rs_error) + + def import_batch_and_compare_to_json(self, json_path, num_batch, + c_array_ptr): + rs_error = \ + self.dll.arrow_rs_cdata_integration_import_batch_and_compare_to_json( + str(json_path).encode(), num_batch, 
self._pointer_to_int(c_array_ptr)) + self._check_rust_error(rs_error) + + @property + def supports_releasing_memory(self): + return True diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 15fac25d26d65..fe902071bb952 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -77,6 +77,9 @@ groups: c-glib: - test-*c-glib* + java: + - "*java*" + python: - test-*python* diff --git a/docker-compose.yml b/docker-compose.yml index e2c993ee9ea41..74f2c262aa5ff 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1706,7 +1706,8 @@ services: args: repo: ${REPO} arch: ${ARCH} - jdk: ${JDK} + # Use a newer JDK as it seems to improve stability + jdk: 17 # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should # be set to ${MAVEN} maven: 3.5 @@ -1716,8 +1717,9 @@ services: environment: <<: [*common, *ccache] ARCHERY_INTEGRATION_WITH_RUST: 0 - # Tell Archery where the arrow C++ binaries are located + # Tell Archery where Arrow binaries are located ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_RUST_EXE_PATH: /build/rust/debug command: ["/arrow/ci/scripts/integration_arrow_build.sh /arrow /build && /arrow/ci/scripts/integration_arrow.sh /arrow /build"] diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 8b2a504631fdb..f9f44d5e97f89 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -32,9 +32,12 @@ Arrow Java uses the `Maven `_ build system. Building requires: -* JDK 8, 9, 10, 11, 17, or 18, but only JDK 8, 11 and 17 are tested in CI. +* JDK 8+ * Maven 3+ +.. note:: + CI will test all supported JDK LTS versions, plus the latest non-LTS version. 
+ Building ======== diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index e0884686acf6c..e2022171214b7 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -207,6 +207,10 @@ names and types of child fields are read from the child arrays. +------------------------+---------------------------------------------------+------------+ | ``+L`` | large list | | +------------------------+---------------------------------------------------+------------+ +| ``+lv`` | list-view | | ++------------------------+---------------------------------------------------+------------+ +| ``+Lv`` | large list-view | | ++------------------------+---------------------------------------------------+------------+ | ``+w:123`` | fixed-sized list [123 items] | | +------------------------+---------------------------------------------------+------------+ | ``+s`` | struct | | @@ -243,6 +247,8 @@ Examples array has format string ``d:12,5``. * A ``list`` array has format string ``+l``, and its single child has format string ``L``. +* A ``large_list_view`` array has format string ``+Lv``, and its single + child has format string ``L``. * A ``struct`` has format string ``+s``; its two children have names ``ints`` and ``floats``, and format strings ``i`` and ``f`` respectively. 
diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/FixedSizeListAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/FixedSizeListAvaticaParameterConverter.java index 60231a2460286..1525bcaaf51b1 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/FixedSizeListAvaticaParameterConverter.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/FixedSizeListAvaticaParameterConverter.java @@ -17,7 +17,11 @@ package org.apache.arrow.driver.jdbc.converter.impl; +import java.util.List; + +import org.apache.arrow.driver.jdbc.utils.AvaticaParameterBinder; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.calcite.avatica.AvaticaParameter; @@ -33,6 +37,41 @@ public FixedSizeListAvaticaParameterConverter(ArrowType.FixedSizeList type) { @Override public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + final List values = (List) typedValue.value; + final int arraySize = values.size(); + + if (vector instanceof FixedSizeListVector) { + FixedSizeListVector listVector = ((FixedSizeListVector) vector); + FieldVector childVector = listVector.getDataVector(); + int maxArraySize = listVector.getListSize(); + + if (arraySize != maxArraySize) { + if (!childVector.getField().isNullable()) { + throw new UnsupportedOperationException("Each array must contain " + maxArraySize + " elements"); + } else if (arraySize > maxArraySize) { + throw new UnsupportedOperationException("Each array must contain at most " + maxArraySize + " elements"); + } + } + + int startPos = listVector.startNewValue(index); + for (int i = 0; i < arraySize; i++) { + Object val = values.get(i); + int 
childIndex = startPos + i; + if (val == null) { + if (childVector.getField().isNullable()) { + childVector.setNull(childIndex); + } else { + throw new UnsupportedOperationException("Can't set null on non-nullable child list"); + } + } else { + childVector.getField().getType().accept( + new AvaticaParameterBinder.BinderVisitor( + childVector, TypedValue.ofSerial(typedValue.componentType, val), childIndex)); + } + } + listVector.setValueCount(index + 1); + return true; + } return false; } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/LargeListAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/LargeListAvaticaParameterConverter.java index 6ef6920474860..a20747693e35a 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/LargeListAvaticaParameterConverter.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/LargeListAvaticaParameterConverter.java @@ -17,7 +17,12 @@ package org.apache.arrow.driver.jdbc.converter.impl; +import java.util.List; + +import org.apache.arrow.driver.jdbc.utils.AvaticaParameterBinder; +import org.apache.arrow.memory.util.LargeMemoryUtil; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.calcite.avatica.AvaticaParameter; @@ -33,6 +38,32 @@ public LargeListAvaticaParameterConverter(ArrowType.LargeList type) { @Override public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + final List values = (List) typedValue.value; + + if (vector instanceof LargeListVector) { + LargeListVector listVector = ((LargeListVector) vector); + FieldVector childVector = listVector.getDataVector(); + + long startPos = 
listVector.startNewValue(index); + for (int i = 0; i < values.size(); i++) { + Object val = values.get(i); + int childIndex = LargeMemoryUtil.checkedCastToInt(startPos) + i; + if (val == null) { + if (childVector.getField().isNullable()) { + childVector.setNull(childIndex); + } else { + throw new UnsupportedOperationException("Can't set null on non-nullable child list"); + } + } else { + childVector.getField().getType().accept( + new AvaticaParameterBinder.BinderVisitor( + childVector, TypedValue.ofSerial(typedValue.componentType, val), childIndex)); + } + } + listVector.endValue(index, values.size()); + listVector.setValueCount(index + 1); + return true; + } return false; } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/ListAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/ListAvaticaParameterConverter.java index aec59cb4d428e..f6cb9f3be2a4c 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/ListAvaticaParameterConverter.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/ListAvaticaParameterConverter.java @@ -17,7 +17,11 @@ package org.apache.arrow.driver.jdbc.converter.impl; +import java.util.List; + +import org.apache.arrow.driver.jdbc.utils.AvaticaParameterBinder; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.calcite.avatica.AvaticaParameter; @@ -33,6 +37,32 @@ public ListAvaticaParameterConverter(ArrowType.List type) { @Override public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + final List values = (List) typedValue.value; + + if (vector instanceof ListVector) { + ListVector listVector = ((ListVector) vector); + FieldVector 
childVector = listVector.getDataVector(); + + int startPos = listVector.startNewValue(index); + for (int i = 0; i < values.size(); i++) { + Object val = values.get(i); + int childIndex = startPos + i; + if (val == null) { + if (childVector.getField().isNullable()) { + childVector.setNull(childIndex); + } else { + throw new UnsupportedOperationException("Can't set null on non-nullable child list"); + } + } else { + childVector.getField().getType().accept( + new AvaticaParameterBinder.BinderVisitor( + childVector, TypedValue.ofSerial(typedValue.componentType, val), childIndex)); + } + } + listVector.endValue(index, values.size()); + listVector.setValueCount(index + 1); + return true; + } return false; } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java index 9e805fc79bcf8..5fa3ba38f2506 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java @@ -119,12 +119,22 @@ private void bind(FieldVector vector, TypedValue typedValue, int index) { } } - private static class BinderVisitor implements ArrowType.ArrowTypeVisitor { + /** + * ArrowTypeVisitor that binds Avatica TypedValues to the given FieldVector at the specified index. + */ + public static class BinderVisitor implements ArrowType.ArrowTypeVisitor { private final FieldVector vector; private final TypedValue typedValue; private final int index; - private BinderVisitor(FieldVector vector, TypedValue value, int index) { + /** + * Instantiate a new BinderVisitor. + * + * @param vector FieldVector to bind values to. + * @param value TypedValue to bind. + * @param index Vector index (0-based) to bind the value to. 
+ */ + public BinderVisitor(FieldVector vector, TypedValue value, int index) { this.vector = vector; this.typedValue = value; this.index = index; diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightPreparedStatementTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightPreparedStatementTest.java index b19f049544ada..0b521a704bb6a 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightPreparedStatementTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightPreparedStatementTest.java @@ -27,6 +27,7 @@ import java.sql.SQLException; import java.util.Arrays; import java.util.Collections; +import java.util.List; import org.apache.arrow.driver.jdbc.utils.CoreMockedSqlProducers; import org.apache.arrow.driver.jdbc.utils.MockFlightSqlProducer; @@ -38,6 +39,7 @@ import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.Text; import org.junit.AfterClass; @@ -89,6 +91,14 @@ public void testSimpleQueryNoParameterBinding() throws SQLException { public void testQueryWithParameterBinding() throws SQLException { final String query = "Fake query with parameters"; final Schema schema = new Schema(Collections.singletonList(Field.nullable("", Types.MinorType.INT.getType()))); + final Schema parameterSchema = new Schema(Arrays.asList( + Field.nullable("", ArrowType.Utf8.INSTANCE), + new Field("", FieldType.nullable(ArrowType.List.INSTANCE), + Collections.singletonList(Field.nullable("", Types.MinorType.INT.getType()))))); + final List> expected = Collections.singletonList(Arrays.asList( + new Text("foo"), + new Integer[]{1, 2, null})); + PRODUCER.addSelectQuery(query, schema, 
Collections.singletonList(listener -> { try (final BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); @@ -105,11 +115,12 @@ public void testQueryWithParameterBinding() throws SQLException { } })); - PRODUCER.addExpectedParameters(query, - new Schema(Collections.singletonList(Field.nullable("", ArrowType.Utf8.INSTANCE))), - Collections.singletonList(Collections.singletonList(new Text("foo".getBytes(StandardCharsets.UTF_8))))); + PRODUCER.addExpectedParameters(query, parameterSchema, expected); + try (final PreparedStatement preparedStatement = connection.prepareStatement(query)) { preparedStatement.setString(1, "foo"); + preparedStatement.setArray(2, connection.createArrayOf("INTEGER", new Integer[]{1, 2, null})); + try (final ResultSet resultSet = preparedStatement.executeQuery()) { resultSet.next(); assert true; @@ -171,17 +182,29 @@ public void testUpdateQueryWithParameters() throws SQLException { @Test public void testUpdateQueryWithBatchedParameters() throws SQLException { String query = "Fake update with batched parameters"; - PRODUCER.addUpdateQuery(query, /*updatedRows*/42); - PRODUCER.addExpectedParameters(query, - new Schema(Collections.singletonList(Field.nullable("", ArrowType.Utf8.INSTANCE))), + Schema parameterSchema = new Schema(Arrays.asList( + Field.nullable("", ArrowType.Utf8.INSTANCE), + new Field("", FieldType.nullable(ArrowType.List.INSTANCE), + Collections.singletonList(Field.nullable("", Types.MinorType.INT.getType()))))); + List> expected = Arrays.asList( + Arrays.asList( + new Text("foo"), + new Integer[]{1, 2, null}), Arrays.asList( - Collections.singletonList(new Text("foo".getBytes(StandardCharsets.UTF_8))), - Collections.singletonList(new Text("bar".getBytes(StandardCharsets.UTF_8))))); + new Text("bar"), + new Integer[]{0, -1, 100000}) + ); + + PRODUCER.addUpdateQuery(query, /*updatedRows*/42); + PRODUCER.addExpectedParameters(query, parameterSchema, expected); + try (final PreparedStatement stmt = 
connection.prepareStatement(query)) { // TODO: make sure this is validated on the server too stmt.setString(1, "foo"); + stmt.setArray(2, connection.createArrayOf("INTEGER", new Integer[]{1, 2, null})); stmt.addBatch(); stmt.setString(1, "bar"); + stmt.setArray(2, connection.createArrayOf("INTEGER", new Integer[]{0, -1, 100000})); stmt.addBatch(); int[] updated = stmt.executeBatch(); assertEquals(42, updated[0]); diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java index 2b65f8f5a07ba..eaba008fbfe77 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java @@ -29,6 +29,7 @@ import java.nio.channels.Channels; import java.nio.charset.StandardCharsets; import java.util.AbstractMap.SimpleImmutableEntry; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -80,6 +81,7 @@ import org.apache.arrow.vector.ipc.WriteChannel; import org.apache.arrow.vector.ipc.message.MessageSerializer; import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.JsonStringArrayList; import org.apache.calcite.avatica.Meta.StatementType; import com.google.protobuf.Any; @@ -373,7 +375,13 @@ private boolean validateParameters(String query, for (int paramIndex = 0; paramIndex < expectedRow.size(); paramIndex++) { Object expected = expectedRow.get(paramIndex); Object actual = root.getVector(paramIndex).getObject(i); - if (!Objects.equals(expected, actual)) { + boolean matches; + if (expected.getClass().isArray()) { + matches = Arrays.equals((Object[]) expected, ((JsonStringArrayList) actual).toArray()); + } else { + matches = Objects.equals(expected, actual); + } + 
if (!matches) { streamListener.onError(CallStatus.INVALID_ARGUMENT .withDescription(String.format("Parameter mismatch. Expected: %s Actual: %s", expected, actual)) .toRuntimeException()); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java index b83adf9271d4b..cc615c5b38321 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java @@ -141,8 +141,8 @@ public Object run() { // This exception will get swallowed, but it's necessary for the static analysis that ensures // the static fields above get initialized final RuntimeException failure = new RuntimeException( - "Failed to initialize MemoryUtil. Was Java started with " + - "`--add-opens=java.base/java.nio=ALL-UNNAMED`? " + + "Failed to initialize MemoryUtil. You must start Java with " + + "`--add-opens=java.base/java.nio=ALL-UNNAMED` " + "(See https://arrow.apache.org/docs/java/install.html)", e); failure.printStackTrace(); throw failure; diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 31aa058706a87..bd4151624dc34 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -606,6 +606,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): write_batch_size=self._properties["write_batch_size"], dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"], write_page_index=self._properties["write_page_index"], + write_page_checksum=self._properties["write_page_checksum"], ) def _set_arrow_properties(self): @@ -655,6 +656,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): dictionary_pagesize_limit=None, write_page_index=False, encryption_config=None, + write_page_checksum=False, ) self._set_properties() @@ -701,6 +703,8 @@ cdef class 
ParquetFragmentScanOptions(FragmentScanOptions): decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None If not None, use the provided ParquetDecryptionConfig to decrypt the Parquet file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. """ # Avoid mistakingly creating attributes @@ -711,7 +715,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): bint pre_buffer=True, thrift_string_size_limit=None, thrift_container_size_limit=None, - decryption_config=None): + decryption_config=None, + bint page_checksum_verification=False): self.init(shared_ptr[CFragmentScanOptions]( new CParquetFragmentScanOptions())) self.use_buffered_stream = use_buffered_stream @@ -723,6 +728,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): self.thrift_container_size_limit = thrift_container_size_limit if decryption_config is not None: self.parquet_decryption_config = decryption_config + self.page_checksum_verification = page_checksum_verification cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): FragmentScanOptions.init(self, sp) @@ -802,6 +808,14 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): set_decryption_config(self, config) self._parquet_decryption_config = config + @property + def page_checksum_verification(self): + return self.reader_properties().page_checksum_verification() + + @page_checksum_verification.setter + def page_checksum_verification(self, bint page_checksum_verification): + self.reader_properties().set_page_checksum_verification(page_checksum_verification) + def equals(self, ParquetFragmentScanOptions other): """ Parameters @@ -814,11 +828,12 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): """ attrs = ( self.use_buffered_stream, self.buffer_size, self.pre_buffer, - self.thrift_string_size_limit, self.thrift_container_size_limit) + self.thrift_string_size_limit, self.thrift_container_size_limit, + 
self.page_checksum_verification) other_attrs = ( other.use_buffered_stream, other.buffer_size, other.pre_buffer, other.thrift_string_size_limit, - other.thrift_container_size_limit) + other.thrift_container_size_limit, other.page_checksum_verification) return attrs == other_attrs @staticmethod @@ -835,6 +850,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): pre_buffer=self.pre_buffer, thrift_string_size_limit=self.thrift_string_size_limit, thrift_container_size_limit=self.thrift_container_size_limit, + page_checksum_verification=self.page_checksum_verification ) return ParquetFragmentScanOptions._reconstruct, (kwargs,) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 39cdcc063b503..59b50ceda8c40 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -380,6 +380,9 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: shared_ptr[CFileDecryptionProperties] file_decryption_properties() \ const + c_bool page_checksum_verification() const + void set_page_checksum_verification(c_bool check_crc) + CReaderProperties default_reader_properties() cdef cppclass ArrowReaderProperties: @@ -428,6 +431,8 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* dictionary_pagesize_limit(int64_t dictionary_pagesize_limit) Builder* enable_write_page_index() Builder* disable_write_page_index() + Builder* enable_page_checksum() + Builder* disable_page_checksum() shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: @@ -576,7 +581,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( FileEncryptionProperties encryption_properties=*, write_batch_size=*, dictionary_pagesize_limit=*, - write_page_index=*) except * + write_page_index=*, + write_page_checksum=*) except * cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 089ed7c75ce58..737ba9d0a8d55 100644 --- 
a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1183,7 +1183,8 @@ cdef class ParquetReader(_Weakrefable): coerce_int96_timestamp_unit=None, FileDecryptionProperties decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None): + thrift_container_size_limit=None, + page_checksum_verification=False): """ Open a parquet file for reading. @@ -1199,6 +1200,7 @@ cdef class ParquetReader(_Weakrefable): decryption_properties : FileDecryptionProperties, optional thrift_string_size_limit : int, optional thrift_container_size_limit : int, optional + page_checksum_verification : bool, default False """ cdef: shared_ptr[CFileMetaData] c_metadata @@ -1236,6 +1238,8 @@ cdef class ParquetReader(_Weakrefable): arrow_props.set_pre_buffer(pre_buffer) + properties.set_page_checksum_verification(page_checksum_verification) + if coerce_int96_timestamp_unit is None: # use the default defined in default_arrow_reader_properties() pass @@ -1559,7 +1563,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( FileEncryptionProperties encryption_properties=None, write_batch_size=None, dictionary_pagesize_limit=None, - write_page_index=False) except *: + write_page_index=False, + write_page_checksum=False) except *: """General writer properties""" cdef: shared_ptr[WriterProperties] properties @@ -1703,6 +1708,13 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( # a size larger than this then it will be latched to this value. 
props.max_row_group_length(_MAX_ROW_GROUP_SIZE) + # checksum + + if write_page_checksum: + props.enable_page_checksum() + else: + props.disable_page_checksum() + # page index if write_page_index: @@ -1822,7 +1834,8 @@ cdef class ParquetWriter(_Weakrefable): write_batch_size=None, dictionary_pagesize_limit=None, store_schema=True, - write_page_index=False): + write_page_index=False, + write_page_checksum=False): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -1853,7 +1866,8 @@ cdef class ParquetWriter(_Weakrefable): encryption_properties=encryption_properties, write_batch_size=write_batch_size, dictionary_pagesize_limit=dictionary_pagesize_limit, - write_page_index=write_page_index + write_page_index=write_page_index, + write_page_checksum=write_page_checksum ) arrow_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 5d20a4f8b72cb..ae52f5cf34e8b 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -823,10 +823,12 @@ cdef class RecordBatchReader(_Weakrefable): Parameters ---------- - requested_schema: Schema, default None - The schema to which the stream should be casted. Currently, this is - not supported and will raise a NotImplementedError if the schema - doesn't match the current schema. + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. 
Returns ------- diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 072ab7fa11745..096e960384784 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -280,6 +280,8 @@ class ParquetFile: If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. Examples -------- @@ -327,7 +329,8 @@ def __init__(self, source, *, metadata=None, common_metadata=None, read_dictionary=None, memory_map=False, buffer_size=0, pre_buffer=False, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None, filesystem=None): + thrift_container_size_limit=None, filesystem=None, + page_checksum_verification=False): self._close_source = getattr(source, 'closed', True) @@ -346,6 +349,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None, decryption_properties=decryption_properties, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) self.common_metadata = common_metadata self._nested_paths_by_prefix = self._build_nested_paths() @@ -887,6 +891,10 @@ def _sanitize_table(table, new_schema, flavor): filtering more efficient than the page header, as it gathers all the statistics for a Parquet file in a single place, avoiding scattered I/O. Note that the page index is not yet used on the read size by PyArrow. +write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. 
""" _parquet_writer_example_doc = """\ @@ -980,6 +988,7 @@ def __init__(self, where, schema, filesystem=None, dictionary_pagesize_limit=None, store_schema=True, write_page_index=False, + write_page_checksum=False, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark @@ -1037,6 +1046,7 @@ def __init__(self, where, schema, filesystem=None, dictionary_pagesize_limit=dictionary_pagesize_limit, store_schema=store_schema, write_page_index=write_page_index, + write_page_checksum=write_page_checksum, **options) self.is_open = True @@ -1766,6 +1776,8 @@ class ParquetDataset: If not None, override the maximum total size of containers allocated when decoding Thrift structures. The default limit should be sufficient for most Parquet files. +page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. Examples -------- @@ -1779,7 +1791,8 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, use_legacy_dataset=None, pre_buffer=True, coerce_int96_timestamp_unit=None, thrift_string_size_limit=None, - thrift_container_size_limit=None): + thrift_container_size_limit=None, + page_checksum_verification=False): extra_msg = "" if use_legacy_dataset is None: @@ -1812,6 +1825,7 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, metadata_nthreads=metadata_nthreads, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) warnings.warn( "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " @@ -1828,7 +1842,8 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, use_legacy_dataset=None, pre_buffer=True, coerce_int96_timestamp_unit=None, thrift_string_size_limit=None, - thrift_container_size_limit=None): + thrift_container_size_limit=None, + page_checksum_verification=False): if partitioning != "hive": raise ValueError( 'Only 
"hive" for hive-like partitioning is supported when ' @@ -2419,6 +2434,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None, coerce_int96_timestamp_unit=None, schema=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, + page_checksum_verification=False, **kwargs): import pyarrow.dataset as ds @@ -2437,6 +2453,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None, "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit, "thrift_string_size_limit": thrift_string_size_limit, "thrift_container_size_limit": thrift_container_size_limit, + "page_checksum_verification": page_checksum_verification, } if buffer_size: read_options.update(use_buffered_stream=True, @@ -2855,6 +2872,8 @@ def partitioning(self): If not None, override the maximum total size of containers allocated when decoding Thrift structures. The default limit should be sufficient for most Parquet files. +page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. 
Returns ------- @@ -2949,7 +2968,8 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None): + thrift_container_size_limit=None, + page_checksum_verification=False): if not use_legacy_dataset: if metadata is not None: raise ValueError( @@ -2973,6 +2993,7 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) except ImportError: # fall back on ParquetFile for simple cases when pyarrow.dataset @@ -3004,6 +3025,7 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, decryption_properties=decryption_properties, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) return dataset.read(columns=columns, use_threads=use_threads, @@ -3020,6 +3042,11 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, "The 'ignore_prefixes' keyword is only supported when " "use_legacy_dataset=False") + if page_checksum_verification: + raise ValueError( + "The 'page_checksum_verification' keyword is only supported when " + "use_legacy_dataset=False") + if schema is not None: raise ValueError( "The 'schema' argument is only supported when " @@ -3101,6 +3128,7 @@ def write_table(table, where, row_group_size=None, version='2.6', dictionary_pagesize_limit=None, store_schema=True, write_page_index=False, + write_page_checksum=False, **kwargs): # Implementor's note: when adding keywords here / updating defaults, also # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions @@ -3129,6 +3157,7 @@ 
def write_table(table, where, row_group_size=None, version='2.6', dictionary_pagesize_limit=dictionary_pagesize_limit, store_schema=store_schema, write_page_index=write_page_index, + write_page_checksum=write_page_checksum, **kwargs) as writer: writer.write_table(table, row_group_size=row_group_size) except Exception: diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 8ed5d4e216e8e..e979342b886da 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1350,6 +1350,8 @@ struct ObjectWriterVisitor { std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || (std::is_base_of::value && !std::is_same::value) || diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index e55a0d1dd54cb..0fa913a21995b 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3039,9 +3039,12 @@ cdef class RecordBatch(_Tabular): Parameters ---------- - requested_schema : pyarrow.lib.Schema, default None - A schema to attempt to cast the streamed data to. This is currently - unsupported and will raise an error. + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. Returns ------- @@ -4859,9 +4862,12 @@ cdef class Table(_Tabular): Parameters ---------- - requested_schema : pyarrow.lib.Schema, default None - A schema to attempt to cast the streamed data to. This is currently - unsupported and will raise an error. 
+ requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. Returns ------- diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index dd12a2661656a..26c52b1cc5939 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -18,6 +18,7 @@ from collections import OrderedDict import io import warnings +from shutil import copytree import numpy as np import pytest @@ -882,3 +883,134 @@ def test_thrift_size_limits(tempdir): assert got == table got = pq.read_table(path) assert got == table + + +def test_page_checksum_verification_write_table(tempdir): + """Check that checksum verification works for datasets created with + pq.write_table()""" + + # Write some sample data into a parquet file with page checksum enabled + original_path = tempdir / 'correct.parquet' + table_orig = pa.table({'a': [1, 2, 3, 4]}) + pq.write_table(table_orig, original_path, write_page_checksum=True) + + # Read file and verify that the data is correct + table_check = pq.read_table(original_path, page_checksum_verification=True) + assert table_orig == table_check + + # Read the original file as binary and swap the bytes at offsets 31 and 36. + # This should be equivalent to storing the following data: + # pa.table({'a': [1, 3, 2, 4]}) + bin_data = bytearray(original_path.read_bytes()) + + # Swap two bytes to emulate corruption.
Also, check that the two bytes are + # different, otherwise no corruption occurs + assert bin_data[31] != bin_data[36] + bin_data[31], bin_data[36] = bin_data[36], bin_data[31] + + # Write the corrupted data to another parquet file + corrupted_path = tempdir / 'corrupted.parquet' + corrupted_path.write_bytes(bin_data) + + # Case 1: Reading the corrupted file with read_table() and without page + # checksum verification succeeds but yields corrupted data + table_corrupt = pq.read_table(corrupted_path, + page_checksum_verification=False) + # The read should complete without error, but the table has different + # content than the original file! + assert table_corrupt != table_orig + assert table_corrupt == pa.table({'a': [1, 3, 2, 4]}) + + # Case 2: Reading the corrupted file with read_table() and with page + # checksum verification enabled raises an exception + with pytest.raises(OSError, match="CRC checksum verification"): + _ = pq.read_table(corrupted_path, page_checksum_verification=True) + + # Case 3: Reading the corrupted file with ParquetFile.read() and without + # page checksum verification succeeds but yields corrupted data + corrupted_pq_file = pq.ParquetFile(corrupted_path, + page_checksum_verification=False) + table_corrupt2 = corrupted_pq_file.read() + assert table_corrupt2 != table_orig + assert table_corrupt2 == pa.table({'a': [1, 3, 2, 4]}) + + # Case 4: Reading the corrupted file with ParquetFile.read() and with page + # checksum verification enabled raises an exception + corrupted_pq_file = pq.ParquetFile(corrupted_path, + page_checksum_verification=True) + # Accessing the data should result in an error + with pytest.raises(OSError, match="CRC checksum verification"): + _ = corrupted_pq_file.read() + + # Case 5: Check that enabling page checksum verification in combination + # with legacy dataset raises an exception + with pytest.raises(ValueError, match="page_checksum_verification"): + _ = pq.read_table(corrupted_path, + 
page_checksum_verification=True, + use_legacy_dataset=True) + + +@pytest.mark.dataset +@pytest.mark.parametrize( + "use_legacy_dataset", + [ + False, + pytest.param( + True, + marks=pytest.mark.filterwarnings( + "ignore:Passing 'use_legacy_dataset=True':FutureWarning" + ), + ), + ], +) +def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): + """Check that checksum verification works for datasets created with + pq.write_to_dataset""" + + table_orig = pa.table({'a': [1, 2, 3, 4]}) + + # Write a sample dataset with page checksum enabled + original_dir_path = tempdir / 'correct_dir' + pq.write_to_dataset(table_orig, + original_dir_path, + write_page_checksum=True, + use_legacy_dataset=use_legacy_dataset) + + # Read file and verify that the data is correct + original_file_path_list = list(original_dir_path.iterdir()) + assert len(original_file_path_list) == 1 + original_path = original_file_path_list[0] + table_check = pq.read_table(original_path, page_checksum_verification=True) + assert table_orig == table_check + + # Read the original file as binary and swap the bytes at offsets 31 and 36. + # This should be equivalent to storing the following data: + # pa.table({'a': [1, 3, 2, 4]}) + bin_data = bytearray(original_path.read_bytes()) + + # Swap two bytes to emulate corruption.
Also, check that the two bytes are + # different, otherwise no corruption occurs + assert bin_data[31] != bin_data[36] + bin_data[31], bin_data[36] = bin_data[36], bin_data[31] + + # Write the corrupted data to another parquet dataset + # Copy dataset dir (which should be just one file) + corrupted_dir_path = tempdir / 'corrupted_dir' + copytree(original_dir_path, corrupted_dir_path) + # Corrupt just the one file with the dataset + corrupted_file_path = corrupted_dir_path / original_path.name + corrupted_file_path.write_bytes(bin_data) + + # Case 1: Reading the corrupted file with read_table() and without page + # checksum verification succeeds but yields corrupted data + table_corrupt = pq.read_table(corrupted_file_path, + page_checksum_verification=False) + # The read should complete without error, but the table has different + # content than the original file! + assert table_corrupt != table_orig + assert table_corrupt == pa.table({'a': [1, 3, 2, 4]}) + + # Case 2: Reading the corrupted file with read_table() and with page + # checksum verification enabled raises an exception + with pytest.raises(OSError, match="CRC checksum verification"): + _ = pq.read_table(corrupted_file_path, page_checksum_verification=True) diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index 5e6895c8dc24c..b902541015aa2 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -94,14 +94,14 @@ def test_validate_schema_write_table(tempdir): w.write_table(simple_table) -def test_parquet_invalid_writer(): +def test_parquet_invalid_writer(tempdir): # avoid segfaults with invalid construction with pytest.raises(TypeError): some_schema = pa.schema([pa.field("x", pa.int32())]) pq.ParquetWriter(None, some_schema) with pytest.raises(TypeError): - pq.ParquetWriter("some_path", None) + pq.ParquetWriter(tempdir / "some_path", None) @pytest.mark.pandas diff --git 
a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 6f3b54b0cd681..c6967326b3630 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -25,6 +25,7 @@ import tempfile import threading import time +from shutil import copytree from urllib.parse import quote @@ -788,12 +789,15 @@ def test_parquet_scan_options(): opts5 = ds.ParquetFragmentScanOptions( thrift_string_size_limit=123456, thrift_container_size_limit=987654,) + opts6 = ds.ParquetFragmentScanOptions( + page_checksum_verification=True) assert opts1.use_buffered_stream is False assert opts1.buffer_size == 2**13 assert opts1.pre_buffer is True assert opts1.thrift_string_size_limit == 100_000_000 # default in C++ assert opts1.thrift_container_size_limit == 1_000_000 # default in C++ + assert opts1.page_checksum_verification is False assert opts2.use_buffered_stream is False assert opts2.buffer_size == 2**12 @@ -810,11 +814,14 @@ def test_parquet_scan_options(): assert opts5.thrift_string_size_limit == 123456 assert opts5.thrift_container_size_limit == 987654 + assert opts6.page_checksum_verification is True + assert opts1 == opts1 assert opts1 != opts2 assert opts2 != opts3 assert opts3 != opts4 assert opts5 != opts1 + assert opts6 != opts1 def test_file_format_pickling(pickle_module): @@ -5376,3 +5383,76 @@ def test_dataset_sort_by(tempdir, dstype): sorted_tab_dict = sorted_tab.to_table().to_pydict() assert sorted_tab_dict["a"] == [5, 7, 7, 35] assert sorted_tab_dict["b"] == ["foo", "car", "bar", "foobar"] + + +def test_checksum_write_dataset_read_dataset_to_table(tempdir): + """Check that checksum verification works for datasets created with + ds.write_dataset and read with ds.dataset.to_table""" + + table_orig = pa.table({'a': [1, 2, 3, 4]}) + + # Write a sample dataset with page checksum enabled + pq_write_format = pa.dataset.ParquetFileFormat() + write_options = pq_write_format.make_write_options( + write_page_checksum=True) + + 
original_dir_path = tempdir / 'correct_dir' + ds.write_dataset( + data=table_orig, + base_dir=original_dir_path, + format=pq_write_format, + file_options=write_options, + ) + + # Open dataset and verify that the data is correct + pq_scan_opts_crc = ds.ParquetFragmentScanOptions( + page_checksum_verification=True) + pq_read_format_crc = pa.dataset.ParquetFileFormat( + default_fragment_scan_options=pq_scan_opts_crc) + table_check = ds.dataset( + original_dir_path, + format=pq_read_format_crc + ).to_table() + assert table_orig == table_check + + # Copy dataset dir (which should be just one file) + corrupted_dir_path = tempdir / 'corrupted_dir' + copytree(original_dir_path, corrupted_dir_path) + + # Read the only file in the path as binary and swap the bytes at offsets 31 + # and 36. This should be equivalent to storing the following data: + # pa.table({'a': [1, 3, 2, 4]}) + corrupted_file_path_list = list(corrupted_dir_path.iterdir()) + assert len(corrupted_file_path_list) == 1 + corrupted_file_path = corrupted_file_path_list[0] + bin_data = bytearray(corrupted_file_path.read_bytes()) + + # Swap two bytes to emulate corruption. Also, check that the two bytes are + # different, otherwise no corruption occurs + assert bin_data[31] != bin_data[36] + bin_data[31], bin_data[36] = bin_data[36], bin_data[31] + + # Write the corrupted data to the parquet file + corrupted_file_path.write_bytes(bin_data) + + # Case 1: Reading the corrupted file with dataset().to_table() and without + # page checksum verification succeeds but yields corrupted data + pq_scan_opts_no_crc = ds.ParquetFragmentScanOptions( + page_checksum_verification=False) + pq_read_format_no_crc = pa.dataset.ParquetFileFormat( + default_fragment_scan_options=pq_scan_opts_no_crc) + table_corrupt = ds.dataset( + corrupted_dir_path, format=pq_read_format_no_crc).to_table() + + # The read should complete without error, but the table has different + # content than the original file!
+ assert table_corrupt != table_orig + assert table_corrupt == pa.table({'a': [1, 3, 2, 4]}) + + # Case 2: Reading the corrupted file with dataset().to_table() and with + # page checksum verification enabled raises an exception + with pytest.raises(OSError, match="CRC checksum verification"): + _ = ds.dataset( + corrupted_dir_path, + format=pq_read_format_crc + ).to_table() diff --git a/r/configure b/r/configure index 4f09cfdc4419b..e48bd2f010b4a 100755 --- a/r/configure +++ b/r/configure @@ -413,7 +413,7 @@ CXX17FLAGS=`"${R_HOME}"/bin/R CMD config CXX17FLAGS` CXX17STD=`"${R_HOME}"/bin/R CMD config CXX17STD` CPPFLAGS=`"${R_HOME}"/bin/R CMD config CPPFLAGS` TEST_CMD="${CXX17} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX17FLAGS} ${CXX17STD} -xc++ -" -echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} >/dev/null 2>&1 +TEST_ERROR=$(echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} -o /dev/null 2>&1) if [ $? -eq 0 ]; then # Prepend PKG_DIRS to PKG_LIBS and write to Makevars @@ -428,7 +428,12 @@ else echo "------------------------- NOTE ---------------------------" echo "There was an issue preparing the Arrow C++ libraries."
echo "See https://arrow.apache.org/docs/r/articles/install.html" - echo "---------------------------------------------------------" + echo "----------------------------------------------------------" + echo "" + echo "Test compile error: ${TEST_ERROR}" + echo "Failing compile command: ${TEST_CMD}" + echo "PKG_CFLAGS=$PKG_CFLAGS" + echo "PKG_LIBS=$PKG_LIBS" PKG_LIBS="" PKG_CFLAGS="" exit 1 diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 03cbfbc5e91a8..b003e7cea80d4 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -99,16 +99,12 @@ download_binary <- function(lib) { libfile <- paste0("arrow-", VERSION, ".zip") binary_url <- paste0(arrow_repo, "bin/", lib, "/arrow-", VERSION, ".zip") if (try_download(binary_url, libfile)) { - if (!quietly) { - lg("Successfully retrieved C++ binaries (%s)", lib) - } + lg("Successfully retrieved libarrow (%s)", lib) } else { - if (!quietly) { lg( - "Downloading libarrow binary failed for version %s (%s)\n at %s", + "Downloading libarrow failed for version %s (%s)\n at %s", VERSION, lib, binary_url ) - } libfile <- NULL } # Explicitly setting the env var to "false" will skip checksum validation @@ -140,11 +136,11 @@ download_binary <- function(lib) { checksum_ok <- system2(checksum_cmd, args = checksum_args) if (checksum_ok != 0) { - cat("*** Checksum validation failed for libarrow binary: ", libfile, "\n") + lg("Checksum validation failed for libarrow: %s/%s", lib, libfile) unlink(libfile) libfile <- NULL } else { - cat("*** Checksum validated successfully for libarrow binary: ", libfile, "\n") + lg("Checksum validated successfully for libarrow: %s/%s", lib, libfile) } }