Skip to content

Commit

Permalink
add basic test
Browse files Browse the repository at this point in the history
  • Loading branch information
wgtmac committed Apr 10, 2024
1 parent b762bf7 commit bb2e9cf
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 14 deletions.
2 changes: 2 additions & 0 deletions cpp/src/parquet/page_index_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -852,4 +852,6 @@ TEST_F(PageIndexBuilderTest, TwoRowGroups) {
CheckOffsetIndex(/*row_group=*/1, /*column=*/1, page_locations[1][1], final_position);
}

// TODO: add test for size stats

} // namespace parquet
165 changes: 151 additions & 14 deletions cpp/src/parquet/size_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@

#include <gtest/gtest.h>

#include "arrow/io/file.h"
#include "arrow/util/float16.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"
#include "arrow/buffer.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/bit_util.h"
#include "parquet/schema.h"
#include "parquet/test_util.h"
#include "parquet/thrift_internal.h"
#include "parquet/types.h"

namespace parquet {

Expand All @@ -37,13 +37,9 @@ using namespace parquet::schema;
TEST(SizeStatistics, WriteBatchLevels) {
std::vector<int64_t> expected_def_level_histogram = {256, 128, 64, 32, 16, 8, 4, 2, 2};
std::vector<int64_t> expected_rep_level_histogram = {256, 128, 64, 32, 32};

const int16_t max_def_level =
static_cast<int16_t>(expected_def_level_histogram.size()) - 1;
const int16_t max_rep_level =
static_cast<int16_t>(expected_rep_level_histogram.size()) - 1;
auto descr =
std::make_unique<ColumnDescriptor>(Int32("a"), max_def_level, max_rep_level);
constexpr int16_t kMaxDefLevel = 8;
constexpr int16_t kMaxRefLevel = 4;
auto descr = std::make_unique<ColumnDescriptor>(Int32("a"), kMaxDefLevel, kMaxRefLevel);
auto builder = SizeStatisticsBuilder::Make(descr.get());

auto write_batch_levels =
Expand Down Expand Up @@ -98,8 +94,149 @@ TEST(SizeStatistics, WriteRepeatedLevels) {
std::vector<int64_t>({55, 65, 95, 145}));
}

// TODO: Add tests for write binary variants.
// TODO: Add tests for merge two size statistics.
// TODO: Add tests for thrift serialization.
// Verify that unencoded byte array bytes are accumulated correctly when
// values are fed to the builder in contiguous (non-spaced) batches.
TEST(SizeStatistics, WriteDenseByteArrayValues) {
  constexpr std::string_view kValue = "foo";
  constexpr int kNumValues = 1000;
  constexpr int kBatchSize = 64;
  const std::vector<parquet::ByteArray> values(kNumValues, kValue);

  auto descr = std::make_unique<ColumnDescriptor>(
      schema::ByteArray("a"), /*max_def_level=*/0, /*max_rep_level=*/0);
  auto builder = SizeStatisticsBuilder::Make(descr.get());

  // Feed the values in chunks of at most kBatchSize; the final chunk may be
  // smaller since kNumValues is not a multiple of kBatchSize.
  int remaining = kNumValues;
  const parquet::ByteArray* cursor = values.data();
  while (remaining > 0) {
    const int this_batch = std::min(kBatchSize, remaining);
    builder->WriteValues(cursor, this_batch);
    cursor += this_batch;
    remaining -= this_batch;
  }

  // Every value contributes exactly kValue.size() unencoded bytes.
  auto size_statistics = builder->Build();
  EXPECT_EQ(size_statistics->unencoded_byte_array_data_bytes().value_or(-1),
            kNumValues * kValue.size());
}

// Verify that unencoded byte array bytes count only non-null slots when
// values are written in spaced form with a validity bitmap.
TEST(SizeStatistics, WriteSpacedByteArrayValues) {
  constexpr std::string_view kValue = "foo";
  constexpr int kNumValues = 1000;
  constexpr int kBatchSize = 63;
  const std::vector<parquet::ByteArray> values(kNumValues, kValue);

  // Build a validity bitmap where every third slot (i % 3 == 0) is null.
  ASSERT_OK_AND_ASSIGN(auto not_null_bitmap, ::arrow::AllocateBitmap(kNumValues));
  uint8_t* bitmap_data = not_null_bitmap->mutable_data();
  int not_null_count = 0;
  for (int i = 0; i < kNumValues; ++i) {
    const bool is_null = (i % 3 == 0);
    if (is_null) {
      ::arrow::bit_util::ClearBit(bitmap_data, i);
    } else {
      ::arrow::bit_util::SetBit(bitmap_data, i);
      ++not_null_count;
    }
  }

  auto descr = std::make_unique<ColumnDescriptor>(
      schema::ByteArray("a"), /*max_def_level=*/1, /*max_rep_level=*/0);
  auto builder = SizeStatisticsBuilder::Make(descr.get());

  // kBatchSize (63) is deliberately not byte-aligned so that batches start at
  // arbitrary bit offsets within the bitmap.
  for (int offset = 0; offset < kNumValues; offset += kBatchSize) {
    const int count = std::min(kBatchSize, kNumValues - offset);
    builder->WriteValuesSpaced(values.data() + offset, not_null_bitmap->data(), offset,
                               count);
  }

  // Only the non-null slots contribute unencoded bytes.
  auto size_statistics = builder->Build();
  EXPECT_EQ(size_statistics->unencoded_byte_array_data_bytes().value_or(-1),
            not_null_count * kValue.size());
}

// Verify that writing values from an Arrow binary array (both 32-bit and
// 64-bit offset variants) accumulates the unencoded byte count, skipping
// nulls. "foo" + "bar" + "baz" = 9 bytes.
TEST(SizeStatistics, WriteBinaryArray) {
  for (const auto& arrow_type : {::arrow::binary(), ::arrow::large_binary()}) {
    auto array = ::arrow::ArrayFromJSON(arrow_type, R"(["foo", null, "bar", "baz"])");
    auto descr = std::make_unique<ColumnDescriptor>(
        schema::ByteArray("a"), /*max_def_level=*/1, /*max_rep_level=*/0);
    auto builder = SizeStatisticsBuilder::Make(descr.get());
    builder->WriteValues(*array);
    auto size_statistics = builder->Build();
    EXPECT_EQ(size_statistics->unencoded_byte_array_data_bytes().value_or(-1), 9);
  }
}

// Verify that Merge() adds level histograms element-wise and sums the
// unencoded byte array bytes (which must stay absent for non-BYTE_ARRAY
// columns).
TEST(SizeStatistics, MergeStatistics) {
  constexpr int kNumValues = 16;
  const std::array<int16_t, kNumValues> def_levels = {0, 0, 0, 0, 1, 1, 1, 1,
                                                      2, 2, 2, 2, 3, 3, 3, 3};
  const std::array<int16_t, kNumValues> rep_levels = {0, 1, 2, 3, 0, 1, 2, 3,
                                                      0, 1, 2, 3, 0, 1, 2, 3};
  // Each single write yields {4, 4, 4, 4}; after merging two identical
  // statistics the histograms double to {8, 8, 8, 8}.
  const std::vector<int64_t> expected_histogram = {8, 8, 8, 8};
  constexpr std::string_view kByteArrayValue = "foo";
  const std::vector<parquet::ByteArray> values(kNumValues,
                                               parquet::ByteArray{kByteArrayValue});

  // Exercise both a fixed-size (INT32) and a variable-size (BYTE_ARRAY) column.
  for (const auto& descr :
       {std::make_unique<ColumnDescriptor>(schema::Int32("a"), /*max_def_level=*/3,
                                           /*max_rep_level=*/3),
        std::make_unique<ColumnDescriptor>(schema::ByteArray("a"), /*max_def_level=*/3,
                                           /*max_rep_level=*/3)}) {
    auto builder = SizeStatisticsBuilder::Make(descr.get());
    // Writes one full pass of levels (and values for BYTE_ARRAY) into the
    // builder. Note: def_levels go to WriteDefinitionLevels and rep_levels to
    // WriteRepetitionLevels (the original test had these swapped; the bug was
    // masked because both histograms are identical).
    auto write_all = [&]() {
      builder->WriteDefinitionLevels(kNumValues, def_levels.data());
      builder->WriteRepetitionLevels(kNumValues, rep_levels.data());
      if (descr->physical_type() == Type::BYTE_ARRAY) {
        builder->WriteValues(values.data(), kNumValues);
      }
    };

    write_all();
    auto size_statistics_1 = builder->Build();

    builder->Reset();
    write_all();
    auto size_statistics_2 = builder->Build();

    size_statistics_1->Merge(*size_statistics_2);
    EXPECT_EQ(size_statistics_1->definition_level_histogram(), expected_histogram);
    EXPECT_EQ(size_statistics_1->repetition_level_histogram(), expected_histogram);
    if (descr->physical_type() == Type::BYTE_ARRAY) {
      // Two passes of kNumValues values, each kByteArrayValue.size() bytes.
      EXPECT_TRUE(size_statistics_1->unencoded_byte_array_data_bytes().has_value());
      EXPECT_EQ(size_statistics_1->unencoded_byte_array_data_bytes().value(),
                kByteArrayValue.size() * kNumValues * 2);
    } else {
      EXPECT_FALSE(size_statistics_1->unencoded_byte_array_data_bytes().has_value());
    }
  }
}

// Verify that SizeStatistics round-trips through its Thrift representation
// (ToThrift followed by SizeStatistics::Make) without losing histograms or
// the unencoded byte array byte count.
TEST(SizeStatistics, ThriftSerDe) {
  constexpr int kNumValues = 16;
  const std::array<int16_t, kNumValues> def_levels = {0, 0, 0, 0, 1, 1, 1, 1,
                                                      2, 2, 2, 2, 3, 3, 3, 3};
  const std::array<int16_t, kNumValues> rep_levels = {0, 1, 2, 3, 0, 1, 2, 3,
                                                      0, 1, 2, 3, 0, 1, 2, 3};
  // Both level arrays contain each of {0, 1, 2, 3} exactly four times.
  const std::vector<int64_t> expected_histogram = {4, 4, 4, 4};
  constexpr std::string_view kByteArrayValue = "foo";
  const std::vector<parquet::ByteArray> values(kNumValues,
                                               parquet::ByteArray{kByteArrayValue});

  // Exercise both a fixed-size (INT32) and a variable-size (BYTE_ARRAY) column.
  for (const auto& descr :
       {std::make_unique<ColumnDescriptor>(schema::Int32("a"), /*max_def_level=*/3,
                                           /*max_rep_level=*/3),
        std::make_unique<ColumnDescriptor>(schema::ByteArray("a"), /*max_def_level=*/3,
                                           /*max_rep_level=*/3)}) {
    auto builder = SizeStatisticsBuilder::Make(descr.get());
    // Note: def_levels go to WriteDefinitionLevels and rep_levels to
    // WriteRepetitionLevels (the original test had these swapped; the bug was
    // masked because both histograms are identical).
    builder->WriteDefinitionLevels(kNumValues, def_levels.data());
    builder->WriteRepetitionLevels(kNumValues, rep_levels.data());
    if (descr->physical_type() == Type::BYTE_ARRAY) {
      builder->WriteValues(values.data(), kNumValues);
    }
    auto size_statistics = builder->Build();

    // Round-trip: in-memory -> Thrift -> in-memory.
    auto thrift_statistics = ToThrift(*size_statistics);
    auto restored_statistics = SizeStatistics::Make(&thrift_statistics, descr.get());

    EXPECT_EQ(restored_statistics->definition_level_histogram(), expected_histogram);
    EXPECT_EQ(restored_statistics->repetition_level_histogram(), expected_histogram);
    if (descr->physical_type() == Type::BYTE_ARRAY) {
      EXPECT_TRUE(restored_statistics->unencoded_byte_array_data_bytes().has_value());
      EXPECT_EQ(restored_statistics->unencoded_byte_array_data_bytes().value(),
                kByteArrayValue.size() * kNumValues);
    } else {
      EXPECT_FALSE(restored_statistics->unencoded_byte_array_data_bytes().has_value());
    }
  }
}

} // namespace parquet

0 comments on commit bb2e9cf

Please sign in to comment.