Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-36882: [C++][Parquet] Use RLE as BOOLEAN default encoding when both data page and version is V2 #38163

Merged
merged 9 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2336,7 +2336,8 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* met
Encoding::type encoding = properties->encoding(descr->path());
if (encoding == Encoding::UNKNOWN) {
encoding = (descr->physical_type() == Type::BOOLEAN &&
properties->version() != ParquetVersion::PARQUET_1_0)
properties->version() != ParquetVersion::PARQUET_1_0 &&
properties->data_page_version() == ParquetDataPageVersion::V2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, if the data page version is v2, it is user's responsibility to set parquet version to PARQUET_2_X. Unfortunately, this is not enforced yet.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we prevent from this or just keep the behavior here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's do it separately.

? Encoding::RLE
: Encoding::PLAIN;
}
Expand Down
46 changes: 31 additions & 15 deletions cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,11 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
int64_t output_size = SMALL_SIZE,
const ColumnProperties& column_properties = ColumnProperties(),
const ParquetVersion::type version = ParquetVersion::PARQUET_1_0,
const ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1,
bool enable_checksum = false) {
sink_ = CreateOutputStream();
WriterProperties::Builder wp_builder;
wp_builder.version(version);
wp_builder.version(version)->data_page_version(data_page_version);
if (column_properties.encoding() == Encoding::PLAIN_DICTIONARY ||
column_properties.encoding() == Encoding::RLE_DICTIONARY) {
wp_builder.enable_dictionary();
Expand Down Expand Up @@ -176,7 +177,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
ASSERT_NO_FATAL_FAILURE(this->ReadAndCompare(compression, num_rows, enable_checksum));
}

void TestDictionaryFallbackEncoding(ParquetVersion::type version) {
void TestDictionaryFallbackEncoding(ParquetVersion::type version,
ParquetDataPageVersion data_page_version) {
this->GenerateData(VERY_LARGE_SIZE);
ColumnProperties column_properties;
column_properties.set_dictionary_enabled(true);
Expand All @@ -187,7 +189,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
column_properties.set_encoding(Encoding::RLE_DICTIONARY);
}

auto writer = this->BuildWriter(VERY_LARGE_SIZE, column_properties, version);
auto writer =
this->BuildWriter(VERY_LARGE_SIZE, column_properties, version, data_page_version);

writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
writer->Close();
Expand All @@ -204,13 +207,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
if (this->type_num() == Type::BOOLEAN) {
// Dictionary encoding is not allowed for boolean type
std::set<Encoding::type> expected;
if (version == ParquetVersion::PARQUET_1_0) {
if (version != ParquetVersion::PARQUET_1_0 &&
data_page_version == ParquetDataPageVersion::V2) {
// There is only 1 encoding (RLE) in a fallback case for version 2.0 and data page
// v2 enabled.
expected = {Encoding::RLE};
} else {
// There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case for
// version 1.0. Note that RLE is used for DL/RL.
// version 1.0 or data page v1. Note that RLE is used for DL/RL.
expected = {Encoding::PLAIN, Encoding::RLE};
} else {
// There is only 1 encoding (RLE) in a fallback case for version 2.0
expected = {Encoding::RLE};
}
ASSERT_EQ(encodings, expected);
} else if (version == ParquetVersion::PARQUET_1_0) {
Expand Down Expand Up @@ -262,8 +267,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
enable_statistics);
column_properties.set_codec_options(
std::make_shared<CodecOptions>(compression_level));
std::shared_ptr<TypedColumnWriter<TestType>> writer = this->BuildWriter(
num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum);
std::shared_ptr<TypedColumnWriter<TestType>> writer =
this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0,
ParquetDataPageVersion::V1, enable_checksum);
writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
// The behaviour should be independent from the number of Close() calls
writer->Close();
Expand All @@ -281,8 +287,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
enable_statistics);
column_properties.set_codec_options(
std::make_shared<CodecOptions>(compression_level));
std::shared_ptr<TypedColumnWriter<TestType>> writer = this->BuildWriter(
num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum);
std::shared_ptr<TypedColumnWriter<TestType>> writer =
this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0,
ParquetDataPageVersion::V1, enable_checksum);
writer->WriteBatchSpaced(this->values_.size(), nullptr, nullptr, valid_bits.data(), 0,
this->values_ptr_);
// The behaviour should be independent from the number of Close() calls
Expand Down Expand Up @@ -767,9 +774,12 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) {

class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
public:
void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding) {
void TestWithEncoding(ParquetVersion::type version,
ParquetDataPageVersion data_page_version,
Encoding::type encoding) {
this->SetUpSchema(Repetition::REQUIRED);
auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version,
ParquetDataPageVersion::V1,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not data_page_version?

/*enable_checksum*/ false);
for (int i = 0; i < SMALL_SIZE; i++) {
bool value = (i % 2 == 0) ? true : false;
Expand All @@ -789,12 +799,18 @@ class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
// PARQUET-764
// Correct bitpacking for boolean write at non-byte boundaries
TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) {
TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN);
for (auto data_page_version :
{ParquetDataPageVersion::V1, ParquetDataPageVersion::V2}) {
TestWithEncoding(ParquetVersion::PARQUET_1_0, data_page_version, Encoding::PLAIN);
}
}

// Default encoding for boolean is RLE when using V2 pages
TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) {
TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE);
TestWithEncoding(ParquetVersion::PARQUET_2_4, ParquetDataPageVersion::V1,
Encoding::PLAIN);
TestWithEncoding(ParquetVersion::PARQUET_2_4, ParquetDataPageVersion::V2,
Encoding::RLE);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, RLE is always in the encodings regardless, since it's used to RL/DL, right?

}

// PARQUET-979
Expand Down
Loading