From 9c0066c4f7431828a7ab119ebf17f6efa9f47dfd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 2 Feb 2016 15:00:45 -0800 Subject: [PATCH] PARQUET-454: Fix inconsistencies with boolean PLAIN encoding Requires PARQUET-485 (#32) The boolean Encoding::PLAIN code path was using RleDecoder, inconsistent with other implementations of Parquet. This patch adds an implementation of plain encoding and uses BitReader instead of RleDecoder to decode plain-encoded boolean data. Unit tests to verify. Also closes PR #12. Thanks to @edani for reporting. Author: Wes McKinney Closes #34 from wesm/PARQUET-454 and squashes the following commits: 01cb5a7 [Wes McKinney] Use a seed in the data generation 0bf5d8a [Wes McKinney] Fix inconsistencies with boolean PLAIN encoding. Change-Id: I1be5252c654d4864d14c3cdd70d63c507e0a9403 --- cpp/src/parquet/encodings/CMakeLists.txt | 2 + cpp/src/parquet/encodings/encodings.h | 3 +- .../parquet/encodings/plain-encoding-test.cc | 65 +++++++++++++++++++ cpp/src/parquet/encodings/plain-encoding.h | 51 ++++++++++++--- cpp/src/parquet/util/bit-util.h | 8 +++ cpp/src/parquet/util/test-common.h | 27 ++++++++ 6 files changed, 144 insertions(+), 12 deletions(-) create mode 100644 cpp/src/parquet/encodings/plain-encoding-test.cc diff --git a/cpp/src/parquet/encodings/CMakeLists.txt b/cpp/src/parquet/encodings/CMakeLists.txt index 69ef1f3e28fd1..638fba0f66ccc 100644 --- a/cpp/src/parquet/encodings/CMakeLists.txt +++ b/cpp/src/parquet/encodings/CMakeLists.txt @@ -24,3 +24,5 @@ install(FILES dictionary-encoding.h plain-encoding.h DESTINATION include/parquet/encodings) + +ADD_PARQUET_TEST(plain-encoding-test) diff --git a/cpp/src/parquet/encodings/encodings.h b/cpp/src/parquet/encodings/encodings.h index 0d9202eeb3ad4..c427ff4f1e4e2 100644 --- a/cpp/src/parquet/encodings/encodings.h +++ b/cpp/src/parquet/encodings/encodings.h @@ -20,10 +20,9 @@ #include +#include "parquet/exception.h" #include "parquet/types.h" -#include "parquet/thrift/parquet_constants.h" -#include "parquet/thrift/parquet_types.h" #include "parquet/util/rle-encoding.h" #include "parquet/util/bit-stream-utils.inline.h" diff --git a/cpp/src/parquet/encodings/plain-encoding-test.cc b/cpp/src/parquet/encodings/plain-encoding-test.cc new file mode 100644 index 0000000000000..ca425dd3bf242 --- /dev/null +++ b/cpp/src/parquet/encodings/plain-encoding-test.cc @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include "parquet/util/test-common.h" + +#include "parquet/encodings/encodings.h" + +using std::string; +using std::vector; + +namespace parquet_cpp { + +namespace test { + +TEST(BooleanTest, TestEncodeDecode) { + // PARQUET-454 + + size_t nvalues = 100; + size_t nbytes = BitUtil::RoundUp(nvalues, 8) / 8; + + // seed the prng so failure is deterministic + vector draws = flip_coins_seed(nvalues, 0.5, 0); + + PlainEncoder encoder(nullptr); + PlainDecoder decoder(nullptr); + + std::vector encode_buffer(nbytes); + + size_t encoded_bytes = encoder.Encode(draws, nvalues, &encode_buffer[0]); + ASSERT_EQ(nbytes, encoded_bytes); + + std::vector decode_buffer(nbytes); + const uint8_t* decode_data = &decode_buffer[0]; + + decoder.SetData(nvalues, &encode_buffer[0], encoded_bytes); + size_t values_decoded = decoder.Decode(&decode_buffer[0], nvalues); + ASSERT_EQ(nvalues, values_decoded); + + for (size_t i = 0; i < nvalues; ++i) { + ASSERT_EQ(BitUtil::GetArrayBit(decode_data, i), draws[i]) << i; + } +} + +} // namespace test + +} // namespace parquet_cpp diff --git a/cpp/src/parquet/encodings/plain-encoding.h b/cpp/src/parquet/encodings/plain-encoding.h index 11e70c7506124..cc377761a0687 100644 --- a/cpp/src/parquet/encodings/plain-encoding.h +++ b/cpp/src/parquet/encodings/plain-encoding.h @@ -21,6 +21,7 @@ #include "parquet/encodings/encodings.h" #include +#include using parquet::Type; @@ -103,19 +104,38 @@ class PlainDecoder : public Decoder { virtual void SetData(int num_values, const uint8_t* data, int len) { num_values_ = num_values; - decoder_ = RleDecoder(data, len, 1); + bit_reader_ = BitReader(data, len); + } + + // Two flavors of bool decoding + int Decode(uint8_t* buffer, int max_values) { + max_values = std::min(max_values, num_values_); + bool val; + for (int i = 0; i < max_values; ++i) { + if (!bit_reader_.GetValue(1, &val)) { + ParquetException::EofException(); + } + BitUtil::SetArrayBit(buffer, i, val); + } + num_values_ -= max_values; + return max_values; } virtual int Decode(bool* buffer, int max_values) { max_values = std::min(max_values, num_values_); + bool val; for (int i = 0; i < max_values; ++i) { - if (!decoder_.Get(&buffer[i])) ParquetException::EofException(); + if (!bit_reader_.GetValue(1, &val)) { + ParquetException::EofException(); + } + buffer[i] = val; } num_values_ -= max_values; return max_values; } + private: - RleDecoder decoder_; + BitReader bit_reader_; }; // ---------------------------------------------------------------------- @@ -132,6 +152,24 @@ class PlainEncoder : public Encoder { virtual size_t Encode(const T* src, int num_values, uint8_t* dst); }; +template <> +class PlainEncoder : public Encoder { + public: + explicit PlainEncoder(const parquet::SchemaElement* schema) : + Encoder(schema, parquet::Encoding::PLAIN) {} + + virtual size_t Encode(const std::vector& src, int num_values, + uint8_t* dst) { + size_t bytes_required = BitUtil::RoundUp(num_values, 8) / 8; + BitWriter bit_writer(dst, bytes_required); + for (size_t i = 0; i < num_values; ++i) { + bit_writer.PutValue(src[i], 1); + } + bit_writer.Flush(); + return bit_writer.bytes_written(); + } +}; + template inline size_t PlainEncoder::Encode(const T* buffer, int num_values, uint8_t* dst) { @@ -140,13 +178,6 @@ inline size_t PlainEncoder::Encode(const T* buffer, int num_values, return nbytes; } -template <> -inline size_t PlainEncoder::Encode( - const bool* src, int num_values, uint8_t* dst) { - ParquetException::NYI("bool encoding"); - return 0; -} - template <> inline size_t PlainEncoder::Encode(const ByteArray* src, int num_values, uint8_t* dst) { diff --git a/cpp/src/parquet/util/bit-util.h b/cpp/src/parquet/util/bit-util.h index 7a2e9212faf79..eac53467f87f8 100644 --- a/cpp/src/parquet/util/bit-util.h +++ b/cpp/src/parquet/util/bit-util.h @@ -270,6 +270,14 @@ class BitUtil { return v | (static_cast(0x1) << bitpos); } + static inline bool GetArrayBit(const uint8_t* bits, size_t i) { + return bits[i / 8] & (1 << (i % 8)); + } + + static inline void SetArrayBit(uint8_t* bits, size_t i, bool is_set) { + bits[i / 8] |= (1 << (i % 8)) * is_set; + } + // Set a specific bit to 0 // Behavior when bitpos is negative is undefined template diff --git a/cpp/src/parquet/util/test-common.h b/cpp/src/parquet/util/test-common.h index 38bc32c5c844e..3cf82f5961683 100644 --- a/cpp/src/parquet/util/test-common.h +++ b/cpp/src/parquet/util/test-common.h @@ -19,6 +19,7 @@ #define PARQUET_UTIL_TEST_COMMON_H #include +#include #include using std::vector; @@ -46,6 +47,32 @@ static inline bool vector_equal(const vector& left, const vector& right) { return true; } +static inline vector flip_coins_seed(size_t n, double p, uint32_t seed) { + std::mt19937 gen(seed); + std::bernoulli_distribution d(p); + + vector draws; + for (size_t i = 0; i < n; ++i) { + draws.push_back(d(gen)); + } + return draws; +} + + +static inline vector flip_coins(size_t n, double p) { + std::random_device rd; + std::mt19937 gen(rd()); + + std::bernoulli_distribution d(p); + + vector draws; + for (size_t i = 0; i < n; ++i) { + draws.push_back(d(gen)); + } + return draws; +} + + } // namespace test } // namespace parquet_cpp