From 006f38e48d386300ffbcdb5f4136dcb98f1765d8 Mon Sep 17 00:00:00 2001 From: Nong Li Date: Tue, 2 Feb 2016 14:50:00 -0800 Subject: [PATCH] PARQUET-503: Reenable parquet 2.0 encoding implementations. Author: Nong Li Closes #35 from nongli/parquet-503 and squashes the following commits: cb2a4e1 [Nong Li] PARQUET-503: Reenable parquet 2.0 encoding implementations. Change-Id: Id3801ddb44164bcc63adc3ee83250d33c1d7e191 --- .../encodings/delta-bit-pack-encoding.h | 10 +++---- cpp/src/parquet/encodings/encodings.h | 8 +++--- cpp/src/parquet/util/bit-stream-utils.h | 12 +++++---- .../parquet/util/bit-stream-utils.inline.h | 26 +++++++++---------- cpp/src/parquet/util/bit-util-test.cc | 19 ++++++++++++++ 5 files changed, 46 insertions(+), 29 deletions(-) diff --git a/cpp/src/parquet/encodings/delta-bit-pack-encoding.h b/cpp/src/parquet/encodings/delta-bit-pack-encoding.h index a0833b5997a1e..858fcec1b6824 100644 --- a/cpp/src/parquet/encodings/delta-bit-pack-encoding.h +++ b/cpp/src/parquet/encodings/delta-bit-pack-encoding.h @@ -54,7 +54,7 @@ class DeltaBitPackDecoder : public Decoder { using Decoder::num_values_; void InitBlock() { - uint64_t block_size; + int32_t block_size; if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException(); if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException(); if (!decoder_.GetVlqInt(&values_current_block_)) { @@ -104,17 +104,17 @@ class DeltaBitPackDecoder : public Decoder { } BitReader decoder_; - uint64_t values_current_block_; - uint64_t num_mini_blocks_; + int32_t values_current_block_; + int32_t num_mini_blocks_; uint64_t values_per_mini_block_; uint64_t values_current_mini_block_; - int64_t min_delta_; + int32_t min_delta_; int mini_block_idx_; std::vector delta_bit_widths_; int delta_bit_width_; - int64_t last_value_; + int32_t last_value_; }; } // namespace parquet_cpp diff --git a/cpp/src/parquet/encodings/encodings.h b/cpp/src/parquet/encodings/encodings.h index 4fb3d9a9c5a7a..0d9202eeb3ad4 100644 
--- a/cpp/src/parquet/encodings/encodings.h +++ b/cpp/src/parquet/encodings/encodings.h @@ -105,10 +105,8 @@ class Encoder { #include "parquet/encodings/plain-encoding.h" #include "parquet/encodings/dictionary-encoding.h" - -// The encoding tools changed and these are missing the ZigZag functions -// #include "parquet/encodings/delta-bit-pack-encoding.h" -// #include "parquet/encodings/delta-length-byte-array-encoding.h" -// #include "parquet/encodings/delta-byte-array-encoding.h" +#include "parquet/encodings/delta-bit-pack-encoding.h" +#include "parquet/encodings/delta-length-byte-array-encoding.h" +#include "parquet/encodings/delta-byte-array-encoding.h" #endif // PARQUET_ENCODINGS_ENCODINGS_H diff --git a/cpp/src/parquet/util/bit-stream-utils.h b/cpp/src/parquet/util/bit-stream-utils.h index a02839dc3b438..3e8f95c65dd6f 100644 --- a/cpp/src/parquet/util/bit-stream-utils.h +++ b/cpp/src/parquet/util/bit-stream-utils.h @@ -69,7 +69,10 @@ class BitWriter { /// room. The value is written byte aligned. /// For more details on vlq: /// en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(int32_t v); + bool PutVlqInt(uint32_t v); + + // Writes a zigzag-encoded int. + bool PutZigZagVlqInt(int32_t v); /// Get a pointer to the next aligned byte and advance the underlying buffer /// by num_bytes. @@ -135,6 +138,9 @@ class BitReader { /// the buffer. bool GetVlqInt(int32_t* v); + // Reads a zigzag-encoded int into `v`. + bool GetZigZagVlqInt(int32_t* v); + /// Returns the number of bytes left in the stream, not including the current /// byte (i.e., there may be an additional fraction of a byte).
int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } @@ -142,10 +148,6 @@ class BitReader { /// Maximum byte length of a vlq encoded int static const int MAX_VLQ_BYTE_LEN = 5; - // TODO(nongli): implementations to be fixed given changes in Impala - // bool GetZigZagVlqInt(int64_t* v); - // bool PutZigZagVlqInt(int32_t v); - private: const uint8_t* buffer_; int max_bytes_; diff --git a/cpp/src/parquet/util/bit-stream-utils.inline.h b/cpp/src/parquet/util/bit-stream-utils.inline.h index 77e2d48817110..e0dcab871fc54 100644 --- a/cpp/src/parquet/util/bit-stream-utils.inline.h +++ b/cpp/src/parquet/util/bit-stream-utils.inline.h @@ -75,7 +75,7 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) { return true; } -inline bool BitWriter::PutVlqInt(int32_t v) { +inline bool BitWriter::PutVlqInt(uint32_t v) { bool result = true; while ((v & 0xFFFFFF80) != 0L) { result &= PutAligned((v & 0x7F) | 0x80, 1); @@ -152,20 +152,18 @@ inline bool BitReader::GetVlqInt(int32_t* v) { return true; } -// TODO(nongli): review/test these implementations given divergence in Impala -// functions - -// inline bool BitWriter::PutZigZagVlqInt(int32_t v) { -// uint32_t u = (v << 1) ^ (v >> 31); -// return PutVlqInt(u); -// } +inline bool BitWriter::PutZigZagVlqInt(int32_t v) { + uint32_t u = (v << 1) ^ (v >> 31); + return PutVlqInt(u); +} -// inline bool BitReader::GetZigZagVlqInt(int64_t* v) { -// uint64_t u; -// if (!GetVlqInt(&u)) return false; -// *reinterpret_cast(v) = (u >> 1) ^ -(u & 1); -// return true; -// } +inline bool BitReader::GetZigZagVlqInt(int32_t* v) { + int32_t u_signed; + if (!GetVlqInt(&u_signed)) return false; + uint32_t u = static_cast(u_signed); + *reinterpret_cast(v) = (u >> 1) ^ -(u & 1); + return true; +} } // namespace parquet_cpp diff --git a/cpp/src/parquet/util/bit-util-test.cc b/cpp/src/parquet/util/bit-util-test.cc index 78efe1a85536e..a8b6be09bb661 100644 --- a/cpp/src/parquet/util/bit-util-test.cc +++ 
b/cpp/src/parquet/util/bit-util-test.cc @@ -26,6 +26,7 @@ #include #include "parquet/util/bit-util.h" +#include "parquet/util/bit-stream-utils.inline.h" #include "parquet/util/cpu-info.h" namespace parquet_cpp { @@ -161,4 +162,22 @@ TEST(BitUtil, RoundUpDown) { EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1); } +void TestZigZag(int32_t v) { + uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN]; + BitWriter writer(buffer, sizeof(buffer)); + BitReader reader(buffer, sizeof(buffer)); + writer.PutZigZagVlqInt(v); + int32_t result; + EXPECT_TRUE(reader.GetZigZagVlqInt(&result)); + EXPECT_EQ(v, result); +} + +TEST(BitStreamUtil, ZigZag) { + TestZigZag(0); + TestZigZag(1); + TestZigZag(-1); + TestZigZag(std::numeric_limits::max()); + TestZigZag(-std::numeric_limits::max()); +} + } // namespace parquet_cpp