forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PARQUET-416: C++11 compilation, code reorg, libparquet and installati…
…on targets Reorganize code into a top-level src/parquet directory, add a libparquet shared library, and add install targets for libparquet and its header files. Add cpplint script and `make lint` target for code linting. Replaces earlier PR apache#13 Author: Wes McKinney <[email protected]> Closes apache#14 from wesm/libparquet-library and squashes the following commits: 2e356fd [Wes McKinney] PARQUET-416: Compile with C++11 and replace usages of boost::shared_ptr with std::shared_ptr and other C++11 fixes. Reorganize code into a top-level src/parquet directory, add a libparquet shared library, and add install targets for libparquet and its header files. Add cpplint script and `make lint` target for code linting. Change-Id: I4f9d8a35fc5878c621dfa94149dc5e99bf38e803
- Loading branch information
Showing
27 changed files
with
5,109 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Copyright 2015 Cloudera Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Headers: top level | ||
# Installs parquet.h under include/parquet; a relative DESTINATION is | ||
# resolved against CMAKE_INSTALL_PREFIX at install time. | ||
install(FILES | ||
parquet.h | ||
DESTINATION include/parquet) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Copyright 2012 Cloudera Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Static library built from the LZ4 and Snappy codec sources in this directory. | ||
add_library(parquet_compression STATIC | ||
lz4-codec.cc | ||
snappy-codec.cc | ||
) | ||
# lz4static / snappystatic are not defined in this file; presumably they are | ||
# targets declared by the top-level build -- TODO confirm. | ||
target_link_libraries(parquet_compression | ||
lz4static | ||
snappystatic) | ||
|
||
# NOTE(review): parquet_compression is a STATIC library, and CMake places | ||
# static archives according to ARCHIVE_OUTPUT_DIRECTORY, so the | ||
# LIBRARY_OUTPUT_DIRECTORY property below likely has no effect -- confirm | ||
# which output location is actually intended. | ||
set_target_properties(parquet_compression | ||
PROPERTIES | ||
LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") | ||
|
||
# Headers: compression | ||
# Installs codec.h under include/parquet/compression (relative to | ||
# CMAKE_INSTALL_PREFIX). | ||
install(FILES | ||
codec.h | ||
DESTINATION include/parquet/compression) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
// Copyright 2012 Cloudera Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#ifndef PARQUET_COMPRESSION_CODEC_H | ||
#define PARQUET_COMPRESSION_CODEC_H | ||
|
||
#include "parquet/parquet.h" | ||
|
||
#include <cstdint> | ||
#include "parquet/thrift/parquet_constants.h" | ||
#include "parquet/thrift/parquet_types.h" | ||
|
||
namespace parquet_cpp { | ||
|
||
// Abstract interface shared by the block compression codecs.
//
// Buffers are passed as raw pointer/length pairs; the codec does not take
// ownership of either the input or the output buffer.
class Codec {
 public:
  virtual ~Codec() = default;

  // Decompresses the `input_len` bytes starting at `input` into
  // `output_buffer`; `output_len` is the size the caller supplies for the
  // output side.
  virtual void Decompress(int input_len, const uint8_t* input, int output_len,
      uint8_t* output_buffer) = 0;

  // Compresses the `input_len` bytes starting at `input` into
  // `output_buffer`, whose size is given as `output_buffer_len`, and returns
  // an int result (the concrete codecs return the library's compressed size).
  virtual int Compress(int input_len, const uint8_t* input, int output_buffer_len,
      uint8_t* output_buffer) = 0;

  // Returns the codec's upper bound on the compressed size of an input of
  // `input_len` bytes.
  virtual int MaxCompressedLen(int input_len, const uint8_t* input) = 0;

  // Short identifier of the codec as a C string.
  virtual const char* name() const = 0;
};
|
||
|
||
// Snappy codec. | ||
class SnappyCodec : public Codec { | ||
public: | ||
virtual void Decompress(int input_len, const uint8_t* input, | ||
int output_len, uint8_t* output_buffer); | ||
|
||
virtual int Compress(int input_len, const uint8_t* input, | ||
int output_buffer_len, uint8_t* output_buffer); | ||
|
||
virtual int MaxCompressedLen(int input_len, const uint8_t* input); | ||
|
||
virtual const char* name() const { return "snappy"; } | ||
}; | ||
|
||
// Lz4 codec. | ||
class Lz4Codec : public Codec { | ||
public: | ||
virtual void Decompress(int input_len, const uint8_t* input, | ||
int output_len, uint8_t* output_buffer); | ||
|
||
virtual int Compress(int input_len, const uint8_t* input, | ||
int output_buffer_len, uint8_t* output_buffer); | ||
|
||
virtual int MaxCompressedLen(int input_len, const uint8_t* input); | ||
|
||
virtual const char* name() const { return "lz4"; } | ||
}; | ||
|
||
} // namespace parquet_cpp | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
// Copyright 2012 Cloudera Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "parquet/compression/codec.h" | ||
|
||
#include <lz4.h> | ||
|
||
namespace parquet_cpp { | ||
|
||
void Lz4Codec::Decompress(int input_len, const uint8_t* input, | ||
int output_len, uint8_t* output_buffer) { | ||
int n = LZ4_uncompress(reinterpret_cast<const char*>(input), | ||
reinterpret_cast<char*>(output_buffer), output_len); | ||
if (n != input_len) { | ||
throw parquet_cpp::ParquetException("Corrupt lz4 compressed data."); | ||
} | ||
} | ||
|
||
int Lz4Codec::MaxCompressedLen(int input_len, const uint8_t* input) { | ||
return LZ4_compressBound(input_len); | ||
} | ||
|
||
int Lz4Codec::Compress(int input_len, const uint8_t* input, | ||
int output_buffer_len, uint8_t* output_buffer) { | ||
return LZ4_compress(reinterpret_cast<const char*>(input), | ||
reinterpret_cast<char*>(output_buffer), input_len); | ||
} | ||
|
||
} // namespace parquet_cpp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
// Copyright 2012 Cloudera Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "parquet/compression/codec.h" | ||
|
||
#include <snappy.h> | ||
|
||
namespace parquet_cpp { | ||
|
||
void SnappyCodec::Decompress(int input_len, const uint8_t* input, | ||
int output_len, uint8_t* output_buffer) { | ||
if (!snappy::RawUncompress(reinterpret_cast<const char*>(input), | ||
static_cast<size_t>(input_len), reinterpret_cast<char*>(output_buffer))) { | ||
throw parquet_cpp::ParquetException("Corrupt snappy compressed data."); | ||
} | ||
} | ||
|
||
int SnappyCodec::MaxCompressedLen(int input_len, const uint8_t* input) { | ||
return snappy::MaxCompressedLength(input_len); | ||
} | ||
|
||
// Compresses `input_len` bytes from `input` into `output_buffer` with
// snappy::RawCompress and returns the compressed size it reports.
//
// NOTE(review): `output_buffer_len` is not consulted here, so the caller is
// responsible for providing a large enough buffer (see MaxCompressedLen).
int SnappyCodec::Compress(int input_len, const uint8_t* input,
    int output_buffer_len, uint8_t* output_buffer) {
  const char* src = reinterpret_cast<const char*>(input);
  char* dst = reinterpret_cast<char*>(output_buffer);

  size_t compressed_len;
  snappy::RawCompress(src, static_cast<size_t>(input_len), dst, &compressed_len);
  return static_cast<int>(compressed_len);
}
|
||
} // namespace parquet_cpp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Copyright 2015 Cloudera Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Headers: encodings | ||
# Installs the encoding/decoder headers listed below under | ||
# include/parquet/encodings (relative to CMAKE_INSTALL_PREFIX). | ||
install(FILES | ||
encodings.h | ||
bool-encoding.h | ||
delta-bit-pack-encoding.h | ||
delta-byte-array-encoding.h | ||
delta-length-byte-array-encoding.h | ||
dictionary-encoding.h | ||
plain-encoding.h | ||
DESTINATION include/parquet/encodings) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
// Copyright 2012 Cloudera Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#ifndef PARQUET_BOOL_ENCODING_H | ||
#define PARQUET_BOOL_ENCODING_H | ||
|
||
#include "parquet/encodings/encodings.h" | ||
|
||
#include <algorithm> | ||
|
||
namespace parquet_cpp { | ||
|
||
class BoolDecoder : public Decoder { | ||
public: | ||
BoolDecoder() : Decoder(parquet::Type::BOOLEAN, parquet::Encoding::PLAIN) { } | ||
|
||
virtual void SetData(int num_values, const uint8_t* data, int len) { | ||
num_values_ = num_values; | ||
decoder_ = RleDecoder(data, len, 1); | ||
} | ||
|
||
virtual int GetBool(bool* buffer, int max_values) { | ||
max_values = std::min(max_values, num_values_); | ||
for (int i = 0; i < max_values; ++i) { | ||
if (!decoder_.Get(&buffer[i])) ParquetException::EofException(); | ||
} | ||
num_values_ -= max_values; | ||
return max_values; | ||
} | ||
|
||
private: | ||
RleDecoder decoder_; | ||
}; | ||
|
||
} // namespace parquet_cpp | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
// Copyright 2012 Cloudera Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#ifndef PARQUET_DELTA_BIT_PACK_ENCODING_H | ||
#define PARQUET_DELTA_BIT_PACK_ENCODING_H | ||
|
||
#include "parquet/encodings/encodings.h" | ||
|
||
#include <algorithm> | ||
#include <vector> | ||
|
||
namespace parquet_cpp { | ||
|
||
class DeltaBitPackDecoder : public Decoder { | ||
public: | ||
explicit DeltaBitPackDecoder(const parquet::Type::type& type) | ||
: Decoder(type, parquet::Encoding::DELTA_BINARY_PACKED) { | ||
if (type != parquet::Type::INT32 && type != parquet::Type::INT64) { | ||
throw ParquetException("Delta bit pack encoding should only be for integer data."); | ||
} | ||
} | ||
|
||
virtual void SetData(int num_values, const uint8_t* data, int len) { | ||
num_values_ = num_values; | ||
decoder_ = BitReader(data, len); | ||
values_current_block_ = 0; | ||
values_current_mini_block_ = 0; | ||
} | ||
|
||
virtual int GetInt32(int32_t* buffer, int max_values) { | ||
return GetInternal(buffer, max_values); | ||
} | ||
|
||
virtual int GetInt64(int64_t* buffer, int max_values) { | ||
return GetInternal(buffer, max_values); | ||
} | ||
|
||
private: | ||
void InitBlock() { | ||
uint64_t block_size; | ||
if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException(); | ||
if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException(); | ||
if (!decoder_.GetVlqInt(&values_current_block_)) { | ||
ParquetException::EofException(); | ||
} | ||
if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException(); | ||
delta_bit_widths_.resize(num_mini_blocks_); | ||
|
||
if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException(); | ||
for (int i = 0; i < num_mini_blocks_; ++i) { | ||
if (!decoder_.GetAligned<uint8_t>(1, &delta_bit_widths_[i])) { | ||
ParquetException::EofException(); | ||
} | ||
} | ||
values_per_mini_block_ = block_size / num_mini_blocks_; | ||
mini_block_idx_ = 0; | ||
delta_bit_width_ = delta_bit_widths_[0]; | ||
values_current_mini_block_ = values_per_mini_block_; | ||
} | ||
|
||
template <typename T> | ||
int GetInternal(T* buffer, int max_values) { | ||
max_values = std::min(max_values, num_values_); | ||
for (int i = 0; i < max_values; ++i) { | ||
if (UNLIKELY(values_current_mini_block_ == 0)) { | ||
++mini_block_idx_; | ||
if (mini_block_idx_ < delta_bit_widths_.size()) { | ||
delta_bit_width_ = delta_bit_widths_[mini_block_idx_]; | ||
values_current_mini_block_ = values_per_mini_block_; | ||
} else { | ||
InitBlock(); | ||
buffer[i] = last_value_; | ||
continue; | ||
} | ||
} | ||
|
||
// TODO: the key to this algorithm is to decode the entire miniblock at once. | ||
int64_t delta; | ||
if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException(); | ||
delta += min_delta_; | ||
last_value_ += delta; | ||
buffer[i] = last_value_; | ||
--values_current_mini_block_; | ||
} | ||
num_values_ -= max_values; | ||
return max_values; | ||
} | ||
|
||
BitReader decoder_; | ||
uint64_t values_current_block_; | ||
uint64_t num_mini_blocks_; | ||
uint64_t values_per_mini_block_; | ||
uint64_t values_current_mini_block_; | ||
|
||
int64_t min_delta_; | ||
int mini_block_idx_; | ||
std::vector<uint8_t> delta_bit_widths_; | ||
int delta_bit_width_; | ||
|
||
int64_t last_value_; | ||
}; | ||
|
||
} // namespace parquet_cpp | ||
|
||
#endif |
Oops, something went wrong.