Skip to content

Commit

Permalink
PARQUET-518: Remove -Wno-sign-compare and scrub integer signedness
Browse files Browse the repository at this point in the history
This patch removes compiler warning suppresses, fixes signed-unsigned integer comparisons, and scrubs most usages of `size_t` from the codebase in favor of signed integer types.

Author: Wes McKinney <[email protected]>

Closes apache#63 from wesm/PARQUET-518 and squashes the following commits:

ba74e14 [Wes McKinney] Fix unsigned int comparison after rebase
b6adc51 [Wes McKinney] Scrub more usages of size_t
242ca3f [Wes McKinney] Disable -Wno-sign-compare and do some scrubbing

Change-Id: I749dba3f989a0b198924b2a449451091b09595b1
  • Loading branch information
wesm authored and julienledem committed Mar 1, 2016
1 parent 960754f commit 61900cf
Show file tree
Hide file tree
Showing 25 changed files with 122 additions and 124 deletions.
8 changes: 4 additions & 4 deletions cpp/src/parquet/column/column-reader-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ class TestPrimitiveReader : public ::testing::Test {
vector<int32_t> vresult(num_values_, -1);
vector<int16_t> dresult(num_levels_, -1);
vector<int16_t> rresult(num_levels_, -1);
size_t values_read = 0;
size_t total_values_read = 0;
size_t batch_actual = 0;
int64_t values_read = 0;
int total_values_read = 0;
int batch_actual = 0;

Int32Reader* reader = static_cast<Int32Reader*>(reader_.get());
int32_t batch_size = 8;
size_t batch = 0;
int batch = 0;
// This will cover both the cases
// 1) batch_size < page_size (multiple ReadBatch from a single page)
// 2) batch_size > page_size (BatchRead limits to a single page)
Expand Down
22 changes: 11 additions & 11 deletions cpp/src/parquet/column/levels.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@ class LevelEncoder {
}

// Encodes a batch of levels from an array and returns the number of levels encoded
size_t Encode(size_t batch_size, const int16_t* levels) {
size_t num_encoded = 0;
int Encode(int batch_size, const int16_t* levels) {
int num_encoded = 0;
if (!rle_encoder_ && !bit_packed_encoder_) {
throw ParquetException("Level encoders are not initialized.");
}

if (encoding_ == Encoding::RLE) {
for (size_t i = 0; i < batch_size; ++i) {
for (int i = 0; i < batch_size; ++i) {
if (!rle_encoder_->Put(*(levels + i))) {
break;
}
Expand All @@ -68,7 +68,7 @@ class LevelEncoder {
rle_encoder_->Flush();
rle_length_ = rle_encoder_->len();
} else {
for (size_t i = 0; i < batch_size; ++i) {
for (int i = 0; i < batch_size; ++i) {
if (!bit_packed_encoder_->PutValue(*(levels + i), bit_width_)) {
break;
}
Expand Down Expand Up @@ -101,7 +101,7 @@ class LevelDecoder {

// Initialize the LevelDecoder state with new data
// and return the number of bytes consumed
size_t SetData(Encoding::type encoding, int16_t max_level,
int SetData(Encoding::type encoding, int16_t max_level,
int num_buffered_values, const uint8_t* data) {
uint32_t num_bytes = 0;
uint32_t total_bytes = 0;
Expand Down Expand Up @@ -135,19 +135,19 @@ class LevelDecoder {
}

// Decodes a batch of levels into an array and returns the number of levels decoded
size_t Decode(size_t batch_size, int16_t* levels) {
size_t num_decoded = 0;
int Decode(int batch_size, int16_t* levels) {
int num_decoded = 0;

size_t num_values = std::min(num_values_remaining_, batch_size);
int num_values = std::min(num_values_remaining_, batch_size);
if (encoding_ == Encoding::RLE) {
for (size_t i = 0; i < num_values; ++i) {
for (int i = 0; i < num_values; ++i) {
if (!rle_decoder_->Get(levels + i)) {
break;
}
++num_decoded;
}
} else {
for (size_t i = 0; i < num_values; ++i) {
for (int i = 0; i < num_values; ++i) {
if (!bit_packed_decoder_->GetValue(bit_width_, levels + i)) {
break;
}
Expand All @@ -160,7 +160,7 @@ class LevelDecoder {

private:
int bit_width_;
size_t num_values_remaining_;
int num_values_remaining_;
Encoding::type encoding_;
std::unique_ptr<RleDecoder> rle_decoder_;
std::unique_ptr<BitReader> bit_packed_decoder_;
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/parquet/column/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,13 @@ bool TypedColumnReader<TYPE>::ReadNewPage() {
// If the data page includes repetition and definition levels, we
// initialize the level decoder and subtract the encoded level bytes from
// the page size to determine the number of bytes in the encoded data.
size_t data_size = page->size();
int64_t data_size = page->size();

//Data page Layout: Repetition Levels - Definition Levels - encoded values.
//Levels are encoded as rle or bit-packed.
//Init repetition levels
if (descr_->max_repetition_level() > 0) {
size_t rep_levels_bytes = repetition_level_decoder_.SetData(
int64_t rep_levels_bytes = repetition_level_decoder_.SetData(
page->repetition_level_encoding(), descr_->max_repetition_level(),
num_buffered_values_, buffer);
buffer += rep_levels_bytes;
Expand All @@ -113,7 +113,7 @@ bool TypedColumnReader<TYPE>::ReadNewPage() {

//Init definition levels
if (descr_->max_definition_level() > 0) {
size_t def_levels_bytes = definition_level_decoder_.SetData(
int64_t def_levels_bytes = definition_level_decoder_.SetData(
page->definition_level_encoding(), descr_->max_definition_level(),
num_buffered_values_, buffer);
buffer += def_levels_bytes;
Expand Down Expand Up @@ -165,14 +165,14 @@ bool TypedColumnReader<TYPE>::ReadNewPage() {
// ----------------------------------------------------------------------
// Batch read APIs

size_t ColumnReader::ReadDefinitionLevels(size_t batch_size, int16_t* levels) {
int64_t ColumnReader::ReadDefinitionLevels(int64_t batch_size, int16_t* levels) {
if (descr_->max_definition_level() == 0) {
return 0;
}
return definition_level_decoder_.Decode(batch_size, levels);
}

size_t ColumnReader::ReadRepetitionLevels(size_t batch_size, int16_t* levels) {
int64_t ColumnReader::ReadRepetitionLevels(int64_t batch_size, int16_t* levels) {
if (descr_->max_repetition_level() == 0) {
return 0;
}
Expand Down
28 changes: 14 additions & 14 deletions cpp/src/parquet/column/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@ class ColumnReader {
// Read multiple definition levels into preallocated memory
//
// Returns the number of decoded definition levels
size_t ReadDefinitionLevels(size_t batch_size, int16_t* levels);
int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels);

// Read multiple repetition levels into preallocated memory
//
// Returns the number of decoded repetition levels
size_t ReadRepetitionLevels(size_t batch_size, int16_t* levels);
int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels);

const ColumnDescriptor* descr_;

Expand Down Expand Up @@ -122,8 +122,8 @@ class TypedColumnReader : public ColumnReader {
// This API is the same for both V1 and V2 of the DataPage
//
// @returns: actual number of levels read (see values_read for number of values read)
size_t ReadBatch(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
T* values, size_t* values_read);
int64_t ReadBatch(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
T* values, int64_t* values_read);

private:
typedef Decoder<TYPE> DecoderType;
Expand All @@ -135,7 +135,7 @@ class TypedColumnReader : public ColumnReader {
// pre-allocated memory T*
//
// @returns: the number of values read into the out buffer
size_t ReadValues(size_t batch_size, T* out);
int64_t ReadValues(int64_t batch_size, T* out);

// Map of encoding type to the respective decoder object. For example, a
// column chunk's data pages may include both dictionary-encoded and
Expand All @@ -149,14 +149,14 @@ class TypedColumnReader : public ColumnReader {


template <int TYPE>
inline size_t TypedColumnReader<TYPE>::ReadValues(size_t batch_size, T* out) {
size_t num_decoded = current_decoder_->Decode(out, batch_size);
inline int64_t TypedColumnReader<TYPE>::ReadValues(int64_t batch_size, T* out) {
int64_t num_decoded = current_decoder_->Decode(out, batch_size);
return num_decoded;
}

template <int TYPE>
inline size_t TypedColumnReader<TYPE>::ReadBatch(int batch_size, int16_t* def_levels,
int16_t* rep_levels, T* values, size_t* values_read) {
inline int64_t TypedColumnReader<TYPE>::ReadBatch(int batch_size, int16_t* def_levels,
int16_t* rep_levels, T* values, int64_t* values_read) {
// HasNext invokes ReadNewPage
if (!HasNext()) {
*values_read = 0;
Expand All @@ -167,17 +167,17 @@ inline size_t TypedColumnReader<TYPE>::ReadBatch(int batch_size, int16_t* def_le
// row group is finished
batch_size = std::min(batch_size, num_buffered_values_);

size_t num_def_levels = 0;
size_t num_rep_levels = 0;
int64_t num_def_levels = 0;
int64_t num_rep_levels = 0;

size_t values_to_read = 0;
int64_t values_to_read = 0;

// If the field is required and non-repeated, there are no definition levels
if (descr_->max_definition_level() > 0) {
num_def_levels = ReadDefinitionLevels(batch_size, def_levels);
// TODO(wesm): this tallying of values-to-decode can be performed with better
// cache-efficiency if fused with the level decoding.
for (size_t i = 0; i < num_def_levels; ++i) {
for (int64_t i = 0; i < num_def_levels; ++i) {
if (def_levels[i] == descr_->max_definition_level()) {
++values_to_read;
}
Expand All @@ -196,7 +196,7 @@ inline size_t TypedColumnReader<TYPE>::ReadBatch(int batch_size, int16_t* def_le
}

*values_read = ReadValues(values_to_read, values);
size_t total_values = std::max(num_def_levels, *values_read);
int64_t total_values = std::max(num_def_levels, *values_read);
num_decoded_values_ += total_values;

return total_values;
Expand Down
13 changes: 5 additions & 8 deletions cpp/src/parquet/column/scanner-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ class TestFlatScanner : public ::testing::Test {
bool is_null = false;
int16_t def_level;
int16_t rep_level;
size_t j = 0;
int j = 0;
scanner->SetBatchSize(batch_size);
for (size_t i = 0; i < num_levels_; i++) {
for (int i = 0; i < num_levels_; i++) {
ASSERT_TRUE(scanner->Next(&val, &def_level, &rep_level, &is_null)) << i << j;
if (!is_null) {
ASSERT_EQ(values_[j++], val) << i <<"V"<< j;
Expand Down Expand Up @@ -193,15 +193,15 @@ template<>
void TestFlatScanner<ByteArrayType>::InitValues() {
int max_byte_array_len = 12;
int num_bytes = max_byte_array_len + sizeof(uint32_t);
size_t nbytes = num_values_ * num_bytes;
int nbytes = num_values_ * num_bytes;
data_buffer_.resize(nbytes);
random_byte_array(num_values_, 0, data_buffer_.data(), values_.data(),
max_byte_array_len);
}

template<>
void TestFlatScanner<FLBAType>::InitValues() {
size_t nbytes = num_values_ * FLBA_LENGTH;
int nbytes = num_values_ * FLBA_LENGTH;
data_buffer_.resize(nbytes);
random_fixed_byte_array(num_values_, 0, data_buffer_.data(), FLBA_LENGTH,
values_.data());
Expand Down Expand Up @@ -259,10 +259,9 @@ TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) {
InitScanner(&d);
TypedScanner<FLBAType::type_num>* scanner =
reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get());
size_t j = 0;
scanner->SetBatchSize(batch_size);
std::stringstream ss_fail;
for (size_t i = 0; i < num_levels_; i++) {
for (int i = 0; i < num_levels_; i++) {
std::stringstream ss;
scanner->PrintNext(ss, 17);
std::string result = ss.str();
Expand All @@ -271,7 +270,5 @@ TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) {
ASSERT_THROW(scanner->PrintNext(ss_fail, 17), ParquetException);
}

//Test for GroupNode

} // namespace test
} // namespace parquet_cpp
20 changes: 10 additions & 10 deletions cpp/src/parquet/column/scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ class Scanner {

std::vector<int16_t> def_levels_;
std::vector<int16_t> rep_levels_;
size_t level_offset_;
size_t levels_buffered_;
int level_offset_;
int levels_buffered_;

std::vector<uint8_t> value_buffer_;
size_t value_offset_;
size_t values_buffered_;
int value_offset_;
int64_t values_buffered_;

private:
std::shared_ptr<ColumnReader> reader_;
Expand All @@ -96,7 +96,7 @@ class TypedScanner : public Scanner {
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE) :
Scanner(reader, batch_size) {
typed_reader_ = static_cast<TypedColumnReader<TYPE>*>(reader.get());
size_t value_byte_size = type_traits<TYPE>::value_byte_size;
int value_byte_size = type_traits<TYPE>::value_byte_size;
value_buffer_.resize(batch_size_ * value_byte_size);
values_ = reinterpret_cast<T*>(&value_buffer_[0]);
}
Expand Down Expand Up @@ -190,38 +190,38 @@ class TypedScanner : public Scanner {
// The ownership of this object is expressed through the reader_ variable in the base
TypedColumnReader<TYPE>* typed_reader_;

inline void FormatValue(void* val, char* buffer, size_t bufsize, size_t width);
inline void FormatValue(void* val, char* buffer, int bufsize, int width);

T* values_;
};


template <int TYPE>
inline void TypedScanner<TYPE>::FormatValue(void* val, char* buffer,
size_t bufsize, size_t width) {
int bufsize, int width) {
std::string fmt = format_fwf<TYPE>(width);
snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
}

template <>
inline void TypedScanner<Type::INT96>::FormatValue(
void* val, char* buffer, size_t bufsize, size_t width) {
void* val, char* buffer, int bufsize, int width) {
std::string fmt = format_fwf<Type::INT96>(width);
std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}

template <>
inline void TypedScanner<Type::BYTE_ARRAY>::FormatValue(
void* val, char* buffer, size_t bufsize, size_t width) {
void* val, char* buffer, int bufsize, int width) {
std::string fmt = format_fwf<Type::BYTE_ARRAY>(width);
std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}

template <>
inline void TypedScanner<Type::FIXED_LEN_BYTE_ARRAY>::FormatValue(
void* val, char* buffer, size_t bufsize, size_t width) {
void* val, char* buffer, int bufsize, int width) {
std::string fmt = format_fwf<Type::FIXED_LEN_BYTE_ARRAY>(width);
std::string result = FixedLenByteArrayToString(
*reinterpret_cast<FixedLenByteArray*>(val),
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/parquet/column/test-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class MockPageReader : public PageReader {

// Implement the PageReader interface
virtual std::shared_ptr<Page> NextPage() {
if (page_index_ == pages_.size()) {
if (page_index_ == static_cast<int>(pages_.size())) {
// EOS to consumer
return std::shared_ptr<Page>(nullptr);
}
Expand All @@ -56,7 +56,7 @@ class MockPageReader : public PageReader {

private:
std::vector<std::shared_ptr<Page> > pages_;
size_t page_index_;
int page_index_;
};

// TODO(wesm): this is only used for testing for now. Refactor to form part of
Expand Down Expand Up @@ -102,7 +102,7 @@ class DataPageBuilder {
if (encoding != Encoding::PLAIN) {
ParquetException::NYI("only plain encoding currently implemented");
}
size_t bytes_to_encode = values.size() * sizeof(T);
int bytes_to_encode = values.size() * sizeof(T);

PlainEncoder<TYPE> encoder(d);
encoder.Encode(&values[0], values.size(), sink_);
Expand Down Expand Up @@ -171,7 +171,7 @@ void DataPageBuilder<Type::BOOLEAN>::AppendValues(const ColumnDescriptor *d,
if (encoding != Encoding::PLAIN) {
ParquetException::NYI("only plain encoding currently implemented");
}
size_t bytes_to_encode = values.size() * sizeof(bool);
int bytes_to_encode = values.size() * sizeof(bool);

PlainEncoder<Type::BOOLEAN> encoder(d);
encoder.Encode(values, values.size(), sink_);
Expand All @@ -186,7 +186,7 @@ static std::shared_ptr<DataPage> MakeDataPage(const ColumnDescriptor *d,
const std::vector<T>& values,
const std::vector<int16_t>& def_levels, int16_t max_def_level,
const std::vector<int16_t>& rep_levels, int16_t max_rep_level) {
size_t num_values = values.size();
int num_values = values.size();

InMemoryOutputStream page_stream;
test::DataPageBuilder<TYPE> page_builder(&page_stream);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/encodings/delta-bit-pack-encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ class DeltaBitPackDecoder : public Decoder<TYPE> {
uint64_t values_current_mini_block_;

int32_t min_delta_;
int mini_block_idx_;
size_t mini_block_idx_;
std::vector<uint8_t> delta_bit_widths_;
int delta_bit_width_;

Expand Down
Loading

0 comments on commit 61900cf

Please sign in to comment.