diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 656ab7bd3b805..6b1c09efb4d8d 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -80,6 +80,10 @@ function setup_sanitizers() { TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" export TSAN_OPTIONS + UBSAN_OPTIONS="$UBSAN_OPTIONS print_stacktrace=1" + UBSAN_OPTIONS="$UBSAN_OPTIONS suppressions=$ROOT/build-support/ubsan-suppressions.txt" + export UBSAN_OPTIONS + # Enable leak detection even under LLVM 3.4, where it was disabled by default. # This flag only takes effect when running an ASAN build. # ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" diff --git a/cpp/build-support/tsan-suppressions.txt b/cpp/build-support/tsan-suppressions.txt new file mode 100644 index 0000000000000..ce897c8591188 --- /dev/null +++ b/cpp/build-support/tsan-suppressions.txt @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Thread leak in CUDA +thread:libcuda.so diff --git a/cpp/build-support/ubsan-suppressions.txt b/cpp/build-support/ubsan-suppressions.txt new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/cpp/build-support/ubsan-suppressions.txt @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2f4f5d16364f1..efc8ad82faf93 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -324,7 +324,15 @@ static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& r right_data = right.values()->data() + right.offset() * byte_width; } - if (left.null_count() > 0) { + if (byte_width == 0) { + // Special case 0-width data, as the data pointers may be null + for (int64_t i = 0; i < left.length(); ++i) { + if (left.IsNull(i) != right.IsNull(i)) { + return false; + } + } + return true; + } else if (left.null_count() > 0) { for (int64_t i = 0; i < left.length(); ++i) { const bool left_null = left.IsNull(i); const bool right_null = right.IsNull(i); diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 15746d4c9965e..092aebc8c3d2e 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -404,6 +404,7 @@ struct is_float_truncate< template struct CastFunctor::value>::type> { + ARROW_DISABLE_UBSAN("float-cast-overflow") void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { using in_type = typename I::c_type; diff --git a/cpp/src/arrow/csv/column-builder.cc b/cpp/src/arrow/csv/column-builder.cc index 28cbad47580e8..1f37046798fd7 100644 --- a/cpp/src/arrow/csv/column-builder.cc +++ b/cpp/src/arrow/csv/column-builder.cc @@ -305,12 +305,12 @@ Status InferringColumnBuilder::TryConvertChunk(size_t chunk_index) { void InferringColumnBuilder::Insert(int64_t block_index, const std::shared_ptr& parser) { - DCHECK_NE(converter_, nullptr); - // Create a slot for the new chunk and spawn a task to convert it size_t chunk_index = static_cast(block_index); { std::lock_guard lock(mutex_); + + DCHECK_NE(converter_, nullptr); if (chunks_.size() <= chunk_index) { chunks_.resize(chunk_index + 1); } diff --git a/cpp/src/arrow/io/file-test.cc b/cpp/src/arrow/io/file-test.cc index 6d780c0940eba..f329ae9d504e5 100644 --- a/cpp/src/arrow/io/file-test.cc +++ b/cpp/src/arrow/io/file-test.cc @@ -468,10 +468,10 @@ class MyMemoryPool : public MemoryPool { int64_t bytes_allocated() const override { return -1; } - int64_t num_allocations() const { return num_allocations_; } + int64_t num_allocations() const { return num_allocations_.load(); } private: - int64_t num_allocations_; + std::atomic num_allocations_; }; TEST_F(TestReadableFile, CustomMemoryPool) { diff --git a/cpp/src/arrow/io/readahead-test.cc b/cpp/src/arrow/io/readahead-test.cc index b7f404f666983..6575e898590d8 100644 --- a/cpp/src/arrow/io/readahead-test.cc +++ b/cpp/src/arrow/io/readahead-test.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,51 @@ using internal::checked_cast; namespace io { namespace internal { +class LockedInputStream : public InputStream { + public: + explicit LockedInputStream(const std::shared_ptr& stream) + : stream_(stream) {} + + Status Close() override { + std::lock_guard lock(mutex_); + return stream_->Close(); + } + + bool closed() const override { + std::lock_guard lock(mutex_); + return stream_->closed(); + } + + Status Tell(int64_t* position) const override { + std::lock_guard lock(mutex_); + return stream_->Tell(position); + } + + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override { + std::lock_guard lock(mutex_); + return stream_->Read(nbytes, bytes_read, buffer); + } + + Status Read(int64_t nbytes, std::shared_ptr* out) override { + std::lock_guard lock(mutex_); + return stream_->Read(nbytes, out); + } + + bool supports_zero_copy() const override { + std::lock_guard lock(mutex_); + return stream_->supports_zero_copy(); + } + + util::string_view Peek(int64_t nbytes) const override { + std::lock_guard lock(mutex_); + return stream_->Peek(nbytes); + } + + protected: + std::shared_ptr stream_; + mutable std::mutex mutex_; +}; + static void sleep_for(double seconds) { std::this_thread::sleep_for( std::chrono::nanoseconds(static_cast(seconds * 1e9))); @@ -57,13 +103,13 @@ static void busy_wait(double seconds, std::function predicate) { } } -std::shared_ptr DataReader(const std::string& data) { +std::shared_ptr DataReader(const std::string& data) { std::shared_ptr buffer; ABORT_NOT_OK(Buffer::FromString(data, &buffer)); - return std::make_shared(buffer); + return std::make_shared(std::make_shared(buffer)); } -static int64_t WaitForPosition(const RandomAccessFile& file, int64_t expected, +static int64_t WaitForPosition(const FileInterface& file, int64_t expected, double seconds = 0.2) { int64_t pos = -1; busy_wait(seconds, [&]() -> bool { @@ -73,12 +119,12 @@ static int64_t WaitForPosition(const RandomAccessFile& file, int64_t expected, return pos; } -static void AssertEventualPosition(const RandomAccessFile& file, int64_t expected) { +static void AssertEventualPosition(const FileInterface& file, int64_t expected) { int64_t pos = WaitForPosition(file, expected); ASSERT_EQ(pos, expected) << "File didn't reach expected position"; } -static void AssertPosition(const RandomAccessFile& file, int64_t expected) { +static void AssertPosition(const FileInterface& file, int64_t expected) { int64_t pos = -1; ABORT_NOT_OK(file.Tell(&pos)); ASSERT_EQ(pos, expected) << "File didn't reach expected position"; diff --git a/cpp/src/arrow/util/bit-stream-utils.h b/cpp/src/arrow/util/bit-stream-utils.h index ae62a7ff1e2b3..ad86ee87c9fda 100644 --- a/cpp/src/arrow/util/bit-stream-utils.h +++ b/cpp/src/arrow/util/bit-stream-utils.h @@ -397,7 +397,8 @@ inline bool BitReader::GetVlqInt(int32_t* v) { } inline bool BitWriter::PutZigZagVlqInt(int32_t v) { - uint32_t u = (v << 1) ^ (v >> 31); + // Note negative left shift is undefined + uint32_t u = (static_cast(v) << 1) ^ (v >> 31); return PutVlqInt(u); } diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 5f181e9b7b14c..b12e2ecf9eef9 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -756,7 +756,9 @@ static void TestZigZag(int32_t v) { TEST(BitStreamUtil, ZigZag) { TestZigZag(0); TestZigZag(1); + TestZigZag(1234); TestZigZag(-1); + TestZigZag(-1234); TestZigZag(std::numeric_limits::max()); TestZigZag(-std::numeric_limits::max()); } diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 94c270280ea3c..5925d98d9d8d5 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -417,8 +417,8 @@ TEST(Decimal128Test, TestFromBigEndian) { auto negated = -value; little_endian = negated.ToBytes(); std::reverse(little_endian.begin(), little_endian.end()); - // Convert all of the bytes since we have to include the sign bit - ASSERT_OK(Decimal128::FromBigEndian(little_endian.data(), 16, &out)); + // The sign bit is looked up in the MSB + ASSERT_OK(Decimal128::FromBigEndian(little_endian.data() + 15 - ii, ii + 1, &out)); ASSERT_EQ(negated, out); // Take the complement and convert to big endian diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index f6e110561b275..c980e2a9e773c 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -29,11 +29,15 @@ #include "arrow/status.h" #include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" namespace arrow { +using internal::SafeLeftShift; +using internal::SafeSignedAdd; + static const Decimal128 ScaleMultipliers[] = { Decimal128(0LL), Decimal128(10LL), @@ -405,7 +409,7 @@ Decimal128& Decimal128::Negate() { low_bits_ = ~low_bits_ + 1; high_bits_ = ~high_bits_; if (low_bits_ == 0) { - ++high_bits_; + high_bits_ = SafeSignedAdd(high_bits_, 1); } return *this; } @@ -414,9 +418,9 @@ Decimal128& Decimal128::Abs() { return *this < 0 ? Negate() : *this; } Decimal128& Decimal128::operator+=(const Decimal128& right) { const uint64_t sum = low_bits_ + right.low_bits_; - high_bits_ += right.high_bits_; + high_bits_ = SafeSignedAdd(high_bits_, right.high_bits_); if (sum < low_bits_) { - ++high_bits_; + high_bits_ = SafeSignedAdd(high_bits_, 1); } low_bits_ = sum; return *this; @@ -454,7 +458,7 @@ Decimal128& Decimal128::operator&=(const Decimal128& right) { Decimal128& Decimal128::operator<<=(uint32_t bits) { if (bits != 0) { if (bits < 64) { - high_bits_ <<= bits; + high_bits_ = SafeLeftShift(high_bits_, bits); high_bits_ |= (low_bits_ >> (64 - bits)); low_bits_ <<= bits; } else if (bits < 128) { @@ -925,7 +929,7 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 } else { high = -1 * (is_negative && length < kMaxDecimalBytes); // Shift left enough bits to make room for the incoming int64_t - high <<= high_bits_offset * CHAR_BIT; + high = SafeLeftShift(high, high_bits_offset * CHAR_BIT); // Preserve the upper bits by inplace OR-ing the int64_t high |= high_bits; } @@ -943,7 +947,7 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 // Sign extend the low bits if necessary low = -1 * (is_negative && length < 8); // Shift left enough bits to make room for the incoming int64_t - low <<= low_bits_offset * CHAR_BIT; + low = SafeLeftShift(low, low_bits_offset * CHAR_BIT); // Preserve the upper bits by inplace OR-ing the int64_t low |= low_bits; } diff --git a/cpp/src/arrow/util/int-util.h b/cpp/src/arrow/util/int-util.h index 66d389e5f40cf..d3ae09f75cfa6 100644 --- a/cpp/src/arrow/util/int-util.h +++ b/cpp/src/arrow/util/int-util.h @@ -19,6 +19,7 @@ #define ARROW_UTIL_INT_UTIL_H #include +#include #include "arrow/util/visibility.h" @@ -67,6 +68,21 @@ template ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length, const int32_t* transpose_map); +/// Signed addition with well-defined behaviour on overflow (as unsigned) +template +SignedInt SafeSignedAdd(SignedInt u, SignedInt v) { + using UnsignedInt = typename std::make_unsigned::type; + return static_cast(static_cast(u) + + static_cast(v)); +} + +/// Signed left shift with well-defined behaviour on negative numbers or overflow +template +SignedInt SafeLeftShift(SignedInt u, Shift shift) { + using UnsignedInt = typename std::make_unsigned::type; + return static_cast(static_cast(u) << shift); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index f4c58f4030afd..ab258252695ab 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -113,6 +113,15 @@ #endif #endif // !defined(MANUALLY_ALIGNED_STRUCT) +// ---------------------------------------------------------------------- +// Convenience macro disabling a particular UBSan check in a function + +#if defined(__clang__) +#define ARROW_DISABLE_UBSAN(feature) __attribute__((no_sanitize(feature))) +#else +#define ARROW_DISABLE_UBSAN(feature) +#endif + // ---------------------------------------------------------------------- // From googletest // (also in parquet-cpp) diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h index 46d0f7c322b46..23e7061ac8738 100644 --- a/cpp/src/arrow/util/parsing.h +++ b/cpp/src/arrow/util/parsing.h @@ -335,7 +335,10 @@ class StringToSignedIntConverterMixin { if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) { return false; } - *out = static_cast(-static_cast(unsigned_value)); + // To avoid both compiler warnings (with unsigned negation) + // and undefined behaviour (with signed negation overflow), + // use the expanded formula for 2's complement negation. + *out = static_cast(~unsigned_value + 1); } else { if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) { return false; diff --git a/cpp/src/arrow/util/thread-pool-test.cc b/cpp/src/arrow/util/thread-pool-test.cc index 22a8db21fd280..c0deb20ccdde1 100644 --- a/cpp/src/arrow/util/thread-pool-test.cc +++ b/cpp/src/arrow/util/thread-pool-test.cc @@ -298,7 +298,8 @@ TEST_F(TestThreadPool, Submit) { // Test fork safety on Unix -#if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER)) +#if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || \ + defined(THREAD_SANITIZER)) TEST_F(TestThreadPool, ForkSafety) { pid_t child_pid; int child_status; diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index b5905fddff489..58c703f7fe068 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -29,6 +29,7 @@ #include "arrow/api.h" #include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" @@ -76,6 +77,8 @@ namespace parquet { namespace arrow { using ::arrow::BitUtil::BytesForBits; +using ::arrow::BitUtil::FromBigEndian; +using ::arrow::internal::SafeLeftShift; template using ArrayType = typename ::arrow::TypeTraits::ArrayType; @@ -1098,8 +1101,6 @@ struct TransferFunctor< }; static uint64_t BytesToInteger(const uint8_t* bytes, int32_t start, int32_t stop) { - using ::arrow::BitUtil::FromBigEndian; - const int32_t length = stop - start; DCHECK_GE(length, 0); @@ -1155,37 +1156,54 @@ static constexpr int32_t kMaxDecimalBytes = 16; /// \brief Convert a sequence of big-endian bytes to one int64_t (high bits) and one /// uint64_t (low bits). -static void BytesToIntegerPair(const uint8_t* bytes, - const int32_t total_number_of_bytes_used, int64_t* high, - uint64_t* low) { - DCHECK_GE(total_number_of_bytes_used, kMinDecimalBytes); - DCHECK_LE(total_number_of_bytes_used, kMaxDecimalBytes); - - /// Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the - /// sign bit. - const bool is_negative = static_cast(bytes[0]) < 0; +static void BytesToIntegerPair(const uint8_t* bytes, const int32_t length, + int64_t* out_high, uint64_t* out_low) { + DCHECK_GE(length, kMinDecimalBytes); + DCHECK_LE(length, kMaxDecimalBytes); - /// Sign extend the low bits if necessary - *low = UINT64_MAX * (is_negative && total_number_of_bytes_used < 8); - *high = -1 * (is_negative && total_number_of_bytes_used < kMaxDecimalBytes); + // XXX This code is copied from Decimal::FromBigEndian - /// Stop byte of the high bytes - const int32_t high_bits_offset = std::max(0, total_number_of_bytes_used - 8); + int64_t high, low; + + // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the + // sign bit. + const bool is_negative = static_cast(bytes[0]) < 0; - /// Shift left enough bits to make room for the incoming int64_t - *high <<= high_bits_offset * CHAR_BIT; + // 1. Extract the high bytes + // Stop byte of the high bytes + const int32_t high_bits_offset = std::max(0, length - 8); + const auto high_bits = BytesToInteger(bytes, 0, high_bits_offset); - /// Preserve the upper bits by inplace OR-ing the int64_t - *high |= BytesToInteger(bytes, 0, high_bits_offset); + if (high_bits_offset == 8) { + // Avoid undefined shift by 64 below + high = high_bits; + } else { + high = -1 * (is_negative && length < kMaxDecimalBytes); + // Shift left enough bits to make room for the incoming int64_t + high = SafeLeftShift(high, high_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + high |= high_bits; + } - /// Stop byte of the low bytes - const int32_t low_bits_offset = std::min(total_number_of_bytes_used, 8); + // 2. Extract the low bytes + // Stop byte of the low bytes + const int32_t low_bits_offset = std::min(length, 8); + const auto low_bits = BytesToInteger(bytes, high_bits_offset, length); - /// Shift left enough bits to make room for the incoming uint64_t - *low <<= low_bits_offset * CHAR_BIT; + if (low_bits_offset == 8) { + // Avoid undefined shift by 64 below + low = low_bits; + } else { + // Sign extend the low bits if necessary + low = -1 * (is_negative && length < 8); + // Shift left enough bits to make room for the incoming int64_t + low = SafeLeftShift(low, low_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + low |= low_bits; + } - /// Preserve the upper bits by inplace OR-ing the uint64_t - *low |= BytesToInteger(bytes, high_bits_offset, total_number_of_bytes_used); + *out_high = high; + *out_low = static_cast(low); } static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width, diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index 0078051b49735..a66fc8d1b080c 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -155,11 +155,13 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) { DCHECK(fpp > 0.0 && fpp < 1.0); const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8)); - uint32_t num_bits = static_cast(m); + uint32_t num_bits; // Handle overflow. if (m < 0 || m > kMaximumBloomFilterBytes << 3) { num_bits = static_cast(kMaximumBloomFilterBytes << 3); + } else { + num_bits = static_cast(m); } // Round up to lower bound diff --git a/cpp/src/parquet/column_reader-test.cc b/cpp/src/parquet/column_reader-test.cc index 60f2be2362510..0475ca591de02 100644 --- a/cpp/src/parquet/column_reader-test.cc +++ b/cpp/src/parquet/column_reader-test.cc @@ -102,7 +102,7 @@ class TestPrimitiveReader : public ::testing::Test { &vresult[0] + total_values_read, &values_read)); total_values_read += static_cast(values_read); batch_actual += batch; - batch_size = std::max(batch_size * 2, 4096); + batch_size = std::min(1 << 24, std::max(batch_size * 2, 4096)); } while (batch > 0); ASSERT_EQ(num_levels_, batch_actual); @@ -147,7 +147,7 @@ class TestPrimitiveReader : public ::testing::Test { total_values_read += batch - static_cast(null_count); batch_actual += batch; levels_actual += static_cast(levels_read); - batch_size = std::max(batch_size * 2, 4096); + batch_size = std::min(1 << 24, std::max(batch_size * 2, 4096)); } while ((batch > 0) || (levels_read > 0)); ASSERT_EQ(num_levels_, levels_actual); diff --git a/cpp/src/parquet/encoding-internal.h b/cpp/src/parquet/encoding-internal.h index e2dfc2380ddcf..8fbfb402a7fb1 100644 --- a/cpp/src/parquet/encoding-internal.h +++ b/cpp/src/parquet/encoding-internal.h @@ -83,7 +83,10 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, if (data_size < bytes_to_decode) { ParquetException::EofException(); } - memcpy(out, data, bytes_to_decode); + // If bytes_to_decode == 0, data could be null + if (bytes_to_decode > 0) { + memcpy(out, data, bytes_to_decode); + } return bytes_to_decode; } @@ -382,7 +385,7 @@ template inline void DictionaryDecoder::SetDict(Decoder* dictionary) { int num_dictionary_values = dictionary->values_left(); dictionary_.Resize(num_dictionary_values); - dictionary->Decode(&dictionary_[0], num_dictionary_values); + dictionary->Decode(dictionary_.data(), num_dictionary_values); } template <> diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 1812f5547abc2..2bc51e7dc7902 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -160,7 +160,8 @@ struct ByteArray { }; inline bool operator==(const ByteArray& left, const ByteArray& right) { - return left.len == right.len && 0 == std::memcmp(left.ptr, right.ptr, left.len); + return left.len == right.len && + (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); } inline bool operator!=(const ByteArray& left, const ByteArray& right) { diff --git a/cpp/src/parquet/util/memory.cc b/cpp/src/parquet/util/memory.cc index 6251f1c85c085..b3f83bdfdfd32 100644 --- a/cpp/src/parquet/util/memory.cc +++ b/cpp/src/parquet/util/memory.cc @@ -233,8 +233,11 @@ void InMemoryOutputStream::Write(const uint8_t* data, int64_t length) { PARQUET_THROW_NOT_OK(buffer_->Resize(new_capacity)); capacity_ = new_capacity; } - memcpy(Head(), data, length); - size_ += length; + // If length == 0, data may be null + if (length > 0) { + memcpy(Head(), data, length); + size_ += length; + } } int64_t InMemoryOutputStream::Tell() { return size_; } diff --git a/cpp/src/parquet/util/memory.h b/cpp/src/parquet/util/memory.h index 8677e6b9dacbc..d63ed84dd7ead 100644 --- a/cpp/src/parquet/util/memory.h +++ b/cpp/src/parquet/util/memory.h @@ -66,6 +66,7 @@ class PARQUET_EXPORT Vector { void Swap(Vector& v); inline T& operator[](int64_t i) const { return data_[i]; } + T* data() { return data_; } const T* data() const { return data_; } private: