From e40275ae44ae7aa7424aa1672305a96954cb39e1 Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Fri, 13 Sep 2024 10:52:00 +0800 Subject: [PATCH] [Fix](parquet-reader) Fix parquet reader crash in set_dict(). (#40643) ## Proposed changes ### Issue ``` *** is nereids: 1 *** tablet id: 4 Abort at 1725864966 (unix time) try "date -d @1725864966" if you are using GNU date *** *** Set a breakpoint in static void __GI_abort() to debug *** PC: @ 0x7f007fb4090a04 *** SIGSEGV (address not mapped to object 0xa0fa868a41d6) received by PID 404737 (TID 274135 OR 0x7ece29df700) from PID 1755584205; stack trace: *** #0 __GI_raise #1 __GI_abort #2 sig_handler #3 _sigaction #4 JVM_handle_linux_signal #5 _sigaction #6 doris::vectorized::ByteArrayDictDecoder::set_dict(std::unique_ptr> &&, int, unsigned long) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp:41 #7 doris::vectorized::ColumnChunkReader::_decode_dict_page() at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:258 #8 doris::vectorized::ColumnChunkReader::next_page() at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:105 #9 doris::vectorized::ParquetColumnReader::_read_column_data(doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:508 #10 doris::vectorized::ScalarColumnReader::_next_value(doris::vectorized::ICollumn*, unsigned long, unsigned long*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:699 #11 doris::vectorized::RowGroupReader::_read_column_data(doris::vectorized::Block*, std::__cxx11::basic_string, std::allocator> &, std::vector*, unsigned long, unsigned long*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:425 #12 doris::vectorized::RowGroupReader::get_next_block(doris::vectorized::Block*, 
bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:311 #13 doris::vectorized::ParquetReader::get_next(doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_reader.cpp:533 #14 doris::vectorized::VFileScanner::_get_next_reader_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vfile_scanner.cpp:368 #15 doris::vectorized::VFileScanner::_get_block_impl(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vfile_scanner.cpp:411 #16 doris::vectorized::VScanner::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vscanner.cpp:431 #17 doris::vectorized::VScanner::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vscanner.cpp:96 #18 doris::vectorized::ScannerScheduler::submit(doris::vectorized::ScannerContext*, std::shared_ptr) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/scanner_context.cpp:96 #19 doris::Thread::supervise_thread(void*) at /mnt/disk1/yy/git/enterprise-core/be/src/util/thread.cpp:499 #20 start_thread #21 clone in /lib64/libc.so.6 ``` ### Solution It is not yet known why the parquet dictionary page is null in this case, which causes the crash in set_dict(). This PR adds a defensive null check to the dictionary decoders so that a null dictionary returns a Corruption status instead of crashing. 
--- be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp | 3 +++ be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp index 7d9f708011c4e5..4be7cb8b667950 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp @@ -32,6 +32,9 @@ namespace doris::vectorized { Status ByteArrayDictDecoder::set_dict(std::unique_ptr& dict, int32_t length, size_t num_values) { _dict = std::move(dict); + if (_dict == nullptr) { + return Status::Corruption("Wrong dictionary data for byte array type, dict is null."); + } _dict_items.reserve(num_values); uint32_t offset_cursor = 0; char* dict_item_address = reinterpret_cast(_dict.get()); diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 0bcc0bd5e73a40..6e7d3c7b99d8e0 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -107,6 +107,9 @@ class FixLengthDictDecoder final : public BaseDictDecoder { return Status::Corruption("Wrong dictionary data for fixed length type"); } _dict = std::move(dict); + if (_dict == nullptr) { + return Status::Corruption("Wrong dictionary data for byte array type, dict is null."); + } char* dict_item_address = reinterpret_cast(_dict.get()); _dict_items.resize(num_values); for (size_t i = 0; i < num_values; ++i) {