From 658ecf261afa0f9b42a4b478f4cfb6316ae385e7 Mon Sep 17 00:00:00 2001
From: rickif
Date: Wed, 10 May 2023 16:06:46 +0800
Subject: [PATCH] [BugFix] Fix overflow when parsing parquet int96 timestamps (#22356)

The Parquet format can use int96 to store large datetime values such as
`9999-12-31 23:59:59`. By default, Arrow coerces int96 timestamps to
int64 nanoseconds, which overflows for values this large. Arrow's
Parquet reader provides an option to choose the unit used for int96
timestamps (https://github.com/apache/arrow/pull/10461).

This PR makes the BE's Parquet reader coerce int96 timestamps to
microsecond (`MICRO`) precision via
`ArrowReaderProperties::set_coerce_int96_timestamp_unit`. At microsecond
precision, the maximum MySQL datetime value `9999-12-31 23:59:59.999999`
is handled correctly.
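
As a sanity check (illustration only, not part of this patch), the
arithmetic below shows why the default nanosecond coercion overflows for
this datetime while microseconds still fit; 253402300799 is the Unix
timestamp of 9999-12-31 23:59:59 UTC:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    int main() {
        // Seconds from 1970-01-01 00:00:00 UTC to 9999-12-31 23:59:59 UTC.
        const int64_t seconds = 253402300799;

        // As nanoseconds this needs seconds * 1e9 ~= 2.5e20, far beyond the
        // int64 maximum of ~9.2e18, so coercing to NANO overflows.
        std::cout << "int64 max: " << std::numeric_limits<int64_t>::max() << "\n";
        std::cout << "needed ns: " << static_cast<double>(seconds) * 1e9 << "\n";

        // As microseconds it is ~2.5e17, which fits comfortably in int64.
        std::cout << "needed us: " << seconds * 1000000 << "\n";
        return 0;
    }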
---
 be/src/exec/vectorized/parquet_reader.cpp     | 14 +++++++-
 .../parquet_data/int96_timestamp.parquet      | Bin 0 -> 1629 bytes
 .../exec/vectorized/parquet_scanner_test.cpp  | 34 +++++++++++++++---
 3 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 be/test/exec/test_data/parquet_data/int96_timestamp.parquet

diff --git a/be/src/exec/vectorized/parquet_reader.cpp b/be/src/exec/vectorized/parquet_reader.cpp
index 21faf019b96a5f..4668a438c1274f 100644
--- a/be/src/exec/vectorized/parquet_reader.cpp
+++ b/be/src/exec/vectorized/parquet_reader.cpp
@@ -58,9 +58,21 @@ Status ParquetReaderWrap::next_selected_row_group() {
 Status ParquetReaderWrap::init_parquet_reader(const std::vector<SlotDescriptor*>& tuple_slot_descs,
                                               const std::string& timezone) {
     try {
+        parquet::ArrowReaderProperties arrow_reader_properties;
+        /*
+         * Timestamp unit to use for INT96-encoded timestamps in Parquet: SECOND, MILLI, MICRO, or NANO.
+         * We use microseconds, which matches the precision of DATETIME/TIMESTAMP in MySQL:
+         * https://dev.mysql.com/doc/refman/8.0/en/datetime.html
+         * "A DATETIME or TIMESTAMP value can include a trailing fractional seconds part
+         * in up to microseconds (6 digits) precision."
+         */
+        arrow_reader_properties.set_coerce_int96_timestamp_unit(arrow::TimeUnit::MICRO);
+
         // new file reader for parquet file
         auto st = parquet::arrow::FileReader::Make(arrow::default_memory_pool(),
-                                                   parquet::ParquetFileReader::Open(_parquet, _properties), &_reader);
+                                                   parquet::ParquetFileReader::Open(_parquet, _properties),
+                                                   arrow_reader_properties, &_reader);
+
         if (!st.ok()) {
             LOG(WARNING) << "failed to create parquet file reader, errmsg=" << st.ToString();
             return Status::InternalError("Failed to create file reader");
diff --git a/be/test/exec/test_data/parquet_data/int96_timestamp.parquet b/be/test/exec/test_data/parquet_data/int96_timestamp.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..4da4bdc6e732c65cb3e786946cd9bb5283712c39
GIT binary patch
literal 1629
[1629 bytes of base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/be/test/exec/vectorized/parquet_scanner_test.cpp b/be/test/exec/vectorized/parquet_scanner_test.cpp
index b78bc6fff760af..86efa05c824ff7 100644
--- a/be/test/exec/vectorized/parquet_scanner_test.cpp
+++ b/be/test/exec/vectorized/parquet_scanner_test.cpp
@@ -287,9 +287,9 @@ class ParquetScannerTest : public ::testing::Test {
     }
 
     template <bool is_nullable>
-    ChunkPtr test_json_column(const std::vector<std::string>& columns_from_file,
-                              const std::unordered_map<size_t, TExpr>& dst_slot_exprs,
-                              std::string specific_file) {
+    ChunkPtr get_chunk(const std::vector<std::string>& columns_from_file,
+                       const std::unordered_map<size_t, TExpr>& dst_slot_exprs, std::string specific_file,
+                       size_t expected_rows) {
         std::vector<std::string> file_names{specific_file};
         std::vector<std::string> column_names = columns_from_file;
 
@@ -312,7 +312,7 @@ class ParquetScannerTest : public ::testing::Test {
             }
             result = chunk;
         };
-        validate(scanner, 36865, check);
+        validate(scanner, expected_rows, check);
         return result;
     }
 
@@ -523,7 +523,7 @@ TEST_F(ParquetScannerTest, test_to_json) {
         std::vector<std::string> column_names{column_name};
         std::cerr << "test " << column_name << std::endl;
 
-        ChunkPtr chunk = test_json_column<true>(column_names, slot_map, parquet_file_name);
+        ChunkPtr chunk = get_chunk<true>(column_names, slot_map, parquet_file_name, 36865);
 
         ASSERT_EQ(1, chunk->num_columns());
         auto col = chunk->columns()[0];
@@ -568,4 +568,28 @@ TEST_F(ParquetScannerTest, test_arrow_null) {
     validate(scanner, 3, check);
 }
 
+TEST_F(ParquetScannerTest, int96_timestamp) {
+    const std::string parquet_file_name = test_exec_dir + "/test_data/parquet_data/int96_timestamp.parquet";
+    std::vector<std::pair<std::string, std::vector<std::string>>> test_cases = {
+            {"col_datetime", {"9999-12-31 23:59:59", "2006-01-02 15:04:05"}}};
+
+    std::vector<std::string> columns_from_path;
+    std::vector<std::string> path_values;
+    std::unordered_map<size_t, TExpr> slot_map;
+
+    for (auto& [column_name, expected] : test_cases) {
+        std::vector<std::string> column_names{column_name};
+
+        ChunkPtr chunk = get_chunk<true>(column_names, slot_map, parquet_file_name, 2);
+        ASSERT_EQ(1, chunk->num_columns());
+
+        auto col = chunk->columns()[0];
+        for (size_t i = 0; i < col->size(); i++) {
+            std::string result = col->debug_item(i);
+            std::string expect = expected[i];
+            EXPECT_EQ(expect, result);
+        }
+    }
+}
+
 } // namespace starrocks::vectorized
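
For reference, here is a minimal standalone sketch (not part of the patch)
of the same coercion using only Arrow's public API; the file name is a
placeholder, and error handling is reduced to early returns:

    #include <arrow/api.h>
    #include <arrow/io/file.h>
    #include <parquet/arrow/reader.h>
    #include <parquet/file_reader.h>
    #include <parquet/properties.h>

    #include <iostream>
    #include <memory>

    int main() {
        // Open a local Parquet file (placeholder name for illustration).
        auto maybe_file = arrow::io::ReadableFile::Open("int96_timestamp.parquet");
        if (!maybe_file.ok()) return 1;

        // Coerce INT96 timestamps to microseconds instead of the default
        // nanoseconds, so datetimes up to 9999-12-31 fit into int64.
        parquet::ArrowReaderProperties arrow_reader_properties;
        arrow_reader_properties.set_coerce_int96_timestamp_unit(arrow::TimeUnit::MICRO);

        std::unique_ptr<parquet::arrow::FileReader> reader;
        auto st = parquet::arrow::FileReader::Make(arrow::default_memory_pool(),
                                                   parquet::ParquetFileReader::Open(*maybe_file),
                                                   arrow_reader_properties, &reader);
        if (!st.ok()) return 1;

        // The timestamp column now arrives as timestamp[us] instead of
        // timestamp[ns], so 9999-12-31 23:59:59 round-trips without overflow.
        std::shared_ptr<arrow::Table> table;
        if (!reader->ReadTable(&table).ok()) return 1;
        std::cout << table->schema()->ToString() << std::endl;
        return 0;
    }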