From c1b0651f0efa6c000f296d6c2fb1187a66633c48 Mon Sep 17 00:00:00 2001
From: Letian Jiang
Date: Tue, 20 Jun 2023 10:00:36 +0800
Subject: [PATCH 1/3] Fix arrow parquet exception handling

Signed-off-by: Letian Jiang
---
Review notes (text between the "---" line and the diffstat is not part of
the commit message):
* download-thirdparty.sh: the exception-handling patch is applied under the
  same $PATCHED_MARK guard as the jemalloc patch. A second
  `[ ! -f $PATCHED_MARK ]` block placed after the first block's
  `touch $PATCHED_MARK` can never run.
* arrow-5.0.0-fix-exception-handling.patch: the loop iterates the local
  `column_writers`. Iterating the moved-from (empty) `column_writers_`
  would close no column writer and leave total_bytes_written_ unchanged.
* Commit ids and `index` blob ids are carried over from the original series
  and were not recomputed after these edits.

 thirdparty/download-thirdparty.sh             |  6 +++++-
 .../arrow-5.0.0-fix-exception-handling.patch  | 26 ++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 thirdparty/patches/arrow-5.0.0-fix-exception-handling.patch

diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh
index ba1a0d7ccd7d1..398ec41dcdf4e 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -475,13 +475,17 @@ fi
 echo "Finished patching $SERDES_SOURCE"
 cd -
 
-# patch arrows to use our built jemalloc
+# patch arrows
 if [[ -d $TP_SOURCE_DIR/$ARROW_SOURCE ]] ; then
     cd $TP_SOURCE_DIR/$ARROW_SOURCE
+    # Apply every arrow patch under one $PATCHED_MARK guard; the mark is only
+    # touched after all of them have been applied.
     if [ ! -f $PATCHED_MARK ] && [ $ARROW_SOURCE = "arrow-apache-arrow-5.0.0" ] ; then
+        # use our built jemalloc
         patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-force-use-external-jemalloc.patch
+        patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-fix-exception-handling.patch
         touch $PATCHED_MARK
     fi
     cd -
     echo "Finished patching $ARROW_SOURCE"
 fi
diff --git a/thirdparty/patches/arrow-5.0.0-fix-exception-handling.patch b/thirdparty/patches/arrow-5.0.0-fix-exception-handling.patch
new file mode 100644
index 0000000000000..e8e78ed992f1d
--- /dev/null
+++ b/thirdparty/patches/arrow-5.0.0-fix-exception-handling.patch
@@ -0,0 +1,26 @@
+diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
+index deac9586e..836f9e93c 100644
+--- a/cpp/src/parquet/file_writer.cc
++++ b/cpp/src/parquet/file_writer.cc
+@@ -181,15 +181,15 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
+       closed_ = true;
+       CheckRowsWritten();
+ 
+-      for (size_t i = 0; i < column_writers_.size(); i++) {
+-        if (column_writers_[i]) {
+-          total_bytes_written_ += column_writers_[i]->Close();
+-          column_writers_[i].reset();
++      // Move the writers into a local so they are released when this scope
++      // exits, including when Close() throws.
++      auto column_writers = std::move(column_writers_);
++      for (size_t i = 0; i < column_writers.size(); i++) {
++        if (column_writers[i]) {
++          total_bytes_written_ += column_writers[i]->Close();
+         }
+       }
+ 
+-      column_writers_.clear();
+-
+       // Ensures all columns have been written
+       metadata_->set_num_rows(num_rows_);
+       metadata_->Finish(total_bytes_written_, row_group_ordinal_);

From 34b1edebbdda35169175d31c607b194243e690c3 Mon Sep 17 00:00:00 2001
From: Letian Jiang
Date: Tue, 20 Jun 2023 10:01:54 +0800
Subject: [PATCH 2/3] remove release

Signed-off-by: Letian Jiang
---
 be/src/formats/parquet/file_writer.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/be/src/formats/parquet/file_writer.cpp b/be/src/formats/parquet/file_writer.cpp
index bb0ecc89b0ae4..0b0a2bdd33187 100644
--- a/be/src/formats/parquet/file_writer.cpp
+++ b/be/src/formats/parquet/file_writer.cpp
@@ -377,11 +377,7 @@ Status SyncFileWriter::_flush_row_group() {
         _chunk_writer->close();
     } catch (const ::parquet::ParquetStatusException& e) {
         _chunk_writer.reset();
-
-        // this is to avoid calling ParquetFileWriter.Close which incurs segfault
         _closed = true;
-        _writer.release();
-
         auto st = Status::IOError(fmt::format("{}: {}", "flush rowgroup error", e.what()));
         LOG(WARNING) << st;
         return st;

From f2bf941112f02459b1c4f63077c26da5a95318f5 Mon Sep 17 00:00:00 2001
From: Letian Jiang
Date: Tue, 20 Jun 2023 10:03:43 +0800
Subject: [PATCH 3/3] update

Signed-off-by: Letian Jiang
---
 thirdparty/download-thirdparty.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh
index 398ec41dcdf4e..e83726150d9f7 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -483,6 +483,7 @@ if [[ -d $TP_SOURCE_DIR/$ARROW_SOURCE ]] ; then
     if [ ! -f $PATCHED_MARK ] && [ $ARROW_SOURCE = "arrow-apache-arrow-5.0.0" ] ; then
         # use our built jemalloc
         patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-force-use-external-jemalloc.patch
+        # fix arrow parquet exception handling
         patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-fix-exception-handling.patch
         touch $PATCHED_MARK
     fi