From 1e5d6e5c71d67379a45182aa03a38e18a9ca9a49 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 18 Dec 2024 06:38:36 +0100 Subject: [PATCH] GH-40633: [C++][Python] Fix ORC crash when file contains unknown timezone (#45051) ### Rationale for this change If the timezone database is present on the system, but does not contain a timezone referenced in a ORC file, the ORC reader will crash with an uncaught C++ exception. This can happen for example on Ubuntu 24.04 where some timezone aliases have been removed from the main `tzdata` package to a `tzdata-legacy` package. If `tzdata-legacy` is not installed, trying to read a ORC file that references e.g. the "US/Pacific" timezone would crash. Here is a backtrace excerpt: ``` #12 0x00007f1a3ce23a55 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6 #13 0x00007f1a3ce39391 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6 #14 0x00007f1a3f4accc4 in orc::loadTZDB(std::basic_string, std::allocator > const&) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #15 0x00007f1a3f4ad392 in std::call_once(std::once_flag&, orc::LazyTimezone::getImpl() const::{lambda()#1}&&)::{lambda()#2}::_FUN() () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #16 0x00007f1a4298bec3 in __pthread_once_slow (once_control=0xa5ca7c8, init_routine=0x7f1a3ce69420 <__once_proxy>) at ./nptl/pthread_once.c:116 #17 0x00007f1a3f4a9ad0 in orc::LazyTimezone::getEpoch() const () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #18 0x00007f1a3f4e76b1 in orc::TimestampColumnReader::TimestampColumnReader(orc::Type const&, orc::StripeStreams&, bool) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #19 0x00007f1a3f4e84ad in orc::buildReader(orc::Type const&, orc::StripeStreams&, bool, bool, bool) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #20 0x00007f1a3f4e8dd7 in orc::StructColumnReader::StructColumnReader(orc::Type const&, orc::StripeStreams&, bool, bool) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #21 0x00007f1a3f4e8532 in orc::buildReader(orc::Type const&, orc::StripeStreams&, bool, bool, bool) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #22 0x00007f1a3f4925e9 in orc::RowReaderImpl::startNextStripe() () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #23 0x00007f1a3f492c9d in orc::RowReaderImpl::next(orc::ColumnVectorBatch&) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 #24 0x00007f1a3e6b251f in arrow::adapters::orc::ORCFileReader::Impl::ReadBatch(orc::RowReaderOptions const&, std::shared_ptr const&, long) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 ``` ### What changes are included in this PR? Catch C++ exceptions when iterating ORC batches instead of letting them slip through. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40633 Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- cpp/src/arrow/adapters/orc/adapter.cc | 16 ++++---- python/pyarrow/tests/test_orc.py | 58 ++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index d16b6cfd2e97d..51cca497485ce 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -145,7 +145,10 @@ class OrcStripeReader : public RecordBatchReader { Status ReadNext(std::shared_ptr* out) override { std::unique_ptr batch; - ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_)); + std::unique_ptr builder; + + ORC_BEGIN_CATCH_NOT_OK + batch = row_reader_->createRowBatch(batch_size_); const liborc::Type& type = row_reader_->getSelectedType(); if (!row_reader_->next(*batch)) { @@ -153,10 +156,8 @@ class OrcStripeReader : public RecordBatchReader { return Status::OK(); } - std::unique_ptr builder; ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema_, pool_, batch->numElements)); - // The top-level type must be a struct to read into an arrow table const auto& struct_batch = checked_cast(*batch); @@ -164,9 +165,9 @@ class OrcStripeReader : public RecordBatchReader { RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0, batch->numElements, builder->GetField(i))); } + ORC_END_CATCH_NOT_OK - ARROW_ASSIGN_OR_RAISE(*out, builder->Flush()); - return Status::OK(); + return builder->Flush().Value(out); } private: @@ -470,15 +471,13 @@ class ORCFileReader::Impl { int64_t nrows) { std::unique_ptr row_reader; std::unique_ptr batch; + std::unique_ptr builder; ORC_BEGIN_CATCH_NOT_OK row_reader = reader_->createRowReader(opts); batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch)); - ORC_END_CATCH_NOT_OK - std::unique_ptr builder; ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema, pool_, nrows)); - // The top-level type must be a struct to read into an arrow table const auto& struct_batch = checked_cast(*batch); @@ -489,6 +488,7 @@ class ORCFileReader::Impl { batch->numElements, builder->GetField(i))); } } + ORC_END_CATCH_NOT_OK return builder->Flush(); } diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 1b467d523304c..b0f9e813b103d 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -15,9 +15,14 @@ # specific language governing permissions and limitations # under the License. -import pytest import decimal import datetime +from pathlib import Path +import shutil +import subprocess +import sys + +import pytest import pyarrow as pa from pyarrow import fs @@ -140,6 +145,57 @@ def test_example_using_json(filename, datadir): check_example_file(path, table, need_fix=True) +def test_timezone_database_absent(datadir): + # Example file relies on the timezone "US/Pacific". It should gracefully + # fail, not crash, if the timezone database is not found. + path = datadir / 'TestOrcFile.testDate1900.orc' + code = f"""if 1: + import os + os.environ['TZDIR'] = '/tmp/non_existent' + + from pyarrow import orc + try: + orc_file = orc.ORCFile({str(path)!r}) + orc_file.read() + except Exception as e: + assert "time zone database" in str(e).lower(), e + else: + assert False, "Should have raised exception" + """ + subprocess.run([sys.executable, "-c", code], check=True) + + +def test_timezone_absent(datadir, tmpdir): + # Example file relies on the timezone "US/Pacific". It should gracefully + # fail, not crash, if the timezone database is present but the timezone + # is not found (GH-40633). + source_tzdir = Path('/usr/share/zoneinfo') + if not source_tzdir.exists(): + pytest.skip(f"Test needs timezone database in {source_tzdir}") + tzdir = Path(tmpdir / 'zoneinfo') + try: + shutil.copytree(source_tzdir, tzdir, symlinks=True) + except OSError as e: + pytest.skip(f"Failed to copy timezone database: {e}") + (tzdir / 'US' / 'Pacific').unlink(missing_ok=True) + + path = datadir / 'TestOrcFile.testDate1900.orc' + code = f"""if 1: + import os + os.environ['TZDIR'] = {str(tzdir)!r} + + from pyarrow import orc + orc_file = orc.ORCFile({str(path)!r}) + try: + orc_file.read() + except Exception as e: + assert "zoneinfo/US/Pacific" in str(e), e + else: + assert False, "Should have raised exception" + """ + subprocess.run([sys.executable, "-c", code], check=True) + + def test_orcfile_empty(datadir): from pyarrow import orc