Skip to content

Commit

Permalink
apacheGH-40633: [C++][Python] Fix ORC crash when file contains unknow…
Browse files Browse the repository at this point in the history
…n timezone (apache#45051)

### Rationale for this change

If the timezone database is present on the system, but does not contain a timezone referenced in a ORC file, the ORC reader will crash with an uncaught C++ exception.

This can happen for example on Ubuntu 24.04 where some timezone aliases have been removed from the main `tzdata` package to a `tzdata-legacy` package. If `tzdata-legacy` is not installed, trying to read a ORC file that references e.g. the "US/Pacific" timezone would crash.

Here is a backtrace excerpt:
```
#12 0x00007f1a3ce23a55 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#13 0x00007f1a3ce39391 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6
#14 0x00007f1a3f4accc4 in orc::loadTZDB(std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
#15 0x00007f1a3f4ad392 in std::call_once<orc::LazyTimezone::getImpl() const::{lambda()#1}>(std::once_flag&, orc::LazyTimezone::getImpl() const::{lambda()#1}&&)::{lambda()#2}::_FUN() () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
#16 0x00007f1a4298bec3 in __pthread_once_slow (once_control=0xa5ca7c8, init_routine=0x7f1a3ce69420 <__once_proxy>) at ./nptl/pthread_once.c:116
#17 0x00007f1a3f4a9ad0 in orc::LazyTimezone::getEpoch() const ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
#18 0x00007f1a3f4e76b1 in orc::TimestampColumnReader::TimestampColumnReader(orc::Type const&, orc::StripeStreams&, bool) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
#19 0x00007f1a3f4e84ad in orc::buildReader(orc::Type const&, orc::StripeStreams&, bool, bool, bool) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#20 0x00007f1a3f4e8dd7 in orc::StructColumnReader::StructColumnReader(orc::Type const&, orc::StripeStreams&, bool, bool) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#21 0x00007f1a3f4e8532 in orc::buildReader(orc::Type const&, orc::StripeStreams&, bool, bool, bool) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#22 0x00007f1a3f4925e9 in orc::RowReaderImpl::startNextStripe() ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#23 0x00007f1a3f492c9d in orc::RowReaderImpl::next(orc::ColumnVectorBatch&) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#24 0x00007f1a3e6b251f in arrow::adapters::orc::ORCFileReader::Impl::ReadBatch(orc::RowReaderOptions const&, std::shared_ptr<arrow::Schema> const&, long) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
```

### What changes are included in this PR?

Catch C++ exceptions when iterating ORC batches instead of letting them slip through.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.
* GitHub Issue: apache#40633

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
pitrou authored Dec 18, 2024
1 parent 5ec8b64 commit 1e5d6e5
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 9 deletions.
16 changes: 8 additions & 8 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,28 +145,29 @@ class OrcStripeReader : public RecordBatchReader {

Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
std::unique_ptr<liborc::ColumnVectorBatch> batch;
ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_));
std::unique_ptr<RecordBatchBuilder> builder;

ORC_BEGIN_CATCH_NOT_OK
batch = row_reader_->createRowBatch(batch_size_);

const liborc::Type& type = row_reader_->getSelectedType();
if (!row_reader_->next(*batch)) {
out->reset();
return Status::OK();
}

std::unique_ptr<RecordBatchBuilder> builder;
ARROW_ASSIGN_OR_RAISE(builder,
RecordBatchBuilder::Make(schema_, pool_, batch->numElements));

// The top-level type must be a struct to read into an arrow table
const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);

for (int i = 0; i < builder->num_fields(); i++) {
RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
batch->numElements, builder->GetField(i)));
}
ORC_END_CATCH_NOT_OK

ARROW_ASSIGN_OR_RAISE(*out, builder->Flush());
return Status::OK();
return builder->Flush().Value(out);
}

private:
Expand Down Expand Up @@ -470,15 +471,13 @@ class ORCFileReader::Impl {
int64_t nrows) {
std::unique_ptr<liborc::RowReader> row_reader;
std::unique_ptr<liborc::ColumnVectorBatch> batch;
std::unique_ptr<RecordBatchBuilder> builder;

ORC_BEGIN_CATCH_NOT_OK
row_reader = reader_->createRowReader(opts);
batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch));
ORC_END_CATCH_NOT_OK

std::unique_ptr<RecordBatchBuilder> builder;
ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema, pool_, nrows));

// The top-level type must be a struct to read into an arrow table
const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);

Expand All @@ -489,6 +488,7 @@ class ORCFileReader::Impl {
batch->numElements, builder->GetField(i)));
}
}
ORC_END_CATCH_NOT_OK

return builder->Flush();
}
Expand Down
58 changes: 57 additions & 1 deletion python/pyarrow/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@
# specific language governing permissions and limitations
# under the License.

import pytest
import decimal
import datetime
from pathlib import Path
import shutil
import subprocess
import sys

import pytest

import pyarrow as pa
from pyarrow import fs
Expand Down Expand Up @@ -140,6 +145,57 @@ def test_example_using_json(filename, datadir):
check_example_file(path, table, need_fix=True)


def test_timezone_database_absent(datadir):
# Example file relies on the timezone "US/Pacific". It should gracefully
# fail, not crash, if the timezone database is not found.
path = datadir / 'TestOrcFile.testDate1900.orc'
code = f"""if 1:
import os
os.environ['TZDIR'] = '/tmp/non_existent'
from pyarrow import orc
try:
orc_file = orc.ORCFile({str(path)!r})
orc_file.read()
except Exception as e:
assert "time zone database" in str(e).lower(), e
else:
assert False, "Should have raised exception"
"""
subprocess.run([sys.executable, "-c", code], check=True)


def test_timezone_absent(datadir, tmpdir):
# Example file relies on the timezone "US/Pacific". It should gracefully
# fail, not crash, if the timezone database is present but the timezone
# is not found (GH-40633).
source_tzdir = Path('/usr/share/zoneinfo')
if not source_tzdir.exists():
pytest.skip(f"Test needs timezone database in {source_tzdir}")
tzdir = Path(tmpdir / 'zoneinfo')
try:
shutil.copytree(source_tzdir, tzdir, symlinks=True)
except OSError as e:
pytest.skip(f"Failed to copy timezone database: {e}")
(tzdir / 'US' / 'Pacific').unlink(missing_ok=True)

path = datadir / 'TestOrcFile.testDate1900.orc'
code = f"""if 1:
import os
os.environ['TZDIR'] = {str(tzdir)!r}
from pyarrow import orc
orc_file = orc.ORCFile({str(path)!r})
try:
orc_file.read()
except Exception as e:
assert "zoneinfo/US/Pacific" in str(e), e
else:
assert False, "Should have raised exception"
"""
subprocess.run([sys.executable, "-c", code], check=True)


def test_orcfile_empty(datadir):
from pyarrow import orc

Expand Down

0 comments on commit 1e5d6e5

Please sign in to comment.