ORC reader: assign chunk.streams[k] for every stream index, dropping the
strm_len[k] > 0 guard, and add a round-trip test for string columns that
contain only nulls and/or empty strings.
NOTE(review): this patch arrived with its line breaks collapsed; the leading
whitespace of context lines is reconstructed here -- confirm it against the
target files (or apply with whitespace-insensitive matching).

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 61adef26dab..2567b2579d7 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -535,9 +535,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
               chunk.ts_clock_rate = to_clockrate(_timestamp_type.id());
             }
             for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
-              if (chunk.strm_len[k] > 0) {
-                chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
-              }
+              chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
             }
           }
           stripe_start_row += stripe_info->numberOfRows;
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index ca8aa00f80c..fa14a0a9690 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -738,3 +738,19 @@ def test_nanoseconds_overflow():
     pyarrow_got = pa.orc.ORCFile(buffer).read()
 
     assert_eq(expected.to_pandas(), pyarrow_got.to_pandas())
+
+
+@pytest.mark.parametrize(
+    "data", [[None, ""], ["", None], [None, None], ["", ""]]
+)
+def test_empty_string_columns(data):
+    buffer = BytesIO()
+
+    expected = cudf.DataFrame({"string": data}, dtype="str")
+    expected.to_orc(buffer)
+
+    expected_pdf = pd.read_orc(buffer)
+    got_df = cudf.read_orc(buffer)
+
+    assert_eq(expected, got_df)
+    assert_eq(expected_pdf, got_df)