From 217d702c9cb546dc8c175defc8876fd473b5c8a0 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)"
 <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Sat, 20 Mar 2021 03:28:46 +0530
Subject: [PATCH] Fix ORC reader issue with reading empty string columns
 (#7656)

The reader had a [condition that skipped updating the stream pointer when the data size was zero](https://github.com/rapidsai/cudf/blob/8773a40f4c8ce63f56ed6eb67b4eaf959106939f/cpp/src/io/orc/reader_impl.cu#L538).
However, a column such as `["", ""]` is valid data with a size of zero, and it was being read as `[null, null]`. The condition that caused this issue has therefore been removed.

I have also added test cases to validate the fix.

closes #7620

Authors:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

Approvers:
  - Devavret Makkar (@devavret)
  - Vukasin Milovanovic (@vuule)
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7656
---
 cpp/src/io/orc/reader_impl.cu      |  4 +---
 python/cudf/cudf/tests/test_orc.py | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 61adef26dab..2567b2579d7 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -535,9 +535,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
           chunk.ts_clock_rate = to_clockrate(_timestamp_type.id());
         }
         for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
-          if (chunk.strm_len[k] > 0) {
-            chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
-          }
+          chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
         }
       }
       stripe_start_row += stripe_info->numberOfRows;
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index ca8aa00f80c..fa14a0a9690 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -738,3 +738,19 @@ def test_nanoseconds_overflow():
 
     pyarrow_got = pa.orc.ORCFile(buffer).read()
     assert_eq(expected.to_pandas(), pyarrow_got.to_pandas())
+
+
+@pytest.mark.parametrize(
+    "data", [[None, ""], ["", None], [None, None], ["", ""]]
+)
+def test_empty_string_columns(data):
+    buffer = BytesIO()
+
+    expected = cudf.DataFrame({"string": data}, dtype="str")
+    expected.to_orc(buffer)
+
+    expected_pdf = pd.read_orc(buffer)
+    got_df = cudf.read_orc(buffer)
+
+    assert_eq(expected, got_df)
+    assert_eq(expected_pdf, got_df)