From eea4a54f66a36137f4204876a68e6d5ed913cbc5 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Tue, 18 Oct 2022 22:27:15 +0200 Subject: [PATCH] ARROW-17524: [C++] Correction for fields included when reading an ORC table (#13962) I think there is a bug in the ORC reader : when we specify the fields indexes that we want to keep, it does not work correctly. Looking at the code, it seems to be because we do "includeTypes" in lieue of "include" when setting the ORC options. It can be problematic when we want to import an ORC table containing Union types as it will do an error at the import, even if we try not to import these specific fields. The definitions of the corresponding ORC methods are here : https://github.com/apache/orc/blob/72220851cbde164a22706f8d47741fd1ad3db190/c%2B%2B/src/Options.hh#L185-L191 and https://github.com/apache/orc/blob/72220851cbde164a22706f8d47741fd1ad3db190/c%2B%2B/src/Options.hh#L201-L207 Lead-authored-by: LouisClt Co-authored-by: Antoine Pitrou Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- c_glib/test/test-orc-file-reader.rb | 10 +++--- cpp/src/arrow/adapters/orc/adapter.cc | 2 +- cpp/src/arrow/adapters/orc/adapter_test.cc | 40 ++++++++++++++++++++-- ruby/red-arrow/test/test-orc.rb | 4 +-- 4 files changed, 45 insertions(+), 11 deletions(-) diff --git a/c_glib/test/test-orc-file-reader.rb b/c_glib/test/test-orc-file-reader.rb index 38900cf12f3d0..6626c67c3abe4 100644 --- a/c_glib/test/test-orc-file-reader.rb +++ b/c_glib/test/test-orc-file-reader.rb @@ -185,8 +185,8 @@ def all_columns test("select fields") do require_gi_bindings(3, 2, 6) @reader.field_indices = [1, 3] - assert_equal(build_table("boolean1" => build_boolean_array([false, true]), - "short1" => build_int16_array([1024, 2048])), + assert_equal(build_table("byte1" => build_int8_array([1, 100]), + "int1" => build_int32_array([65536, 65536])), @reader.read_stripes) end end @@ -200,10 +200,8 @@ def all_columns test("select fields") do require_gi_bindings(3, 2, 6) @reader.field_indices = [1, 3] - boolean1 = build_boolean_array([false, true]) - short1 = build_int16_array([1024, 2048]) - assert_equal(build_record_batch("boolean1" => boolean1, - "short1" => short1), + assert_equal(build_record_batch("byte1" => build_int8_array([1, 100]), + "int1" => build_int32_array([65536, 65536])), @reader.read_stripe(0)) end end diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 5af5ebccc8470..18f88bc6dfb91 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -411,7 +411,7 @@ class ORCFileReader::Impl { ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index")); include_indices_list.push_back(*it); } - opts->includeTypes(include_indices_list); + opts->include(include_indices_list); return Status::OK(); } diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 5c234cc97c4a0..afc4bdb1d3b24 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -226,7 +226,8 @@ std::shared_ptr GenerateRandomTable(const std::shared_ptr& schema void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, const std::shared_ptr
& expected_output_table, - const int64_t max_size = kDefaultSmallMemStreamSize) { + const int64_t max_size = kDefaultSmallMemStreamSize, + std::vector* opt_selected_read_indices = nullptr) { EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, io::BufferOutputStream::Create(max_size)); auto write_options = adapters::orc::WriteOptions(); @@ -250,7 +251,11 @@ void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, ASSERT_EQ(reader->GetCompression(), write_options.compression); ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size); ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride); - EXPECT_OK_AND_ASSIGN(auto actual_output_table, reader->Read()); + EXPECT_OK_AND_ASSIGN(auto actual_output_table, + opt_selected_read_indices == nullptr + ? reader->Read() + : reader->Read(*opt_selected_read_indices)); + ASSERT_OK(actual_output_table->ValidateFull()); AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); } @@ -451,6 +456,37 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) { std::shared_ptr
table = TableFromJSON(table_schema, {}); AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16); } +TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { + std::shared_ptr
table = TableFromJSON(table_schema, {R"([])"}); + std::shared_ptr schema_selected = + schema({field("int8", int8()), field("int32", int32())}); + std::shared_ptr
table_selected = TableFromJSON(schema_selected, {R"([])"}); + std::vector selected_indices = {1, 3}; + AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16, + &selected_indices); +} +TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { + std::vector selected_indices = {1, 7}; + random::RandomArrayGenerator rand(kRandomSeed); + std::shared_ptr local_schema = schema({ + field("bool", boolean()), + field("int32", int32()), + field("int64", int64()), + field("float", float32()), + field("struct", struct_({field("a", utf8()), field("b", int64())})), + field("double", float64()), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary()), + }); + auto batch = rand.BatchOf(local_schema->fields(), 100); + std::shared_ptr
table = Table::Make(local_schema, batch->columns()); + EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices)); + AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize, + &selected_indices); +} + class TestORCWriterTrivialWithConversion : public ::testing::Test { public: TestORCWriterTrivialWithConversion() { diff --git a/ruby/red-arrow/test/test-orc.rb b/ruby/red-arrow/test/test-orc.rb index b882da0a1b5e3..4670350a09dbc 100644 --- a/ruby/red-arrow/test/test-orc.rb +++ b/ruby/red-arrow/test/test-orc.rb @@ -164,8 +164,8 @@ def pp_values(values) ] end assert_equal([ - ["boolean1: bool", [pp_values([false, true])]], - ["short1: int16", [pp_values([1024, 2048])]], + ["byte1: int8", [pp_values([1, 100])]], + ["int1: int32", [pp_values([65536, 65536])]], ], dump) end