Skip to content

Commit

Permalink
ARROW-17524: [C++] Correction for fields included when reading an ORC…
Browse files Browse the repository at this point in the history
… table (#13962)

I think there is a bug in the ORC reader : when we specify the fields indexes that we want to keep, it does not work correctly. Looking at the code, it seems to be because we do "includeTypes" in lieue of "include" when setting the ORC options.
It can be problematic when we want to import an ORC table containing Union types as it will do an error at the import, even if we try not to import these specific fields.

The definitions of the corresponding ORC methods are here : 
https://github.com/apache/orc/blob/72220851cbde164a22706f8d47741fd1ad3db190/c%2B%2B/src/Options.hh#L185-L191

and 
https://github.com/apache/orc/blob/72220851cbde164a22706f8d47741fd1ad3db190/c%2B%2B/src/Options.hh#L201-L207

Lead-authored-by: LouisClt <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
3 people authored and kou committed Oct 20, 2022
1 parent ff80b30 commit eea4a54
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 11 deletions.
10 changes: 4 additions & 6 deletions c_glib/test/test-orc-file-reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,8 @@ def all_columns
test("select fields") do
require_gi_bindings(3, 2, 6)
@reader.field_indices = [1, 3]
assert_equal(build_table("boolean1" => build_boolean_array([false, true]),
"short1" => build_int16_array([1024, 2048])),
assert_equal(build_table("byte1" => build_int8_array([1, 100]),
"int1" => build_int32_array([65536, 65536])),
@reader.read_stripes)
end
end
Expand All @@ -200,10 +200,8 @@ def all_columns
test("select fields") do
require_gi_bindings(3, 2, 6)
@reader.field_indices = [1, 3]
boolean1 = build_boolean_array([false, true])
short1 = build_int16_array([1024, 2048])
assert_equal(build_record_batch("boolean1" => boolean1,
"short1" => short1),
assert_equal(build_record_batch("byte1" => build_int8_array([1, 100]),
"int1" => build_int32_array([65536, 65536])),
@reader.read_stripe(0))
end
end
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ class ORCFileReader::Impl {
ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
include_indices_list.push_back(*it);
}
opts->includeTypes(include_indices_list);
opts->include(include_indices_list);
return Status::OK();
}

Expand Down
40 changes: 38 additions & 2 deletions cpp/src/arrow/adapters/orc/adapter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,8 @@ std::shared_ptr<Table> GenerateRandomTable(const std::shared_ptr<Schema>& schema

void AssertTableWriteReadEqual(const std::shared_ptr<Table>& input_table,
const std::shared_ptr<Table>& expected_output_table,
const int64_t max_size = kDefaultSmallMemStreamSize) {
const int64_t max_size = kDefaultSmallMemStreamSize,
std::vector<int>* opt_selected_read_indices = nullptr) {
EXPECT_OK_AND_ASSIGN(auto buffer_output_stream,
io::BufferOutputStream::Create(max_size));
auto write_options = adapters::orc::WriteOptions();
Expand All @@ -250,7 +251,11 @@ void AssertTableWriteReadEqual(const std::shared_ptr<Table>& input_table,
ASSERT_EQ(reader->GetCompression(), write_options.compression);
ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size);
ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride);
EXPECT_OK_AND_ASSIGN(auto actual_output_table, reader->Read());
EXPECT_OK_AND_ASSIGN(auto actual_output_table,
opt_selected_read_indices == nullptr
? reader->Read()
: reader->Read(*opt_selected_read_indices));
ASSERT_OK(actual_output_table->ValidateFull());
AssertTablesEqual(*expected_output_table, *actual_output_table, false, false);
}

Expand Down Expand Up @@ -451,6 +456,37 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) {
std::shared_ptr<Table> table = TableFromJSON(table_schema, {});
AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16);
}
TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) {
std::shared_ptr<Table> table = TableFromJSON(table_schema, {R"([])"});
std::shared_ptr<Schema> schema_selected =
schema({field("int8", int8()), field("int32", int32())});
std::shared_ptr<Table> table_selected = TableFromJSON(schema_selected, {R"([])"});
std::vector<int> selected_indices = {1, 3};
AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16,
&selected_indices);
}
TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) {
std::vector<int> selected_indices = {1, 7};
random::RandomArrayGenerator rand(kRandomSeed);
std::shared_ptr<Schema> local_schema = schema({
field("bool", boolean()),
field("int32", int32()),
field("int64", int64()),
field("float", float32()),
field("struct", struct_({field("a", utf8()), field("b", int64())})),
field("double", float64()),
field("date32", date32()),
field("ts3", timestamp(TimeUnit::NANO)),
field("string", utf8()),
field("binary", binary()),
});
auto batch = rand.BatchOf(local_schema->fields(), 100);
std::shared_ptr<Table> table = Table::Make(local_schema, batch->columns());
EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices));
AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize,
&selected_indices);
}

class TestORCWriterTrivialWithConversion : public ::testing::Test {
public:
TestORCWriterTrivialWithConversion() {
Expand Down
4 changes: 2 additions & 2 deletions ruby/red-arrow/test/test-orc.rb
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def pp_values(values)
]
end
assert_equal([
["boolean1: bool", [pp_values([false, true])]],
["short1: int16", [pp_values([1024, 2048])]],
["byte1: int8", [pp_values([1, 100])]],
["int1: int32", [pp_values([65536, 65536])]],
],
dump)
end
Expand Down

0 comments on commit eea4a54

Please sign in to comment.