Skip to content

Commit

Permalink
[Fix](orc-reader) Fix StringRef nullptr data in orc-reader. (apache#4…
Browse files Browse the repository at this point in the history
…0857)

## Proposed changes

### Issue
```
/var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:1046:9: runtime error: reference binding to null pointer of type 'doris::StringRef'
    #0 0x55ee63eb0418 in std::vector<doris::StringRef, std::allocator<doris::StringRef>>::operator[](unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:1046:2
    #1 0x55ee63eb0418 in doris::Status doris::vectorized::OrcReader::_decode_string_non_dict_encoded_column<false>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn> const&, orc::TypeKind const&, orc::EncodedStringVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1172:39
    #2 0x55ee63ea2685 in doris::Status doris::vectorized::OrcReader::_decode_string_column<false>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn> const&, orc::TypeKind const&, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1124:16
    #3 0x55ee63e97e7a in doris::Status doris::vectorized::OrcReader::_fill_doris_data_column<false>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn>&, std::shared_ptr<doris::vectorized::IDataType const> const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1365:16
    #4 0x55ee63b0e450 in doris::Status doris::vectorized::OrcReader::_orc_column_to_doris_column<false>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, COW<doris::vectorized::IColumn>::immutable_ptr<doris::vectorized::IColumn>&, std::shared_ptr<doris::vectorized::IDataType const> const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1532:5
    #5 0x55ee63e99622 in doris::Status doris::vectorized::OrcReader::_fill_doris_data_column<false>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn>&, std::shared_ptr<doris::vectorized::IDataType const> const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1410:9
    #6 0x55ee63b0e450 in doris::Status doris::vectorized::OrcReader::_orc_column_to_doris_column<false>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> const&, COW<doris::vectorized::IColumn>::immutable_ptr<doris::vectorized::IColumn>&, std::shared_ptr<doris::vectorized::IDataType const> const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1532:5
    #7 0x55ee63ad4f86 in doris::vectorized::OrcReader::get_next_block_impl(doris::vectorized::Block*, unsigned long*, bool*) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1714:13
    #8 0x55ee63ad093b in doris::vectorized::OrcReader::get_next_block(doris::vectorized::Block*, unsigned long*, bool*) /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1547:5
```
### Solution
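[Fix] (orc-reader) Fix StringRef nullptr data in orc-reader. When a string
is empty in an ORC row batch, its data pointer can point to anything, possibly
nullptr, and StringRef has undefined behavior when its data is nullptr. The fix
substitutes a valid empty-string pointer whenever the (trimmed) length is 0.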

Related to apache#37845.
  • Loading branch information
kaka11chen authored Sep 18, 2024
1 parent a3295d7 commit d5133be
Showing 1 changed file with 28 additions and 14 deletions.
42 changes: 28 additions & 14 deletions be/src/vec/exec/format/orc/vorc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1140,8 +1140,9 @@ Status OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_
if (cvb->hasNulls) {
for (int i = 0; i < num_values; ++i) {
if (cvb->notNull[i]) {
string_values.emplace_back(cvb->data[i],
trim_right(cvb->data[i], cvb->length[i]));
size_t length = trim_right(cvb->data[i], cvb->length[i]);
string_values.emplace_back((length > 0) ? cvb->data[i] : empty_string.data(),
length);
} else {
// Orc doesn't fill null values in new batch, but the former batch has been released.
// Other types like int/long/timestamp... are flat types without pointer in them,
Expand All @@ -1151,21 +1152,26 @@ Status OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_
}
} else {
for (int i = 0; i < num_values; ++i) {
string_values.emplace_back(cvb->data[i], trim_right(cvb->data[i], cvb->length[i]));
size_t length = trim_right(cvb->data[i], cvb->length[i]);
string_values.emplace_back((length > 0) ? cvb->data[i] : empty_string.data(),
length);
}
}
} else {
if (cvb->hasNulls) {
for (int i = 0; i < num_values; ++i) {
if (cvb->notNull[i]) {
string_values.emplace_back(cvb->data[i], cvb->length[i]);
string_values.emplace_back(
(cvb->length[i] > 0) ? cvb->data[i] : empty_string.data(),
cvb->length[i]);
} else {
string_values.emplace_back(empty_string.data(), 0);
}
}
} else {
for (int i = 0; i < num_values; ++i) {
string_values.emplace_back(cvb->data[i], cvb->length[i]);
string_values.emplace_back(
(cvb->length[i] > 0) ? cvb->data[i] : empty_string.data(), cvb->length[i]);
}
}
}
Expand Down Expand Up @@ -1204,7 +1210,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
if (length > max_value_length) {
max_value_length = length;
}
string_values.emplace_back(val_ptr, length);
string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW,
length);
} else {
// Orc doesn't fill null values in new batch, but the former batch has been released.
// Other types like int/long/timestamp... are flat types without pointer in them,
Expand All @@ -1227,7 +1234,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
if (length > max_value_length) {
max_value_length = length;
}
string_values.emplace_back(val_ptr, length);
string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW,
length);
}
}
} else {
Expand All @@ -1246,7 +1254,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
if (length > max_value_length) {
max_value_length = length;
}
string_values.emplace_back(val_ptr, length);
string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW,
length);
} else {
string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
}
Expand All @@ -1265,7 +1274,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
if (length > max_value_length) {
max_value_length = length;
}
string_values.emplace_back(val_ptr, length);
string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW,
length);
}
}
}
Expand Down Expand Up @@ -2068,7 +2078,7 @@ Status OrcReader::on_string_dicts_loaded(
char* val_ptr;
int64_t length;
dict->getValueByIndex(i, val_ptr, length);
StringRef dict_value(val_ptr, length);
StringRef dict_value((length > 0) ? val_ptr : "", length);
if (length > max_value_length) {
max_value_length = length;
}
Expand Down Expand Up @@ -2328,7 +2338,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
if (length > max_value_length) {
max_value_length = length;
}
string_values.emplace_back(val_ptr, length);
string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW,
length);
} else {
// Orc doesn't fill null values in new batch, but the former batch has been released.
// Other types like int/long/timestamp... are flat types without pointer in them,
Expand All @@ -2346,7 +2357,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
if (length > max_value_length) {
max_value_length = length;
}
string_values.emplace_back(val_ptr, length);
string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW,
length);
}
}
} else {
Expand All @@ -2361,7 +2373,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
if (length > max_value_length) {
max_value_length = length;
}
string_values.emplace_back(val_ptr, length);
string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW,
length);
} else {
string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
}
Expand All @@ -2375,7 +2388,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
if (length > max_value_length) {
max_value_length = length;
}
string_values.emplace_back(val_ptr, length);
string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW,
length);
}
}
}
Expand Down

0 comments on commit d5133be

Please sign in to comment.