diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 547d53bd00e2be..7a820845ed0a04 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -279,13 +279,15 @@ Status OrcReader::init_reader( const VExprContextSPtrs& conjuncts, bool is_acid, const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { + const std::unordered_map* slot_id_to_filter_conjuncts, + const bool hive_use_column_names) { _column_names = column_names; _colname_to_value_range = colname_to_value_range; _lazy_read_ctx.conjuncts = conjuncts; _is_acid = is_acid; _tuple_descriptor = tuple_descriptor; _row_descriptor = row_descriptor; + _is_hive1_orc_or_use_idx = !hive_use_column_names; if (not_single_slot_filter_conjuncts != nullptr && !not_single_slot_filter_conjuncts->empty()) { _not_single_slot_filter_conjuncts.insert(_not_single_slot_filter_conjuncts.end(), not_single_slot_filter_conjuncts->begin(), @@ -337,10 +339,11 @@ Status OrcReader::_init_read_columns() { // In old version slot_name_to_schema_pos may not be set in _scan_params // TODO, should be removed in 2.2 or later - _is_hive1_orc = is_hive1_orc && _scan_params.__isset.slot_name_to_schema_pos; + _is_hive1_orc_or_use_idx = (is_hive1_orc || _is_hive1_orc_or_use_idx) && + _scan_params.__isset.slot_name_to_schema_pos; for (size_t i = 0; i < _column_names->size(); ++i) { auto& col_name = (*_column_names)[i]; - if (_is_hive1_orc) { + if (_is_hive1_orc_or_use_idx) { auto iter = _scan_params.slot_name_to_schema_pos.find(col_name); if (iter != _scan_params.slot_name_to_schema_pos.end()) { int pos = iter->second; @@ -375,9 +378,10 @@ Status OrcReader::_init_read_columns() { _read_cols_lower_case.emplace_back(col_name); // For hive engine, store the orc column name to schema column name map. 
// This is for Hive 1.x orc file with internal column name _col0, _col1... - if (_is_hive1_orc) { + if (_is_hive1_orc_or_use_idx) { _removed_acid_file_col_name_to_schema_col[orc_cols[pos]] = col_name; } + _col_name_to_file_col_name[col_name] = read_col; } } @@ -708,7 +712,7 @@ bool OrcReader::_init_search_argument( if (iter == colname_to_value_range->end()) { continue; } - auto type_it = type_map.find(col_name); + auto type_it = type_map.find(_col_name_to_file_col_name[col_name]); if (type_it == type_map.end()) { continue; } @@ -913,7 +917,7 @@ Status OrcReader::_init_select_types(const orc::Type& type, int idx) { std::string name; // For hive engine, translate the column name in orc file to schema column name. // This is for Hive 1.x which use internal column name _col0, _col1... - if (_is_hive1_orc) { + if (_is_hive1_orc_or_use_idx) { name = _removed_acid_file_col_name_to_schema_col[type.getFieldName(i)]; } else { name = get_field_name_lower_case(&type, i); diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index 77eec261b0109e..c0b372dfcea5ee 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -139,14 +139,15 @@ class OrcReader : public GenericReader { const std::string& ctz, io::IOContext* io_ctx, bool enable_lazy_mat = true); ~OrcReader() override; - + //If you want to read the file by index instead of column name, set hive_use_column_names to false. 
Status init_reader( const std::vector* column_names, std::unordered_map* colname_to_value_range, const VExprContextSPtrs& conjuncts, bool is_acid, const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts); + const std::unordered_map* slot_id_to_filter_conjuncts, + const bool hive_use_column_names = true); Status set_fill_columns( const std::unordered_map>& @@ -570,9 +571,11 @@ class OrcReader : public GenericReader { // This is used for Hive 1.x which use internal column name in Orc file. // _col0, _col1... std::unordered_map _removed_acid_file_col_name_to_schema_col; - // Flag for hive engine. True if the external table engine is Hive1.x with orc col name - // as _col1, col2, ... - bool _is_hive1_orc = false; + // Flag for hive engine. + // 1. True if the external table engine is Hive1.x with orc col name as _col1, col2, ... + // 2. If true, use indexes instead of column names when reading orc tables. + bool _is_hive1_orc_or_use_idx = false; + std::unordered_map _col_name_to_file_col_name; std::unordered_map _type_map; std::vector _col_orc_type; @@ -621,6 +624,8 @@ class OrcReader : public GenericReader { // resolve schema change std::unordered_map> _converters; //for iceberg table , when table column name != file column name + //TODO(CXY) : remove _table_col_to_file_col,because we have _col_name_to_file_col_name, + // the two have the same effect. std::unordered_map _table_col_to_file_col; //support iceberg position delete . 
std::vector* _position_delete_ordered_rowids = nullptr; diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index f3b9f2ad55ca2a..57396c349ddcc4 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -300,12 +301,14 @@ Status ParquetReader::init_reader( const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts, - bool filter_groups) { + bool filter_groups, const bool hive_use_column_names) { _tuple_descriptor = tuple_descriptor; _row_descriptor = row_descriptor; _colname_to_slot_id = colname_to_slot_id; _not_single_slot_filter_conjuncts = not_single_slot_filter_conjuncts; _slot_id_to_filter_conjuncts = slot_id_to_filter_conjuncts; + _colname_to_value_range = colname_to_value_range; + _hive_use_column_names = hive_use_column_names; if (_file_metadata == nullptr) { return Status::InternalError("failed to init parquet reader, please open reader first"); } @@ -320,28 +323,59 @@ Status ParquetReader::init_reader( // e.g. table added a column after this parquet file was written. _column_names = &all_column_names; auto schema_desc = _file_metadata->schema(); - std::set required_columns(all_column_names.begin(), all_column_names.end()); - // Currently only used in iceberg, the columns are dropped but added back - std::set dropped_columns(missing_column_names.begin(), missing_column_names.end()); - // Make the order of read columns the same as physical order in parquet file - for (int i = 0; i < schema_desc.size(); ++i) { - auto name = schema_desc.get_column(i)->name; - // If the column in parquet file is included in all_column_names and not in missing_column_names, - // add it to _map_column, which means the reader should read the data of this column. 
- // Here to check against missing_column_names is for the 'Add a column back to the table - // with the same column name' case. (drop column a then add column a). - // Shouldn't read this column data in this case. - if (required_columns.find(name) != required_columns.end() && - dropped_columns.find(name) == dropped_columns.end()) { - required_columns.erase(name); - _read_columns.emplace_back(name); + if (_hive_use_column_names) { + std::set required_columns(all_column_names.begin(), all_column_names.end()); + // Currently only used in iceberg, the columns are dropped but added back + std::set dropped_columns(missing_column_names.begin(), + missing_column_names.end()); + // Make the order of read columns the same as physical order in parquet file + for (int i = 0; i < schema_desc.size(); ++i) { + auto name = schema_desc.get_column(i)->name; + // If the column in parquet file is included in all_column_names and not in missing_column_names, + // add it to _map_column, which means the reader should read the data of this column. + // Here to check against missing_column_names is for the 'Add a column back to the table + // with the same column name' case. (drop column a then add column a). + // Shouldn't read this column data in this case. 
+            if (required_columns.find(name) != required_columns.end() && + dropped_columns.find(name) == dropped_columns.end()) { + required_columns.erase(name); + _read_columns.emplace_back(name); + } + } + for (const std::string& name : required_columns) { + _missing_cols.emplace_back(name); + } + } else { + std::unordered_map new_colname_to_value_range; + const auto& table_column_idxs = _scan_params.column_idxs; + std::map table_col_id_to_idx; + for (int i = 0; i < table_column_idxs.size(); i++) { + table_col_id_to_idx.insert({table_column_idxs[i], i}); } - } - for (const std::string& name : required_columns) { - _missing_cols.emplace_back(name); - } - _colname_to_value_range = colname_to_value_range; + for (auto [id, idx] : table_col_id_to_idx) { + if (id >= schema_desc.size()) { + _missing_cols.emplace_back(all_column_names[idx]); + } else { + auto& table_col = all_column_names[idx]; + auto file_col = schema_desc.get_column(id)->name; + _read_columns.emplace_back(file_col); + + if (table_col != file_col) { + _table_col_to_file_col[table_col] = file_col; + auto iter = _colname_to_value_range->find(table_col); + if (iter == _colname_to_value_range->end()) { + continue; + } + new_colname_to_value_range[file_col] = iter->second; + _colname_to_value_range->erase(iter->first); + } + } + } + for (auto it : new_colname_to_value_range) { + _colname_to_value_range->emplace(it.first, std::move(it.second)); + } + } // build column predicates for column lazy read _lazy_read_ctx.conjuncts = conjuncts; RETURN_IF_ERROR(_init_row_groups(filter_groups)); @@ -525,6 +559,16 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } + if (!_hive_use_column_names) { + for (auto i = 0; i < block->get_names().size(); i++) { + auto& col = block->get_by_position(i); + if (_table_col_to_file_col.contains(col.name)) { + col.name = _table_col_to_file_col[col.name]; + } + } + block->initialize_index_by_name(); + } + 
SCOPED_RAW_TIMER(&_statistics.column_read_time); Status batch_st = _current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof); @@ -535,6 +579,13 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) *eof = true; return Status::OK(); } + + if (!_hive_use_column_names) { + for (auto i = 0; i < block->columns(); i++) { + block->get_by_position(i).name = (*_column_names)[i]; + } + block->initialize_index_by_name(); + } if (!batch_st.ok()) { return Status::InternalError("Read parquet file {} failed, reason = {}", _scan_range.path, batch_st.to_string()); diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index 52700aafb7fd7e..3cc262e14e69cc 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -116,7 +116,7 @@ class ParquetReader : public GenericReader { const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts, - bool filter_groups = true); + bool filter_groups = true, const bool hive_use_column_names = true); Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; @@ -283,5 +283,6 @@ class ParquetReader : public GenericReader { const std::unordered_map* _colname_to_slot_id = nullptr; const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr; const std::unordered_map* _slot_id_to_filter_conjuncts = nullptr; + bool _hive_use_column_names = false; }; } // namespace doris::vectorized diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 45ec7fe4dc7101..1f7e2df0f3452b 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -862,12 +862,21 @@ Status VFileScanner::_get_next_reader() { RETURN_IF_ERROR(paimon_reader->init_row_filters(range)); _cur_reader = std::move(paimon_reader); } 
else { + bool hive_parquet_use_column_names = true; + + if (range.__isset.table_format_params && + range.table_format_params.table_format_type == "hive" && _state != nullptr) + [[likely]] { + hive_parquet_use_column_names = + _state->query_options().hive_parquet_use_column_names; + } + std::vector place_holder; init_status = parquet_reader->init_reader( _file_col_names, place_holder, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); + &_slot_id_to_filter_conjuncts, true, hive_parquet_use_column_names); _cur_reader = std::move(parquet_reader); } need_to_get_parsed_schema = true; @@ -923,10 +932,18 @@ Status VFileScanner::_get_next_reader() { RETURN_IF_ERROR(paimon_reader->init_row_filters(range)); _cur_reader = std::move(paimon_reader); } else { + bool hive_orc_use_column_names = true; + + if (range.__isset.table_format_params && + range.table_format_params.table_format_type == "hive" && _state != nullptr) + [[likely]] { + hive_orc_use_column_names = _state->query_options().hive_orc_use_column_names; + } init_status = orc_reader->init_reader( &_file_col_names, _colname_to_value_range, _push_down_conjuncts, false, _real_tuple_desc, _default_val_row_desc.get(), - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); + &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts, + hive_orc_use_column_names); _cur_reader = std::move(orc_reader); } need_to_get_parsed_schema = true; diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run64.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run64.hql new file mode 100644 index 00000000000000..744b83418db0d0 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run64.hql @@ -0,0 +1,29 @@ +use default; + +create table simulation_hive1_orc( + `a` boolean, + `b` 
int, + `c` string +)stored as orc +LOCATION '/user/doris/preinstalled_data/orc_table/simulation_hive1_orc'; +msck repair table simulation_hive1_orc; + +create table test_hive_rename_column_parquet( + `new_a` boolean, + `new_b` int, + `c` string, + `new_d` int, + `f` string +)stored as parquet +LOCATION '/user/doris/preinstalled_data/parquet_table/test_hive_rename_column_parquet'; +msck repair table test_hive_rename_column_parquet; + +create table test_hive_rename_column_orc( + `new_a` boolean, + `new_b` int, + `c` string, + `new_d` int, + `f` string +)stored as orc +LOCATION '/user/doris/preinstalled_data/orc_table/test_hive_rename_column_orc'; +msck repair table test_hive_rename_column_orc; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/simulation_hive1_orc/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/simulation_hive1_orc/000000_0 new file mode 100644 index 00000000000000..848dc3250eeb40 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/simulation_hive1_orc/000000_0 differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0 new file mode 100644 index 00000000000000..398aed3001fb0d Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0 differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_1 new file mode 100644 index 00000000000000..e58535d6661964 Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_1 differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_2 new file mode 100644 index 00000000000000..84490d9f085582 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_2 differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_3 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_3 new file mode 100644 index 00000000000000..2c54adff6f24ff Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_hive_rename_column_orc/000000_0_copy_3 differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0 new file mode 100644 index 00000000000000..deea62bcfb37bc Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0 differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_1 new file mode 100644 index 00000000000000..45ae5dee1abbaa Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_1 differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_2 new file mode 100644 index 00000000000000..e37fc5d2eb7568 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_2 differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_3 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_3 new file mode 100644 index 00000000000000..97bb0ab8475563 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_hive_rename_column_parquet/000000_0_copy_3 differ diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index 9822855aa72ef4..ea522fcfa28e95 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -434,6 +434,10 @@ private TScanRangeLocations splitToScanRange( transactionalHiveDesc.setDeleteDeltas(deleteDeltaDescs); tableFormatFileDesc.setTransactionalHiveParams(transactionalHiveDesc); rangeDesc.setTableFormatParams(tableFormatFileDesc); + } else if (fileSplit instanceof HiveSplit) { + TTableFormatFileDesc tableFormatFileDesc = new TTableFormatFileDesc(); + tableFormatFileDesc.setTableFormatType(TableFormatType.HIVE.value()); + rangeDesc.setTableFormatParams(tableFormatFileDesc); } 
setScanParams(rangeDesc, fileSplit); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 067ec3262516f5..b9c8a91bd47c34 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -556,6 +556,10 @@ public class SessionVariable implements Serializable, Writable { public static final String ENABLE_PUSHDOWN_MINMAX_ON_UNIQUE = "enable_pushdown_minmax_on_unique"; + public static final String HIVE_PARQUET_USE_COLUMN_NAMES = "hive_parquet_use_column_names"; + + public static final String HIVE_ORC_USE_COLUMN_NAMES = "hive_orc_use_column_names"; + public static final String KEEP_CARRIAGE_RETURN = "keep_carriage_return"; public static final String ENABLE_PUSHDOWN_STRING_MINMAX = "enable_pushdown_string_minmax"; @@ -1770,11 +1774,25 @@ public void setEnableLeftZigZag(boolean enableLeftZigZag) { public int createTablePartitionMaxNum = 10000; + @VariableMgr.VarAttr(name = HIVE_PARQUET_USE_COLUMN_NAMES, + description = {"默认情况下按名称访问 Parquet 列。将此属性设置为“false”可按 Hive 表定义中的序号位置访问列。", + "Access Parquet columns by name by default. Set this property to `false` to access columns " + + "by their ordinal position in the Hive table definition."}) + public boolean hiveParquetUseColumnNames = true; + + + @VariableMgr.VarAttr(name = HIVE_ORC_USE_COLUMN_NAMES, + description = {"默认情况下按名称访问 Orc 列。将此属性设置为“false”可按 Hive 表定义中的序号位置访问列。", + "Access Orc columns by name by default. 
Set this property to `false` to access columns " + + "by their ordinal position in the Hive table definition."}) + public boolean hiveOrcUseColumnNames = true; + @VariableMgr.VarAttr(name = KEEP_CARRIAGE_RETURN, description = {"在同时处理\r和\r\n作为CSV的行分隔符时,是否保留\r", "When processing both \\n and \\r\\n as CSV line separators, should \\r be retained?"}) public boolean keepCarriageReturn = false; + @VariableMgr.VarAttr(name = FORCE_JNI_SCANNER, description = {"强制使用jni方式读取外表", "Force the use of jni mode to read external table"}) private boolean forceJniScanner = false; @@ -3435,6 +3453,8 @@ public TQueryOptions toThrift() { tResult.setReadCsvEmptyLineAsNull(readCsvEmptyLineAsNull); tResult.setSerdeDialect(getSerdeDialect()); + tResult.setHiveOrcUseColumnNames(hiveOrcUseColumnNames); + tResult.setHiveParquetUseColumnNames(hiveParquetUseColumnNames); tResult.setKeepCarriageReturn(keepCarriageReturn); return tResult; } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index e2e25619abe159..41d113497d34ca 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -311,6 +311,12 @@ struct TQueryOptions { 119: optional bool keep_carriage_return = false; // \n,\r\n split line in CSV. 122: optional i32 runtime_bloom_filter_min_size = 1048576; + + //Access Parquet/ORC columns by name by default. Set this property to `false` to access columns + //by their ordinal position in the Hive table definition. 
+ 123: optional bool hive_parquet_use_column_names = true; + 124: optional bool hive_orc_use_column_names = true; + // For cloud, to control if the content would be written into file cache 1000: optional bool disable_file_cache = false } diff --git a/regression-test/data/external_table_p0/hive/test_hive_rename_column_orc_parquet.out b/regression-test/data/external_table_p0/hive/test_hive_rename_column_orc_parquet.out new file mode 100644 index 00000000000000..fa260b9622133e --- /dev/null +++ b/regression-test/data/external_table_p0/hive/test_hive_rename_column_orc_parquet.out @@ -0,0 +1,435 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !hive1_orc_1_true -- +true 10 hello world +false 20 keep + +-- !hive1_orc_2_true -- +10 hello world true +20 keep false + +-- !hive1_orc_3_true -- +hello world true +keep false + +-- !hive1_orc_4_true -- +2 + +-- !hive1_orc_5_true -- +2 + +-- !hive1_orc_6_true -- +10 +20 + +-- !hive1_orc_7_true -- +10 1 +20 1 + +-- !hive1_orc_8_true -- +true 10 hello world + +-- !hive1_orc_9_true -- +false 20 keep + +-- !hive1_orc_10_true -- +false 20 keep + +-- !hive1_orc_11_true -- +false 20 keep + +-- !hive1_orc_12_true -- +hello world +keep + +-- !hive1_orc_1_false -- +true 10 hello world +false 20 keep + +-- !hive1_orc_2_false -- +10 hello world true +20 keep false + +-- !hive1_orc_3_false -- +hello world true +keep false + +-- !hive1_orc_4_false -- +2 + +-- !hive1_orc_5_false -- +2 + +-- !hive1_orc_6_false -- +10 +20 + +-- !hive1_orc_7_false -- +10 1 +20 1 + +-- !hive1_orc_8_false -- +true 10 hello world + +-- !hive1_orc_9_false -- +false 20 keep + +-- !hive1_orc_10_false -- +false 20 keep + +-- !hive1_orc_11_false -- +false 20 keep + +-- !hive1_orc_12_false -- +hello world +keep + +-- !rename_orc_1_true -- +\N \N hello world \N \N +\N \N keep \N \N +true 30 abcd \N \N +false 40 new adcd \N \N +true 50 xxx \N cols +false 60 yyy \N yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg 
+ +-- !rename_orc_2_true -- +\N +\N +30 +40 +50 +60 +70 +80 + +-- !rename_orc_3_true -- +\N 2 +30 1 +40 1 +50 1 +60 1 +70 1 +80 1 + +-- !rename_orc_4_true -- +true 30 abcd \N \N +true 50 xxx \N cols +true 70 hahaha 8888 abcd + +-- !rename_orc_5_true -- +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_orc_6_true -- +\N \N hello world \N \N +\N \N keep \N \N +true 30 abcd \N \N +false 40 new adcd \N \N +true 50 xxx \N cols +false 60 yyy \N yyyyyy + +-- !rename_orc_7_true -- +true 30 abcd \N \N + +-- !rename_orc_8_true -- +true +true +true + +-- !rename_orc_9_true -- + +-- !rename_orc_10_true -- + +-- !rename_orc_11_true -- +\N \N +\N \N +30 true +40 false +50 true +60 false +70 true +80 false + +-- !rename_orc_12_true -- +\N \N hello world \N \N +\N \N keep \N \N +\N \N abcd 30 true +\N \N new adcd 40 false +cols \N xxx 50 true +yyyyyy \N yyy 60 false +abcd 8888 hahaha 70 true +efg 9999 cmake 80 false + +-- !rename_orc_13_true -- +false 40 new adcd \N \N +true 50 xxx \N cols +false 60 yyy \N yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_orc_1_false -- +true 10 hello world \N \N +false 20 keep \N \N +true 30 abcd \N \N +false 40 new adcd \N \N +true 50 xxx 60 cols +false 60 yyy 100 yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_orc_2_false -- +10 +20 +30 +40 +50 +60 +70 +80 + +-- !rename_orc_3_false -- +10 1 +20 1 +30 1 +40 1 +50 1 +60 1 +70 1 +80 1 + +-- !rename_orc_4_false -- +true 10 hello world \N \N +true 30 abcd \N \N +true 50 xxx 60 cols +true 70 hahaha 8888 abcd + +-- !rename_orc_5_false -- +true 50 xxx 60 cols +false 60 yyy 100 yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_orc_6_false -- +true 10 hello world \N \N +false 20 keep \N \N +true 30 abcd \N \N +false 40 new adcd \N \N + +-- !rename_orc_7_false -- +true 30 abcd \N \N + +-- !rename_orc_8_false -- +true +true +true +true + +-- !rename_orc_9_false -- + +-- !rename_orc_10_false -- + +-- !rename_orc_11_false -- 
+10 true +20 false +30 true +40 false +50 true +60 false +70 true +80 false + +-- !rename_orc_12_false -- +\N \N hello world 10 true +\N \N keep 20 false +\N \N abcd 30 true +\N \N new adcd 40 false +cols 60 xxx 50 true +yyyyyy 100 yyy 60 false +abcd 8888 hahaha 70 true +efg 9999 cmake 80 false + +-- !rename_orc_13_false -- +true 10 hello world \N \N +false 20 keep \N \N +false 40 new adcd \N \N +true 50 xxx 60 cols +false 60 yyy 100 yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_parquet_1_true -- +\N \N hello world \N \N +\N \N keep \N \N +true 30 abcd \N \N +false 40 new adcd \N \N +true 50 xxx \N cols +false 60 yyy \N yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_parquet_2_true -- +\N +\N +30 +40 +50 +60 +70 +80 + +-- !rename_parquet_3_true -- +\N 2 +30 1 +40 1 +50 1 +60 1 +70 1 +80 1 + +-- !rename_parquet_4_true -- +true 30 abcd \N \N +true 50 xxx \N cols +true 70 hahaha 8888 abcd + +-- !rename_parquet_5_true -- +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_parquet_6_true -- +\N \N hello world \N \N +\N \N keep \N \N +true 30 abcd \N \N +false 40 new adcd \N \N +true 50 xxx \N cols +false 60 yyy \N yyyyyy + +-- !rename_parquet_7_true -- +true 30 abcd \N \N + +-- !rename_parquet_8_true -- +true +true +true + +-- !rename_parquet_9_true -- + +-- !rename_parquet_10_true -- + +-- !rename_parquet_11_true -- +\N \N +\N \N +30 true +40 false +50 true +60 false +70 true +80 false + +-- !rename_parquet_12_true -- +\N \N hello world \N \N +\N \N keep \N \N +\N \N abcd 30 true +\N \N new adcd 40 false +cols \N xxx 50 true +yyyyyy \N yyy 60 false +abcd 8888 hahaha 70 true +efg 9999 cmake 80 false + +-- !rename_parquet_13_true -- +false 40 new adcd \N \N +true 50 xxx \N cols +false 60 yyy \N yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_parquet_1_false -- +true 10 hello world \N \N +false 20 keep \N \N +true 30 abcd \N \N +false 40 new adcd \N \N +true 50 xxx 60 cols +false 60 yyy 
100 yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_parquet_2_false -- +10 +20 +30 +40 +50 +60 +70 +80 + +-- !rename_parquet_3_false -- +10 1 +20 1 +30 1 +40 1 +50 1 +60 1 +70 1 +80 1 + +-- !rename_parquet_4_false -- +true 10 hello world \N \N +true 30 abcd \N \N +true 50 xxx 60 cols +true 70 hahaha 8888 abcd + +-- !rename_parquet_5_false -- +true 50 xxx 60 cols +false 60 yyy 100 yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + +-- !rename_parquet_6_false -- +true 10 hello world \N \N +false 20 keep \N \N +true 30 abcd \N \N +false 40 new adcd \N \N + +-- !rename_parquet_7_false -- +true 30 abcd \N \N + +-- !rename_parquet_8_false -- +true +true +true +true + +-- !rename_parquet_9_false -- + +-- !rename_parquet_10_false -- + +-- !rename_parquet_11_false -- +10 true +20 false +30 true +40 false +50 true +60 false +70 true +80 false + +-- !rename_parquet_12_false -- +\N \N hello world 10 true +\N \N keep 20 false +\N \N abcd 30 true +\N \N new adcd 40 false +cols 60 xxx 50 true +yyyyyy 100 yyy 60 false +abcd 8888 hahaha 70 true +efg 9999 cmake 80 false + +-- !rename_parquet_13_false -- +true 10 hello world \N \N +false 20 keep \N \N +false 40 new adcd \N \N +true 50 xxx 60 cols +false 60 yyy 100 yyyyyy +true 70 hahaha 8888 abcd +false 80 cmake 9999 efg + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_rename_column_orc_parquet.groovy b/regression-test/suites/external_table_p0/hive/test_hive_rename_column_orc_parquet.groovy new file mode 100644 index 00000000000000..88d8a586e6847e --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_hive_rename_column_orc_parquet.groovy @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_hive_rename_column_orc_parquet", "p0,external,hive,external_docker,external_docker_hive") { + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String hivePrefix ="hive3"; + setHivePrefix(hivePrefix) + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String hmsPort = context.config.otherConfigs.get(hivePrefix + "HmsPort") + String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort") + + String catalog_name = "test_hive_schema_change2" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}', + 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hmsPort}' + ); + """ + + sql """ switch ${catalog_name} """ + sql """ use `default` """ + + + sql """ set hive_orc_use_column_names=true; """ + qt_hive1_orc_1_true """ select * from simulation_hive1_orc order by b; """ + qt_hive1_orc_2_true """ select b,c,a from simulation_hive1_orc order by b; """ + qt_hive1_orc_3_true """ select c,a from simulation_hive1_orc order by b; """ + qt_hive1_orc_4_true """ select count(*) from simulation_hive1_orc; """ + qt_hive1_orc_5_true """ select count(a) from simulation_hive1_orc; """ + qt_hive1_orc_6_true 
""" select b from simulation_hive1_orc order by b; """ + qt_hive1_orc_7_true """ select b,count(*) from simulation_hive1_orc group by b order by b; """ + qt_hive1_orc_8_true """ select * from simulation_hive1_orc where a +b = 11 ; """ + qt_hive1_orc_9_true """ select * from simulation_hive1_orc where a +b != 11 ; """ + qt_hive1_orc_10_true """ select * from simulation_hive1_orc where a +b != 11 and c = "keep"; """ + qt_hive1_orc_11_true """ select * from simulation_hive1_orc where a +b != 11 and c != "keepxxx"; """ + qt_hive1_orc_12_true """ select c from simulation_hive1_orc order by c; """ + + + sql """ set hive_orc_use_column_names=false; """ + qt_hive1_orc_1_false """ select * from simulation_hive1_orc order by b; """ + qt_hive1_orc_2_false """ select b,c,a from simulation_hive1_orc order by b; """ + qt_hive1_orc_3_false """ select c,a from simulation_hive1_orc order by b; """ + qt_hive1_orc_4_false """ select count(*) from simulation_hive1_orc; """ + qt_hive1_orc_5_false """ select count(a) from simulation_hive1_orc; """ + qt_hive1_orc_6_false """ select b from simulation_hive1_orc order by b; """ + qt_hive1_orc_7_false """ select b,count(*) from simulation_hive1_orc group by b order by b; """ + qt_hive1_orc_8_false """ select * from simulation_hive1_orc where a +b = 11 ; """ + qt_hive1_orc_9_false """ select * from simulation_hive1_orc where a +b != 11 ; """ + qt_hive1_orc_10_false """ select * from simulation_hive1_orc where a +b != 11 and c = "keep"; """ + qt_hive1_orc_11_false """ select * from simulation_hive1_orc where a +b != 11 and c != "keepxxx"; """ + qt_hive1_orc_12_false """ select c from simulation_hive1_orc order by c; """ + + + sql """ set hive_orc_use_column_names=true; """ + qt_rename_orc_1_true """ select * from test_hive_rename_column_orc order by new_b,c """; + qt_rename_orc_2_true """ select new_b from test_hive_rename_column_orc order by new_b,c """; + qt_rename_orc_3_true """ select new_b,count(*) from test_hive_rename_column_orc group 
by new_b order by new_b """; + qt_rename_orc_4_true """ select * from test_hive_rename_column_orc where new_a = 1 order by new_b,c """; + qt_rename_orc_5_true """ select * from test_hive_rename_column_orc where new_d is not null order by new_b,c """ + qt_rename_orc_6_true """ select * from test_hive_rename_column_orc where new_d is null order by new_b,c; """ + qt_rename_orc_7_true """ select * from test_hive_rename_column_orc where new_b + new_a = 31 order by new_b,c; """ + qt_rename_orc_8_true """ select new_a from test_hive_rename_column_orc where new_a = 1 order by new_b,c; """ + qt_rename_orc_9_true """ select new_b from test_hive_rename_column_orc where new_b = 1 order by new_b; """ + qt_rename_orc_10_true """ select new_b,new_d from test_hive_rename_column_orc where new_d +30*new_b=100 order by new_b,c; """ + qt_rename_orc_11_true """ select new_b,new_a from test_hive_rename_column_orc order by new_b,c,new_a; """ + qt_rename_orc_12_true """ select f,new_d,c,new_b,new_a from test_hive_rename_column_orc order by new_b,c; """ + qt_rename_orc_13_true """ select * from test_hive_rename_column_orc where new_b + new_a != 31 order by new_b,c; """ + + + + + sql """ set hive_orc_use_column_names=false; """ + qt_rename_orc_1_false """ select * from test_hive_rename_column_orc order by new_b,c """; + qt_rename_orc_2_false """ select new_b from test_hive_rename_column_orc order by new_b,c """; + qt_rename_orc_3_false """ select new_b,count(*) from test_hive_rename_column_orc group by new_b order by new_b """; + qt_rename_orc_4_false """ select * from test_hive_rename_column_orc where new_a = 1 order by new_b,c """; + qt_rename_orc_5_false """ select * from test_hive_rename_column_orc where new_d is not null order by new_b,c """ + qt_rename_orc_6_false """ select * from test_hive_rename_column_orc where new_d is null order by new_b,c; """ + qt_rename_orc_7_false """ select * from test_hive_rename_column_orc where new_b + new_a = 31 order by new_b,c; """ + 
qt_rename_orc_8_false """ select new_a from test_hive_rename_column_orc where new_a = 1 order by new_b,c; """ + qt_rename_orc_9_false """ select new_b from test_hive_rename_column_orc where new_b = 1 order by new_b; """ + qt_rename_orc_10_false """ select new_b,new_d from test_hive_rename_column_orc where new_d +30*new_b=100 order by new_b,c; """ + qt_rename_orc_11_false """ select new_b,new_a from test_hive_rename_column_orc order by new_b,c,new_a; """ + qt_rename_orc_12_false """ select f,new_d,c,new_b,new_a from test_hive_rename_column_orc order by new_b,c; """ + qt_rename_orc_13_false """ select * from test_hive_rename_column_orc where new_b + new_a != 31 order by new_b,c; """ + + + sql """ set hive_parquet_use_column_names=true; """ + qt_rename_parquet_1_true """ select * from test_hive_rename_column_parquet order by new_b,c """; + qt_rename_parquet_2_true """ select new_b from test_hive_rename_column_parquet order by new_b,c """; + qt_rename_parquet_3_true """ select new_b,count(*) from test_hive_rename_column_parquet group by new_b order by new_b """; + qt_rename_parquet_4_true """ select * from test_hive_rename_column_parquet where new_a = 1 order by new_b,c """; + qt_rename_parquet_5_true """ select * from test_hive_rename_column_parquet where new_d is not null order by new_b,c """ + qt_rename_parquet_6_true """ select * from test_hive_rename_column_parquet where new_d is null order by new_b,c; """ + qt_rename_parquet_7_true """ select * from test_hive_rename_column_parquet where new_b + new_a = 31 order by new_b,c; """ + qt_rename_parquet_8_true """ select new_a from test_hive_rename_column_parquet where new_a = 1 order by new_b,c; """ + qt_rename_parquet_9_true """ select new_b from test_hive_rename_column_parquet where new_b = 1 order by new_b; """ + qt_rename_parquet_10_true """ select new_b,new_d from test_hive_rename_column_parquet where new_d +30*new_b=100 order by new_b,c; """ + qt_rename_parquet_11_true """ select new_b,new_a from 
test_hive_rename_column_parquet order by new_b,c,new_a; """ + qt_rename_parquet_12_true """ select f,new_d,c,new_b,new_a from test_hive_rename_column_parquet order by new_b,c; """ + qt_rename_parquet_13_true """ select * from test_hive_rename_column_parquet where new_b + new_a != 31 order by new_b,c; """ + + + + + sql """ set hive_parquet_use_column_names=false; """ + qt_rename_parquet_1_false """ select * from test_hive_rename_column_parquet order by new_b,c """; + qt_rename_parquet_2_false """ select new_b from test_hive_rename_column_parquet order by new_b,c """; + qt_rename_parquet_3_false """ select new_b,count(*) from test_hive_rename_column_parquet group by new_b order by new_b """; + qt_rename_parquet_4_false """ select * from test_hive_rename_column_parquet where new_a = 1 order by new_b,c """; + qt_rename_parquet_5_false """ select * from test_hive_rename_column_parquet where new_d is not null order by new_b,c """ + qt_rename_parquet_6_false """ select * from test_hive_rename_column_parquet where new_d is null order by new_b,c; """ + qt_rename_parquet_7_false """ select * from test_hive_rename_column_parquet where new_b + new_a = 31 order by new_b,c; """ + qt_rename_parquet_8_false """ select new_a from test_hive_rename_column_parquet where new_a = 1 order by new_b,c; """ + qt_rename_parquet_9_false """ select new_b from test_hive_rename_column_parquet where new_b = 1 order by new_b; """ + qt_rename_parquet_10_false """ select new_b,new_d from test_hive_rename_column_parquet where new_d +30*new_b=100 order by new_b,c; """ + qt_rename_parquet_11_false """ select new_b,new_a from test_hive_rename_column_parquet order by new_b,c,new_a; """ + qt_rename_parquet_12_false """ select f,new_d,c,new_b,new_a from test_hive_rename_column_parquet order by new_b,c; """ + qt_rename_parquet_13_false """ select * from test_hive_rename_column_parquet where new_b + new_a != 31 order by new_b,c; """ + + + + + + } +} +/* +CREATE TABLE simulation_hive1_orc( + `_col0` boolean, 
+ `_col1` INT, + `_col2` STRING +)stored as orc; +insert into simulation_hive1_orc values(true,10,"hello world"),(false,20,"keep"); +select * from simulation_hive1_orc; +alter table simulation_hive1_orc change column `_col0` a boolean; +alter table simulation_hive1_orc change column `_col1` b int; +alter table simulation_hive1_orc change column `_col2` c string; +select * from simulation_hive1_orc; +show create table simulation_hive1_orc; + + +CREATE TABLE test_hive_rename_column_orc( + a boolean, + b INT, + c STRING +)stored as orc; +insert into test_hive_rename_column_orc values (true,10,"hello world"),(false,20,"keep"); +alter table test_hive_rename_column_orc change column a new_a boolean; +alter table test_hive_rename_column_orc change column b new_b int; +insert into test_hive_rename_column_orc values (true,30,"abcd"),(false,40,"new adcd"); +select * from test_hive_rename_column_orc; +alter table test_hive_rename_column_orc add columns(d int,f string); +insert into test_hive_rename_column_orc values (true,50,"xxx",60,"cols"),(false,60,"yyy",100,"yyyyyy"); +alter table test_hive_rename_column_orc change column d new_d int; +insert into test_hive_rename_column_orc values (true,70,"hahaha",8888,"abcd"),(false,80,"cmake",9999,"efg"); +select * from test_hive_rename_column_orc; +show create table test_hive_rename_column_orc; + + + +CREATE TABLE test_hive_rename_column_parquet( + a boolean, + b INT, + c STRING +)stored as parquet; +insert into test_hive_rename_column_parquet values (true,10,"hello world"),(false,20,"keep"); +alter table test_hive_rename_column_parquet change column a new_a boolean; +alter table test_hive_rename_column_parquet change column b new_b int; +insert into test_hive_rename_column_parquet values (true,30,"abcd"),(false,40,"new adcd"); +select * from test_hive_rename_column_parquet; +alter table test_hive_rename_column_parquet add columns(d int,f string); +insert into test_hive_rename_column_parquet values 
(true,50,"xxx",60,"cols"),(false,60,"yyy",100,"yyyyyy"); +alter table test_hive_rename_column_parquet change column d new_d int; +insert into test_hive_rename_column_parquet values (true,70,"hahaha",8888,"abcd"),(false,80,"cmake",9999,"efg"); +select * from test_hive_rename_column_parquet; +show create table test_hive_rename_column_parquet; +*/ \ No newline at end of file