Skip to content

Commit

Permalink
[Opt](Serde) optimize serialization to string on variant type (apache…
Browse files Browse the repository at this point in the history
…#43237)

1. avoid sanitizing the type each time one row is serialized
2. compare by type id instead of comparing type names

![image](https://github.com/user-attachments/assets/ad056c73-8a50-49c9-a670-4750b9609675)

`select count(cast(payload["issue"] as string))  from gharchive`

before 101s
after 15s
  • Loading branch information
eldenmoon authored and eldenmoon committed Nov 7, 2024
1 parent e682fa2 commit 8c89e13
Showing 1 changed file with 19 additions and 15 deletions.
34 changes: 19 additions & 15 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1309,14 +1309,14 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
// 2. nested array with only nulls, e.g. [null, null]. TODO: find a better way to distinguish an array null value from a real null value.
// 3. empty root jsonb value(not null)
// 4. type is nothing
bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type, int row,
const PathInData& path) {
bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type,
TypeIndex base_type_id, int row, const PathInData& path) {
// skip nulls
if (nullable && nullable->is_null_at(row)) {
return true;
}
// check if it is empty nested json array, then skip
if (type->equals(*ColumnObject::NESTED_TYPE)) {
if (base_type_id == TypeIndex::VARIANT && type->equals(*ColumnObject::NESTED_TYPE)) {
Field field = (*nullable)[row];
if (field.get_type() == Field::Types::Array) {
const auto& array = field.get<Array>();
Expand All @@ -1336,25 +1336,27 @@ bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type, in
return true;
}
// skip nothing type
if (WhichDataType(remove_nullable(get_base_type_of_array(type))).is_nothing()) {
if (base_type_id == TypeIndex::Nothing) {
return true;
}
return false;
}

Status find_and_set_leave_value(const IColumn* column, const PathInData& path,
const DataTypeSerDeSPtr& type_serde, const DataTypePtr& type,
rapidjson::Value& root,
TypeIndex base_type_index, rapidjson::Value& root,
rapidjson::Document::AllocatorType& allocator, Arena& mem_pool,
int row) {
#ifndef NDEBUG
// sanitize type and column
if (column->get_name() != type->create_column()->get_name()) {
return Status::InternalError(
"failed to set value for path {}, expected type {}, but got {} at row {}",
path.get_path(), type->get_name(), column->get_name(), row);
}
#endif
const auto* nullable = check_and_get_column<ColumnNullable>(column);
if (skip_empty_json(nullable, type, row, path)) {
if (skip_empty_json(nullable, type, base_type_index, row, path)) {
return Status::OK();
}
// TODO could cache the result of leaf nodes with it's path info
Expand Down Expand Up @@ -1474,11 +1476,12 @@ Status ColumnObject::serialize_one_row_to_json_format(int row, rapidjson::String
VLOG_DEBUG << "dump structure " << JsonFunctions::print_json_value(*doc_structure);
#endif
for (const auto& subcolumn : subcolumns) {
RETURN_IF_ERROR(find_and_set_leave_value(subcolumn->data.get_finalized_column_ptr(),
subcolumn->path,
subcolumn->data.get_least_common_type_serde(),
subcolumn->data.get_least_common_type(), root,
doc_structure->GetAllocator(), mem_pool, row));
RETURN_IF_ERROR(find_and_set_leave_value(
subcolumn->data.get_finalized_column_ptr(), subcolumn->path,
subcolumn->data.get_least_common_type_serde(),
subcolumn->data.get_least_common_type(),
subcolumn->data.least_common_type.get_base_type_id(), root,
doc_structure->GetAllocator(), mem_pool, row));
if (subcolumn->path.empty() && !root.IsObject()) {
// root was modified, only handle root node
break;
Expand Down Expand Up @@ -1547,10 +1550,11 @@ Status ColumnObject::merge_sparse_to_root_column() {
++null_count;
continue;
}
bool succ = find_and_set_leave_value(column, subcolumn->path,
subcolumn->data.get_least_common_type_serde(),
subcolumn->data.get_least_common_type(), root,
doc_structure->GetAllocator(), mem_pool, i);
bool succ = find_and_set_leave_value(
column, subcolumn->path, subcolumn->data.get_least_common_type_serde(),
subcolumn->data.get_least_common_type(),
subcolumn->data.least_common_type.get_base_type_id(), root,
doc_structure->GetAllocator(), mem_pool, i);
if (succ && subcolumn->path.empty() && !root.IsObject()) {
// root was modified, only handle root node
break;
Expand Down

0 comments on commit 8c89e13

Please sign in to comment.