From 024348d74b2f6d3bdbd2f2866b9de8668767e5de Mon Sep 17 00:00:00 2001 From: ZHAO Chun Date: Wed, 9 Oct 2019 22:31:27 +0800 Subject: [PATCH] Enable auto convert when check in (#1926) Leverage gitattributes to automatically convert line endings to LF when checking in. Convert existing CRLF line endings to LF by removing all files and checking them out again with the new .gitattributes file. Apart from .gitattributes itself, files are modified only in their line endings. --- .gitattributes | 16 + be/src/exec/es_scan_node.cpp | 1754 ++++++++--------- be/src/exec/es_scan_node.h | 184 +- be/src/gutil/cpu.cc | 572 +++--- be/src/gutil/cpu.h | 180 +- be/src/olap/rowset/rowset_writer_context.h | 148 +- .../olap/rowset/segment_v2/bitshuffle_page.h | 684 +++---- .../rowset/segment_v2/bitshuffle_wrapper.cpp | 162 +- .../rowset/segment_v2/bitshuffle_wrapper.h | 68 +- be/src/olap/rowset/segment_v2/page_builder.h | 174 +- be/src/olap/rowset/segment_v2/page_decoder.h | 158 +- be/src/olap/rowset/segment_v2/rle_page.h | 512 ++--- be/src/udf/CMakeLists.txt | 64 +- be/src/util/alignment.h | 52 +- be/src/util/bit_stream_utils.h | 298 +-- be/src/util/bit_stream_utils.inline.h | 426 ++-- be/src/util/faststring.cc | 144 +- be/src/util/faststring.h | 514 ++--- be/src/util/rle_encoding.h | 1042 +++++----- be/test/exec/es_scan_node_test.cpp | 308 +-- .../segment_v2/bitshuffle_page_test.cpp | 458 ++--- .../olap/rowset/segment_v2/rle_page_test.cpp | 386 ++-- be/test/util/faststring_test.cpp | 166 +- be/test/util/rle_encoding_test.cpp | 852 ++++---- .../cn/community/subscribe-mail-list.md | 74 +- .../cn/internal/doris_storage_optimization.md | 412 ++-- .../internal/doris_storage_optimization_EN.md | 168 +- 27 files changed, 4996 insertions(+), 4980 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000000..6c01695fd205ef --- /dev/null +++ b/.gitattributes @@ -0,0 +1,16 @@ +# Use whitelist to set text on +# text means convert to LF when check in +# eol=lf 
means convert to LF when check out +*.cpp text eol=lf +*.cc text eol=lf +*.c text eol=lf +*.h text eol=lf +*.java text eol=lf +*.py text eol=lf +*.js text eol=lf +*.md text eol=lf +*.txt text eol=lf +*.sh text eol=lf +*.thrift text eol=lf +*.proto text eol=lf +*.conf text eol=lf diff --git a/be/src/exec/es_scan_node.cpp b/be/src/exec/es_scan_node.cpp index d70fbfc0084f79..c3e426c6955748 100644 --- a/be/src/exec/es_scan_node.cpp +++ b/be/src/exec/es_scan_node.cpp @@ -1,877 +1,877 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "es_scan_node.h" - -#include -#include -#include - -#include "gen_cpp/PlanNodes_types.h" -#include "gen_cpp/Exprs_types.h" -#include "runtime/runtime_state.h" -#include "runtime/row_batch.h" -#include "runtime/string_value.h" -#include "runtime/tuple_row.h" -#include "runtime/client_cache.h" -#include "util/runtime_profile.h" -#include "util/debug_util.h" -#include "service/backend_options.h" -#include "olap/olap_common.h" -#include "olap/utils.h" -#include "exprs/expr_context.h" -#include "exprs/expr.h" -#include "exprs/in_predicate.h" -#include "exprs/slot_ref.h" - -namespace doris { - -// $0 = column type (e.g. 
INT) -const string ERROR_INVALID_COL_DATA = "Data source returned inconsistent column data. " - "Expected value of type $0 based on column metadata. This likely indicates a " - "problem with the data source library."; -const string ERROR_MEM_LIMIT_EXCEEDED = "DataSourceScanNode::$0() failed to allocate " - "$1 bytes for $2."; - -EsScanNode::EsScanNode( - ObjectPool* pool, - const TPlanNode& tnode, - const DescriptorTbl& descs) : - ScanNode(pool, tnode, descs), - _tuple_id(tnode.es_scan_node.tuple_id), - _scan_range_idx(0) { - if (tnode.es_scan_node.__isset.properties) { - _properties = tnode.es_scan_node.properties; - } -} - -EsScanNode::~EsScanNode() { -} - -Status EsScanNode::prepare(RuntimeState* state) { - VLOG(1) << "EsScanNode::Prepare"; - - RETURN_IF_ERROR(ScanNode::prepare(state)); - _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); - if (_tuple_desc == nullptr) { - std::stringstream ss; - ss << "es tuple descriptor is null, _tuple_id=" << _tuple_id; - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } - _env = state->exec_env(); - - return Status::OK(); -} - -Status EsScanNode::open(RuntimeState* state) { - VLOG(1) << "EsScanNode::Open"; - - RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); - RETURN_IF_CANCELLED(state); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - RETURN_IF_ERROR(ExecNode::open(state)); - - // TExtOpenParams.row_schema - vector cols; - for (const SlotDescriptor* slot : _tuple_desc->slots()) { - TExtColumnDesc col; - col.__set_name(slot->col_name()); - col.__set_type(slot->type().to_thrift()); - cols.emplace_back(std::move(col)); - } - TExtTableSchema row_schema; - row_schema.cols = std::move(cols); - row_schema.__isset.cols = true; - - // TExtOpenParams.predicates - vector > predicates; - vector predicate_to_conjunct; - for (int i = 0; i < _conjunct_ctxs.size(); ++i) { - VLOG(1) << "conjunct: " << _conjunct_ctxs[i]->root()->debug_string(); - vector disjuncts; - if 
(get_disjuncts(_conjunct_ctxs[i], _conjunct_ctxs[i]->root(), disjuncts)) { - predicates.emplace_back(std::move(disjuncts)); - predicate_to_conjunct.push_back(i); - } - } - - // open every scan range - vector conjunct_accepted_times(_conjunct_ctxs.size(), 0); - for (int i = 0; i < _scan_ranges.size(); ++i) { - TEsScanRange& es_scan_range = _scan_ranges[i]; - - if (es_scan_range.es_hosts.empty()) { - std::stringstream ss; - ss << "es fail to open: hosts empty"; - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } - - - // TExtOpenParams - TExtOpenParams params; - params.__set_query_id(state->query_id()); - _properties["index"] = es_scan_range.index; - if (es_scan_range.__isset.type) { - _properties["type"] = es_scan_range.type; - } - _properties["shard_id"] = std::to_string(es_scan_range.shard_id); - params.__set_properties(_properties); - params.__set_row_schema(row_schema); - params.__set_batch_size(state->batch_size()); - params.__set_predicates(predicates); - TExtOpenResult result; - - // choose an es node, local is the first choice - std::string localhost = BackendOptions::get_localhost(); - bool is_success = false; - for (int j = 0; j < 2; ++j) { - for (auto& es_host : es_scan_range.es_hosts) { - if ((j == 0 && es_host.hostname != localhost) - || (j == 1 && es_host.hostname == localhost)) { - continue; - } - Status status = open_es(es_host, result, params); - if (status.ok()) { - is_success = true; - _addresses.push_back(es_host); - _scan_handles.push_back(result.scan_handle); - if (result.__isset.accepted_conjuncts) { - for (int index : result.accepted_conjuncts) { - conjunct_accepted_times[predicate_to_conjunct[index]]++; - } - } - break; - } else if (status.code() == TStatusCode::ES_SHARD_NOT_FOUND) { - // if shard not found, try other nodes - LOG(WARNING) << "shard not found on es node: " - << ", address=" << es_host - << ", scan_range_idx=" << i << ", try other nodes"; - } else { - LOG(WARNING) << "es open error: scan_range_idx=" << i 
- << ", address=" << es_host - << ", msg=" << status.get_error_msg(); - return status; - } - } - if (is_success) { - break; - } - } - - if (!is_success) { - std::stringstream ss; - ss << "es open error: scan_range_idx=" << i - << ", can't find shard on any node"; - return Status::InternalError(ss.str()); - } - } - - // remove those conjuncts that accepted by all scan ranges - for (int i = predicate_to_conjunct.size() - 1; i >= 0; i--) { - int conjunct_index = predicate_to_conjunct[i]; - if (conjunct_accepted_times[conjunct_index] == _scan_ranges.size()) { - _pushdown_conjunct_ctxs.push_back(*(_conjunct_ctxs.begin() + conjunct_index)); - _conjunct_ctxs.erase(_conjunct_ctxs.begin() + conjunct_index); - } - } - - for (int i = 0; i < _conjunct_ctxs.size(); ++i) { - if (!check_left_conjuncts(_conjunct_ctxs[i]->root())) { - return Status::InternalError("esquery could only be executed on es, but could not push down to es"); - } - } - - return Status::OK(); -} - -Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - VLOG(1) << "EsScanNode::GetNext"; - - RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); - RETURN_IF_CANCELLED(state); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - SCOPED_TIMER(materialize_tuple_timer()); - - // create tuple - MemPool* tuple_pool = row_batch->tuple_data_pool(); - int64_t tuple_buffer_size; - uint8_t* tuple_buffer = nullptr; - RETURN_IF_ERROR(row_batch->resize_and_allocate_tuple_buffer(state, &tuple_buffer_size, &tuple_buffer)); - Tuple* tuple = reinterpret_cast(tuple_buffer); - - // get batch - TExtGetNextResult result; - RETURN_IF_ERROR(get_next_from_es(result)); - _offsets[_scan_range_idx] += result.rows.num_rows; - - // convert - VLOG(1) << "begin to convert: scan_range_idx=" << _scan_range_idx - << ", num_rows=" << result.rows.num_rows; - vector& cols = result.rows.cols; - // indexes of the next non-null value in the row batch, per column. 
- vector cols_next_val_idx(_tuple_desc->slots().size(), 0); - for (int row_idx = 0; row_idx < result.rows.num_rows; row_idx++) { - if (reached_limit()) { - *eos = true; - break; - } - RETURN_IF_ERROR(materialize_row(tuple_pool, tuple, cols, row_idx, cols_next_val_idx)); - TupleRow* tuple_row = row_batch->get_row(row_batch->add_row()); - tuple_row->set_tuple(0, tuple); - if (ExecNode::eval_conjuncts(_conjunct_ctxs.data(), _conjunct_ctxs.size(), tuple_row)) { - row_batch->commit_last_row(); - tuple = reinterpret_cast( - reinterpret_cast(tuple) + _tuple_desc->byte_size()); - ++_num_rows_returned; - } - } - - VLOG(1) << "finish one batch: num_rows=" << row_batch->num_rows(); - COUNTER_SET(_rows_returned_counter, _num_rows_returned); - if (result.__isset.eos && result.eos) { - VLOG(1) << "es finish one scan_range: scan_range_idx=" << _scan_range_idx; - ++_scan_range_idx; - } - if (_scan_range_idx == _scan_ranges.size()) { - *eos = true; - } - - return Status::OK(); -} - -Status EsScanNode::close(RuntimeState* state) { - if (is_closed()) return Status::OK(); - VLOG(1) << "EsScanNode::Close"; - RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - Expr::close(_pushdown_conjunct_ctxs, state); - RETURN_IF_ERROR(ExecNode::close(state)); - for (int i = 0; i < _addresses.size(); ++i) { - TExtCloseParams params; - params.__set_scan_handle(_scan_handles[i]); - TExtCloseResult result; - -#ifndef BE_TEST - const TNetworkAddress& address = _addresses[i]; - try { - Status status; - ExtDataSourceServiceClientCache* client_cache = _env->extdatasource_client_cache(); - ExtDataSourceServiceConnection client(client_cache, address, 10000, &status); - if (!status.ok()) { - LOG(WARNING) << "es create client error: scan_range_idx=" << i - << ", address=" << address - << ", msg=" << status.get_error_msg(); - return status; - } - - try { - VLOG(1) << "es close param=" << apache::thrift::ThriftDebugString(params); - 
client->close(result, params); - } catch (apache::thrift::transport::TTransportException& e) { - LOG(WARNING) << "es close retrying, because: " << e.what(); - RETURN_IF_ERROR(client.reopen()); - client->close(result, params); - } - } catch (apache::thrift::TException &e) { - std::stringstream ss; - ss << "es close error: scan_range_idx=" << i - << ", msg=" << e.what(); - LOG(WARNING) << ss.str(); - return Status::ThriftRpcError(ss.str()); - } - - VLOG(1) << "es close result=" << apache::thrift::ThriftDebugString(result); - Status status(result.status); - if (!status.ok()) { - LOG(WARNING) << "es close error: : scan_range_idx=" << i - << ", msg=" << status.get_error_msg(); - return status; - } -#else - TStatus status; - result.__set_status(status); -#endif - } - - return Status::OK(); -} - -void EsScanNode::debug_string(int indentation_level, stringstream* out) const { - *out << string(indentation_level * 2, ' '); - *out << "EsScanNode(tupleid=" << _tuple_id; - *out << ")" << std::endl; - - for (int i = 0; i < _children.size(); ++i) { - _children[i]->debug_string(indentation_level + 1, out); - } -} - -Status EsScanNode::set_scan_ranges(const vector& scan_ranges) { - for (int i = 0; i < scan_ranges.size(); ++i) { - TScanRangeParams scan_range = scan_ranges[i]; - DCHECK(scan_range.scan_range.__isset.es_scan_range); - TEsScanRange es_scan_range = scan_range.scan_range.es_scan_range; - _scan_ranges.push_back(es_scan_range); - } - - _offsets.resize(scan_ranges.size(), 0); - return Status::OK(); -} - -Status EsScanNode::open_es(TNetworkAddress& address, TExtOpenResult& result, TExtOpenParams& params) { - - VLOG(1) << "es open param=" << apache::thrift::ThriftDebugString(params); -#ifndef BE_TEST - try { - ExtDataSourceServiceClientCache* client_cache = _env->extdatasource_client_cache(); - Status status; - ExtDataSourceServiceConnection client(client_cache, address, 10000, &status); - if (!status.ok()) { - std::stringstream ss; - ss << "es create client error: address=" 
<< address - << ", msg=" << status.get_error_msg(); - return Status::InternalError(ss.str()); - } - - try { - client->open(result, params); - } catch (apache::thrift::transport::TTransportException& e) { - LOG(WARNING) << "es open retrying, because: " << e.what(); - RETURN_IF_ERROR(client.reopen()); - client->open(result, params); - } - VLOG(1) << "es open result=" << apache::thrift::ThriftDebugString(result); - return Status(result.status); - } catch (apache::thrift::TException &e) { - std::stringstream ss; - ss << "es open error: address=" << address << ", msg=" << e.what(); - return Status::InternalError(ss.str()); - } -#else - TStatus status; - result.__set_status(status); - result.__set_scan_handle("0"); - return Status(status); -#endif -} - -// legacy conjuncts must not contain match function -bool EsScanNode::check_left_conjuncts(Expr* conjunct) { - if (is_match_func(conjunct)) { - return false; - } else { - int num_children = conjunct->get_num_children(); - for (int child_idx = 0; child_idx < num_children; ++child_idx) { - if (!check_left_conjuncts(conjunct->get_child(child_idx))) { - return false; - } - } - return true; - } -} - -bool EsScanNode::ignore_cast(SlotDescriptor* slot, Expr* expr) { - if (slot->type().is_date_type() && expr->type().is_date_type()) { - return true; - } - if (slot->type().is_string_type() && expr->type().is_string_type()) { - return true; - } - return false; -} - -bool EsScanNode::get_disjuncts(ExprContext* context, Expr* conjunct, - vector& disjuncts) { - if (TExprNodeType::BINARY_PRED == conjunct->node_type()) { - if (conjunct->children().size() != 2) { - VLOG(1) << "get disjuncts fail: number of childs is not 2"; - return false; - } - SlotRef* slotRef; - TExprOpcode::type op; - Expr* expr; - if (TExprNodeType::SLOT_REF == conjunct->get_child(0)->node_type()) { - expr = conjunct->get_child(1); - slotRef = (SlotRef*)(conjunct->get_child(0)); - op = conjunct->op(); - } else if (TExprNodeType::SLOT_REF == 
conjunct->get_child(1)->node_type()) { - expr = conjunct->get_child(0); - slotRef = (SlotRef*)(conjunct->get_child(1)); - op = conjunct->op(); - } else { - VLOG(1) << "get disjuncts fail: no SLOT_REF child"; - return false; - } - - SlotDescriptor* slot_desc = get_slot_desc(slotRef); - if (slot_desc == nullptr) { - VLOG(1) << "get disjuncts fail: slot_desc is null"; - return false; - } - - TExtLiteral literal; - if (!to_ext_literal(context, expr, &literal)) { - VLOG(1) << "get disjuncts fail: can't get literal, node_type=" - << expr->node_type(); - return false; - } - - TExtColumnDesc columnDesc; - columnDesc.__set_name(slot_desc->col_name()); - columnDesc.__set_type(slot_desc->type().to_thrift()); - TExtBinaryPredicate binaryPredicate; - binaryPredicate.__set_col(columnDesc); - binaryPredicate.__set_op(op); - binaryPredicate.__set_value(std::move(literal)); - TExtPredicate predicate; - predicate.__set_node_type(TExprNodeType::BINARY_PRED); - predicate.__set_binary_predicate(binaryPredicate); - disjuncts.push_back(std::move(predicate)); - return true; - } else if (is_match_func(conjunct)) { - // if this is a function call expr and function name is match, then push - // down it to es - TExtFunction match_function; - match_function.__set_func_name(conjunct->fn().name.function_name); - vector query_conditions; - - - TExtLiteral literal; - if (!to_ext_literal(context, conjunct->get_child(1), &literal)) { - VLOG(1) << "get disjuncts fail: can't get literal, node_type=" - << conjunct->get_child(1)->node_type(); - return false; - } - - query_conditions.push_back(std::move(literal)); - match_function.__set_values(query_conditions); - TExtPredicate predicate; - predicate.__set_node_type(TExprNodeType::FUNCTION_CALL); - predicate.__set_ext_function(match_function); - disjuncts.push_back(std::move(predicate)); - return true; - } else if (TExprNodeType::IN_PRED == conjunct->node_type()) { - // the op code maybe FILTER_NEW_IN, it means there is function in list - // like col_a 
in (abs(1)) - if (TExprOpcode::FILTER_IN != conjunct->op() - && TExprOpcode::FILTER_NOT_IN != conjunct->op()) { - return false; - } - TExtInPredicate ext_in_predicate; - vector in_pred_values; - InPredicate* pred = dynamic_cast(conjunct); - ext_in_predicate.__set_is_not_in(pred->is_not_in()); - if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) { - return false; - } - - SlotRef* slot_ref = (SlotRef*)(conjunct->get_child(0)); - SlotDescriptor* slot_desc = get_slot_desc(slot_ref); - if (slot_desc == nullptr) { - return false; - } - TExtColumnDesc columnDesc; - columnDesc.__set_name(slot_desc->col_name()); - columnDesc.__set_type(slot_desc->type().to_thrift()); - ext_in_predicate.__set_col(columnDesc); - - if (pred->get_child(0)->type().type != slot_desc->type().type) { - if (!ignore_cast(slot_desc, pred->get_child(0))) { - return false; - } - } - - HybirdSetBase::IteratorBase* iter = pred->hybird_set()->begin(); - while (iter->has_next()) { - if (nullptr == iter->get_value()) { - return false; - } - TExtLiteral literal; - if (!to_ext_literal(slot_desc->type().type, const_cast(iter->get_value()), &literal)) { - VLOG(1) << "get disjuncts fail: can't get literal, node_type=" - << slot_desc->type().type; - return false; - } - in_pred_values.push_back(literal); - iter->next(); - } - ext_in_predicate.__set_values(in_pred_values); - TExtPredicate predicate; - predicate.__set_node_type(TExprNodeType::IN_PRED); - predicate.__set_in_predicate(ext_in_predicate); - disjuncts.push_back(std::move(predicate)); - return true; - } else if (TExprNodeType::COMPOUND_PRED == conjunct->node_type()) { - if (TExprOpcode::COMPOUND_OR != conjunct->op()) { - VLOG(1) << "get disjuncts fail: op is not COMPOUND_OR"; - return false; - } - if (!get_disjuncts(context, conjunct->get_child(0), disjuncts)) { - return false; - } - if (!get_disjuncts(context, conjunct->get_child(1), disjuncts)) { - return false; - } - return true; - } else { - VLOG(1) << "get disjuncts fail: node 
type is " << conjunct->node_type() - << ", should be BINARY_PRED or COMPOUND_PRED"; - return false; - } -} - -bool EsScanNode::is_match_func(Expr* conjunct) { - if (TExprNodeType::FUNCTION_CALL == conjunct->node_type() - && conjunct->fn().name.function_name == "esquery") { - return true; - } - return false; -} - -SlotDescriptor* EsScanNode::get_slot_desc(SlotRef* slotRef) { - std::vector slot_ids; - slotRef->get_slot_ids(&slot_ids); - SlotDescriptor* slot_desc = nullptr; - for (SlotDescriptor* slot : _tuple_desc->slots()) { - if (slot->id() == slot_ids[0]) { - slot_desc = slot; - break; - } - } - return slot_desc; -} - -bool EsScanNode::to_ext_literal(ExprContext* context, Expr* expr, TExtLiteral* literal) { - switch (expr->node_type()) { - case TExprNodeType::BOOL_LITERAL: - case TExprNodeType::INT_LITERAL: - case TExprNodeType::LARGE_INT_LITERAL: - case TExprNodeType::FLOAT_LITERAL: - case TExprNodeType::DECIMAL_LITERAL: - case TExprNodeType::STRING_LITERAL: - case TExprNodeType::DATE_LITERAL: - return to_ext_literal(expr->type().type, context->get_value(expr, NULL), literal); - default: - return false; - } -} - -bool EsScanNode::to_ext_literal(PrimitiveType slot_type, void* value, TExtLiteral* literal) { - TExprNodeType::type node_type; - switch (slot_type) { - case TYPE_BOOLEAN: { - node_type = (TExprNodeType::BOOL_LITERAL); - TBoolLiteral bool_literal; - bool_literal.__set_value(*reinterpret_cast(value)); - literal->__set_bool_literal(bool_literal); - break; - } - - case TYPE_TINYINT: { - node_type = (TExprNodeType::INT_LITERAL); - TIntLiteral int_literal; - int_literal.__set_value(*reinterpret_cast(value)); - literal->__set_int_literal(int_literal); - break; - } - case TYPE_SMALLINT: { - node_type = (TExprNodeType::INT_LITERAL); - TIntLiteral int_literal; - int_literal.__set_value(*reinterpret_cast(value)); - literal->__set_int_literal(int_literal); - break; - } - case TYPE_INT: { - node_type = (TExprNodeType::INT_LITERAL); - TIntLiteral int_literal; - 
int_literal.__set_value(*reinterpret_cast(value)); - literal->__set_int_literal(int_literal); - break; - } - case TYPE_BIGINT: { - node_type = (TExprNodeType::INT_LITERAL); - TIntLiteral int_literal; - int_literal.__set_value(*reinterpret_cast(value)); - literal->__set_int_literal(int_literal); - break; - } - - case TYPE_LARGEINT: { - node_type = (TExprNodeType::LARGE_INT_LITERAL); - char buf[48]; - int len = 48; - char* v = LargeIntValue::to_string(*reinterpret_cast<__int128*>(value), buf, &len); - TLargeIntLiteral large_int_literal; - large_int_literal.__set_value(v); - literal->__set_large_int_literal(large_int_literal); - break; - } - - case TYPE_FLOAT: { - node_type = (TExprNodeType::FLOAT_LITERAL); - TFloatLiteral float_literal; - float_literal.__set_value(*reinterpret_cast(value)); - literal->__set_float_literal(float_literal); - break; - } - case TYPE_DOUBLE: { - node_type = (TExprNodeType::FLOAT_LITERAL); - TFloatLiteral float_literal; - float_literal.__set_value(*reinterpret_cast(value)); - literal->__set_float_literal(float_literal); - break; - } - - case TYPE_DECIMAL: { - node_type = (TExprNodeType::DECIMAL_LITERAL); - TDecimalLiteral decimal_literal; - decimal_literal.__set_value(reinterpret_cast(value)->to_string()); - literal->__set_decimal_literal(decimal_literal); - break; - } - - case TYPE_DATE: - case TYPE_DATETIME: { - node_type = (TExprNodeType::DATE_LITERAL); - const DateTimeValue date_value = *reinterpret_cast(value); - char str[MAX_DTVALUE_STR_LEN]; - date_value.to_string(str); - TDateLiteral date_literal; - date_literal.__set_value(str); - literal->__set_date_literal(date_literal); - break; - } - - case TYPE_CHAR: - case TYPE_VARCHAR: { - node_type = (TExprNodeType::STRING_LITERAL); - TStringLiteral string_literal; - string_literal.__set_value((reinterpret_cast(value))->debug_string()); - literal->__set_string_literal(string_literal); - break; - } - - default: { - DCHECK(false) << "Invalid type."; - return false; - } - } - 
literal->__set_node_type(node_type); - return true; -} - -Status EsScanNode::get_next_from_es(TExtGetNextResult& result) { - TExtGetNextParams params; - params.__set_scan_handle(_scan_handles[_scan_range_idx]); - params.__set_offset(_offsets[_scan_range_idx]); - - // getNext - const TNetworkAddress &address = _addresses[_scan_range_idx]; -#ifndef BE_TEST - try { - Status create_client_status; - ExtDataSourceServiceClientCache *client_cache = _env->extdatasource_client_cache(); - ExtDataSourceServiceConnection client(client_cache, address, 10000, &create_client_status); - if (!create_client_status.ok()) { - LOG(WARNING) << "es create client error: scan_range_idx=" << _scan_range_idx - << ", address=" << address - << ", msg=" << create_client_status.get_error_msg(); - return create_client_status; - } - - try { - VLOG(1) << "es get_next param=" << apache::thrift::ThriftDebugString(params); - client->getNext(result, params); - } catch (apache::thrift::transport::TTransportException& e) { - std::stringstream ss; - ss << "es get_next error: scan_range_idx=" << _scan_range_idx - << ", msg=" << e.what(); - LOG(WARNING) << ss.str(); - RETURN_IF_ERROR(client.reopen()); - return Status::ThriftRpcError(ss.str()); - } - } catch (apache::thrift::TException &e) { - std::stringstream ss; - ss << "es get_next error: scan_range_idx=" << _scan_range_idx - << ", msg=" << e.what(); - LOG(WARNING) << ss.str(); - return Status::ThriftRpcError(ss.str()); - } -#else - TStatus status; - result.__set_status(status); - result.__set_eos(true); - TExtColumnData col_data; - std::vector is_null; - is_null.push_back(false); - col_data.__set_is_null(is_null); - std::vector int_vals; - int_vals.push_back(1); - int_vals.push_back(2); - col_data.__set_int_vals(int_vals); - std::vector cols; - cols.push_back(col_data); - TExtRowBatch rows; - rows.__set_cols(cols); - rows.__set_num_rows(2); - result.__set_rows(rows); - return Status(status); -#endif - - // check result - VLOG(1) << "es get_next result=" 
<< apache::thrift::ThriftDebugString(result); - Status get_next_status(result.status); - if (!get_next_status.ok()) { - LOG(WARNING) << "es get_next error: scan_range_idx=" << _scan_range_idx - << ", address=" << address - << ", msg=" << get_next_status.get_error_msg(); - return get_next_status; - } - if (!result.__isset.rows || !result.rows.__isset.num_rows) { - std::stringstream ss; - ss << "es get_next error: scan_range_idx=" << _scan_range_idx - << ", msg=rows or num_rows not in result"; - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } - - return Status::OK(); -} - -Status EsScanNode::materialize_row(MemPool* tuple_pool, Tuple* tuple, - const vector& cols, int row_idx, - vector& cols_next_val_idx) { - tuple->init(_tuple_desc->byte_size()); - - for (int i = 0; i < _tuple_desc->slots().size(); ++i) { - const SlotDescriptor* slot_desc = _tuple_desc->slots()[i]; - - if (!slot_desc->is_materialized()) { - continue; - } - - void* slot = tuple->get_slot(slot_desc->tuple_offset()); - const TExtColumnData& col = cols[i]; - - if (col.is_null[row_idx]) { - tuple->set_null(slot_desc->null_indicator_offset()); - continue; - } else { - tuple->set_not_null(slot_desc->null_indicator_offset()); - } - - int val_idx = cols_next_val_idx[i]++; - switch (slot_desc->type().type) { - case TYPE_CHAR: - case TYPE_VARCHAR: { - if (val_idx >= col.string_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "STRING")); - } - const string& val = col.string_vals[val_idx]; - size_t val_size = val.size(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); - if (UNLIKELY(buffer == NULL)) { - string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", - val_size, "string slot"); - return tuple_pool->mem_tracker()->MemLimitExceeded(NULL, details, val_size); - } - memcpy(buffer, val.data(), val_size); - reinterpret_cast(slot)->ptr = buffer; - reinterpret_cast(slot)->len = val_size; - 
break; - } - case TYPE_TINYINT: - if (val_idx >= col.byte_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TINYINT")); - } - *reinterpret_cast(slot) = col.byte_vals[val_idx]; - break; - case TYPE_SMALLINT: - if (val_idx >= col.short_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "SMALLINT")); - } - *reinterpret_cast(slot) = col.short_vals[val_idx]; - break; - case TYPE_INT: - if (val_idx >= col.int_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "INT")); - } - *reinterpret_cast(slot) = col.int_vals[val_idx]; - break; - case TYPE_BIGINT: - if (val_idx >= col.long_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "BIGINT")); - } - *reinterpret_cast(slot) = col.long_vals[val_idx]; - break; - case TYPE_LARGEINT: - if (val_idx >= col.long_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "LARGEINT")); - } - *reinterpret_cast(slot) = col.long_vals[val_idx]; - break; - case TYPE_DOUBLE: - if (val_idx >= col.double_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "DOUBLE")); - } - *reinterpret_cast(slot) = col.double_vals[val_idx]; - break; - case TYPE_FLOAT: - if (val_idx >= col.double_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "FLOAT")); - } - *reinterpret_cast(slot) = col.double_vals[val_idx]; - break; - case TYPE_BOOLEAN: - if (val_idx >= col.bool_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "BOOLEAN")); - } - *reinterpret_cast(slot) = col.bool_vals[val_idx]; - break; - case TYPE_DATE: - if (val_idx >= col.long_vals.size() || - !reinterpret_cast(slot)->from_unixtime(col.long_vals[val_idx], "+08:00")) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TYPE_DATE")); - } - reinterpret_cast(slot)->cast_to_date(); 
- break; - case TYPE_DATETIME: { - if (val_idx >= col.long_vals.size() || - !reinterpret_cast(slot)->from_unixtime(col.long_vals[val_idx], "+08:00")) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TYPE_DATETIME")); - } - reinterpret_cast(slot)->set_type(TIME_DATETIME); - break; - } - case TYPE_DECIMAL: { - if (val_idx >= col.binary_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "DECIMAL")); - } - const string& val = col.binary_vals[val_idx]; - *reinterpret_cast(slot) = *reinterpret_cast(&val); - break; - } - default: - DCHECK(false); - } - } - return Status::OK(); -} - -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "es_scan_node.h" + +#include +#include +#include + +#include "gen_cpp/PlanNodes_types.h" +#include "gen_cpp/Exprs_types.h" +#include "runtime/runtime_state.h" +#include "runtime/row_batch.h" +#include "runtime/string_value.h" +#include "runtime/tuple_row.h" +#include "runtime/client_cache.h" +#include "util/runtime_profile.h" +#include "util/debug_util.h" +#include "service/backend_options.h" +#include "olap/olap_common.h" +#include "olap/utils.h" +#include "exprs/expr_context.h" +#include "exprs/expr.h" +#include "exprs/in_predicate.h" +#include "exprs/slot_ref.h" + +namespace doris { + +// $0 = column type (e.g. INT) +const string ERROR_INVALID_COL_DATA = "Data source returned inconsistent column data. " + "Expected value of type $0 based on column metadata. This likely indicates a " + "problem with the data source library."; +const string ERROR_MEM_LIMIT_EXCEEDED = "DataSourceScanNode::$0() failed to allocate " + "$1 bytes for $2."; + +EsScanNode::EsScanNode( + ObjectPool* pool, + const TPlanNode& tnode, + const DescriptorTbl& descs) : + ScanNode(pool, tnode, descs), + _tuple_id(tnode.es_scan_node.tuple_id), + _scan_range_idx(0) { + if (tnode.es_scan_node.__isset.properties) { + _properties = tnode.es_scan_node.properties; + } +} + +EsScanNode::~EsScanNode() { +} + +Status EsScanNode::prepare(RuntimeState* state) { + VLOG(1) << "EsScanNode::Prepare"; + + RETURN_IF_ERROR(ScanNode::prepare(state)); + _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); + if (_tuple_desc == nullptr) { + std::stringstream ss; + ss << "es tuple descriptor is null, _tuple_id=" << _tuple_id; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + _env = state->exec_env(); + + return Status::OK(); +} + +Status EsScanNode::open(RuntimeState* state) { + VLOG(1) << "EsScanNode::Open"; + + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); + RETURN_IF_CANCELLED(state); + SCOPED_TIMER(_runtime_profile->total_time_counter()); + 
RETURN_IF_ERROR(ExecNode::open(state)); + + // TExtOpenParams.row_schema + vector cols; + for (const SlotDescriptor* slot : _tuple_desc->slots()) { + TExtColumnDesc col; + col.__set_name(slot->col_name()); + col.__set_type(slot->type().to_thrift()); + cols.emplace_back(std::move(col)); + } + TExtTableSchema row_schema; + row_schema.cols = std::move(cols); + row_schema.__isset.cols = true; + + // TExtOpenParams.predicates + vector > predicates; + vector predicate_to_conjunct; + for (int i = 0; i < _conjunct_ctxs.size(); ++i) { + VLOG(1) << "conjunct: " << _conjunct_ctxs[i]->root()->debug_string(); + vector disjuncts; + if (get_disjuncts(_conjunct_ctxs[i], _conjunct_ctxs[i]->root(), disjuncts)) { + predicates.emplace_back(std::move(disjuncts)); + predicate_to_conjunct.push_back(i); + } + } + + // open every scan range + vector conjunct_accepted_times(_conjunct_ctxs.size(), 0); + for (int i = 0; i < _scan_ranges.size(); ++i) { + TEsScanRange& es_scan_range = _scan_ranges[i]; + + if (es_scan_range.es_hosts.empty()) { + std::stringstream ss; + ss << "es fail to open: hosts empty"; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + + + // TExtOpenParams + TExtOpenParams params; + params.__set_query_id(state->query_id()); + _properties["index"] = es_scan_range.index; + if (es_scan_range.__isset.type) { + _properties["type"] = es_scan_range.type; + } + _properties["shard_id"] = std::to_string(es_scan_range.shard_id); + params.__set_properties(_properties); + params.__set_row_schema(row_schema); + params.__set_batch_size(state->batch_size()); + params.__set_predicates(predicates); + TExtOpenResult result; + + // choose an es node, local is the first choice + std::string localhost = BackendOptions::get_localhost(); + bool is_success = false; + for (int j = 0; j < 2; ++j) { + for (auto& es_host : es_scan_range.es_hosts) { + if ((j == 0 && es_host.hostname != localhost) + || (j == 1 && es_host.hostname == localhost)) { + continue; + } + Status status = 
open_es(es_host, result, params); + if (status.ok()) { + is_success = true; + _addresses.push_back(es_host); + _scan_handles.push_back(result.scan_handle); + if (result.__isset.accepted_conjuncts) { + for (int index : result.accepted_conjuncts) { + conjunct_accepted_times[predicate_to_conjunct[index]]++; + } + } + break; + } else if (status.code() == TStatusCode::ES_SHARD_NOT_FOUND) { + // if shard not found, try other nodes + LOG(WARNING) << "shard not found on es node: " + << ", address=" << es_host + << ", scan_range_idx=" << i << ", try other nodes"; + } else { + LOG(WARNING) << "es open error: scan_range_idx=" << i + << ", address=" << es_host + << ", msg=" << status.get_error_msg(); + return status; + } + } + if (is_success) { + break; + } + } + + if (!is_success) { + std::stringstream ss; + ss << "es open error: scan_range_idx=" << i + << ", can't find shard on any node"; + return Status::InternalError(ss.str()); + } + } + + // remove those conjuncts that accepted by all scan ranges + for (int i = predicate_to_conjunct.size() - 1; i >= 0; i--) { + int conjunct_index = predicate_to_conjunct[i]; + if (conjunct_accepted_times[conjunct_index] == _scan_ranges.size()) { + _pushdown_conjunct_ctxs.push_back(*(_conjunct_ctxs.begin() + conjunct_index)); + _conjunct_ctxs.erase(_conjunct_ctxs.begin() + conjunct_index); + } + } + + for (int i = 0; i < _conjunct_ctxs.size(); ++i) { + if (!check_left_conjuncts(_conjunct_ctxs[i]->root())) { + return Status::InternalError("esquery could only be executed on es, but could not push down to es"); + } + } + + return Status::OK(); +} + +Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + VLOG(1) << "EsScanNode::GetNext"; + + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + RETURN_IF_CANCELLED(state); + SCOPED_TIMER(_runtime_profile->total_time_counter()); + SCOPED_TIMER(materialize_tuple_timer()); + + // create tuple + MemPool* tuple_pool = row_batch->tuple_data_pool(); + int64_t 
tuple_buffer_size; + uint8_t* tuple_buffer = nullptr; + RETURN_IF_ERROR(row_batch->resize_and_allocate_tuple_buffer(state, &tuple_buffer_size, &tuple_buffer)); + Tuple* tuple = reinterpret_cast(tuple_buffer); + + // get batch + TExtGetNextResult result; + RETURN_IF_ERROR(get_next_from_es(result)); + _offsets[_scan_range_idx] += result.rows.num_rows; + + // convert + VLOG(1) << "begin to convert: scan_range_idx=" << _scan_range_idx + << ", num_rows=" << result.rows.num_rows; + vector& cols = result.rows.cols; + // indexes of the next non-null value in the row batch, per column. + vector cols_next_val_idx(_tuple_desc->slots().size(), 0); + for (int row_idx = 0; row_idx < result.rows.num_rows; row_idx++) { + if (reached_limit()) { + *eos = true; + break; + } + RETURN_IF_ERROR(materialize_row(tuple_pool, tuple, cols, row_idx, cols_next_val_idx)); + TupleRow* tuple_row = row_batch->get_row(row_batch->add_row()); + tuple_row->set_tuple(0, tuple); + if (ExecNode::eval_conjuncts(_conjunct_ctxs.data(), _conjunct_ctxs.size(), tuple_row)) { + row_batch->commit_last_row(); + tuple = reinterpret_cast( + reinterpret_cast(tuple) + _tuple_desc->byte_size()); + ++_num_rows_returned; + } + } + + VLOG(1) << "finish one batch: num_rows=" << row_batch->num_rows(); + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + if (result.__isset.eos && result.eos) { + VLOG(1) << "es finish one scan_range: scan_range_idx=" << _scan_range_idx; + ++_scan_range_idx; + } + if (_scan_range_idx == _scan_ranges.size()) { + *eos = true; + } + + return Status::OK(); +} + +Status EsScanNode::close(RuntimeState* state) { + if (is_closed()) return Status::OK(); + VLOG(1) << "EsScanNode::Close"; + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); + SCOPED_TIMER(_runtime_profile->total_time_counter()); + Expr::close(_pushdown_conjunct_ctxs, state); + RETURN_IF_ERROR(ExecNode::close(state)); + for (int i = 0; i < _addresses.size(); ++i) { + TExtCloseParams params; + 
params.__set_scan_handle(_scan_handles[i]); + TExtCloseResult result; + +#ifndef BE_TEST + const TNetworkAddress& address = _addresses[i]; + try { + Status status; + ExtDataSourceServiceClientCache* client_cache = _env->extdatasource_client_cache(); + ExtDataSourceServiceConnection client(client_cache, address, 10000, &status); + if (!status.ok()) { + LOG(WARNING) << "es create client error: scan_range_idx=" << i + << ", address=" << address + << ", msg=" << status.get_error_msg(); + return status; + } + + try { + VLOG(1) << "es close param=" << apache::thrift::ThriftDebugString(params); + client->close(result, params); + } catch (apache::thrift::transport::TTransportException& e) { + LOG(WARNING) << "es close retrying, because: " << e.what(); + RETURN_IF_ERROR(client.reopen()); + client->close(result, params); + } + } catch (apache::thrift::TException &e) { + std::stringstream ss; + ss << "es close error: scan_range_idx=" << i + << ", msg=" << e.what(); + LOG(WARNING) << ss.str(); + return Status::ThriftRpcError(ss.str()); + } + + VLOG(1) << "es close result=" << apache::thrift::ThriftDebugString(result); + Status status(result.status); + if (!status.ok()) { + LOG(WARNING) << "es close error: : scan_range_idx=" << i + << ", msg=" << status.get_error_msg(); + return status; + } +#else + TStatus status; + result.__set_status(status); +#endif + } + + return Status::OK(); +} + +void EsScanNode::debug_string(int indentation_level, stringstream* out) const { + *out << string(indentation_level * 2, ' '); + *out << "EsScanNode(tupleid=" << _tuple_id; + *out << ")" << std::endl; + + for (int i = 0; i < _children.size(); ++i) { + _children[i]->debug_string(indentation_level + 1, out); + } +} + +Status EsScanNode::set_scan_ranges(const vector& scan_ranges) { + for (int i = 0; i < scan_ranges.size(); ++i) { + TScanRangeParams scan_range = scan_ranges[i]; + DCHECK(scan_range.scan_range.__isset.es_scan_range); + TEsScanRange es_scan_range = scan_range.scan_range.es_scan_range; 
+ _scan_ranges.push_back(es_scan_range); + } + + _offsets.resize(scan_ranges.size(), 0); + return Status::OK(); +} + +Status EsScanNode::open_es(TNetworkAddress& address, TExtOpenResult& result, TExtOpenParams& params) { + + VLOG(1) << "es open param=" << apache::thrift::ThriftDebugString(params); +#ifndef BE_TEST + try { + ExtDataSourceServiceClientCache* client_cache = _env->extdatasource_client_cache(); + Status status; + ExtDataSourceServiceConnection client(client_cache, address, 10000, &status); + if (!status.ok()) { + std::stringstream ss; + ss << "es create client error: address=" << address + << ", msg=" << status.get_error_msg(); + return Status::InternalError(ss.str()); + } + + try { + client->open(result, params); + } catch (apache::thrift::transport::TTransportException& e) { + LOG(WARNING) << "es open retrying, because: " << e.what(); + RETURN_IF_ERROR(client.reopen()); + client->open(result, params); + } + VLOG(1) << "es open result=" << apache::thrift::ThriftDebugString(result); + return Status(result.status); + } catch (apache::thrift::TException &e) { + std::stringstream ss; + ss << "es open error: address=" << address << ", msg=" << e.what(); + return Status::InternalError(ss.str()); + } +#else + TStatus status; + result.__set_status(status); + result.__set_scan_handle("0"); + return Status(status); +#endif +} + +// legacy conjuncts must not contain match function +bool EsScanNode::check_left_conjuncts(Expr* conjunct) { + if (is_match_func(conjunct)) { + return false; + } else { + int num_children = conjunct->get_num_children(); + for (int child_idx = 0; child_idx < num_children; ++child_idx) { + if (!check_left_conjuncts(conjunct->get_child(child_idx))) { + return false; + } + } + return true; + } +} + +bool EsScanNode::ignore_cast(SlotDescriptor* slot, Expr* expr) { + if (slot->type().is_date_type() && expr->type().is_date_type()) { + return true; + } + if (slot->type().is_string_type() && expr->type().is_string_type()) { + return true; + } + 
return false; +} + +bool EsScanNode::get_disjuncts(ExprContext* context, Expr* conjunct, + vector& disjuncts) { + if (TExprNodeType::BINARY_PRED == conjunct->node_type()) { + if (conjunct->children().size() != 2) { + VLOG(1) << "get disjuncts fail: number of childs is not 2"; + return false; + } + SlotRef* slotRef; + TExprOpcode::type op; + Expr* expr; + if (TExprNodeType::SLOT_REF == conjunct->get_child(0)->node_type()) { + expr = conjunct->get_child(1); + slotRef = (SlotRef*)(conjunct->get_child(0)); + op = conjunct->op(); + } else if (TExprNodeType::SLOT_REF == conjunct->get_child(1)->node_type()) { + expr = conjunct->get_child(0); + slotRef = (SlotRef*)(conjunct->get_child(1)); + op = conjunct->op(); + } else { + VLOG(1) << "get disjuncts fail: no SLOT_REF child"; + return false; + } + + SlotDescriptor* slot_desc = get_slot_desc(slotRef); + if (slot_desc == nullptr) { + VLOG(1) << "get disjuncts fail: slot_desc is null"; + return false; + } + + TExtLiteral literal; + if (!to_ext_literal(context, expr, &literal)) { + VLOG(1) << "get disjuncts fail: can't get literal, node_type=" + << expr->node_type(); + return false; + } + + TExtColumnDesc columnDesc; + columnDesc.__set_name(slot_desc->col_name()); + columnDesc.__set_type(slot_desc->type().to_thrift()); + TExtBinaryPredicate binaryPredicate; + binaryPredicate.__set_col(columnDesc); + binaryPredicate.__set_op(op); + binaryPredicate.__set_value(std::move(literal)); + TExtPredicate predicate; + predicate.__set_node_type(TExprNodeType::BINARY_PRED); + predicate.__set_binary_predicate(binaryPredicate); + disjuncts.push_back(std::move(predicate)); + return true; + } else if (is_match_func(conjunct)) { + // if this is a function call expr and function name is match, then push + // down it to es + TExtFunction match_function; + match_function.__set_func_name(conjunct->fn().name.function_name); + vector query_conditions; + + + TExtLiteral literal; + if (!to_ext_literal(context, conjunct->get_child(1), &literal)) { + 
VLOG(1) << "get disjuncts fail: can't get literal, node_type=" + << conjunct->get_child(1)->node_type(); + return false; + } + + query_conditions.push_back(std::move(literal)); + match_function.__set_values(query_conditions); + TExtPredicate predicate; + predicate.__set_node_type(TExprNodeType::FUNCTION_CALL); + predicate.__set_ext_function(match_function); + disjuncts.push_back(std::move(predicate)); + return true; + } else if (TExprNodeType::IN_PRED == conjunct->node_type()) { + // the op code maybe FILTER_NEW_IN, it means there is function in list + // like col_a in (abs(1)) + if (TExprOpcode::FILTER_IN != conjunct->op() + && TExprOpcode::FILTER_NOT_IN != conjunct->op()) { + return false; + } + TExtInPredicate ext_in_predicate; + vector in_pred_values; + InPredicate* pred = dynamic_cast(conjunct); + ext_in_predicate.__set_is_not_in(pred->is_not_in()); + if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) { + return false; + } + + SlotRef* slot_ref = (SlotRef*)(conjunct->get_child(0)); + SlotDescriptor* slot_desc = get_slot_desc(slot_ref); + if (slot_desc == nullptr) { + return false; + } + TExtColumnDesc columnDesc; + columnDesc.__set_name(slot_desc->col_name()); + columnDesc.__set_type(slot_desc->type().to_thrift()); + ext_in_predicate.__set_col(columnDesc); + + if (pred->get_child(0)->type().type != slot_desc->type().type) { + if (!ignore_cast(slot_desc, pred->get_child(0))) { + return false; + } + } + + HybirdSetBase::IteratorBase* iter = pred->hybird_set()->begin(); + while (iter->has_next()) { + if (nullptr == iter->get_value()) { + return false; + } + TExtLiteral literal; + if (!to_ext_literal(slot_desc->type().type, const_cast(iter->get_value()), &literal)) { + VLOG(1) << "get disjuncts fail: can't get literal, node_type=" + << slot_desc->type().type; + return false; + } + in_pred_values.push_back(literal); + iter->next(); + } + ext_in_predicate.__set_values(in_pred_values); + TExtPredicate predicate; + 
predicate.__set_node_type(TExprNodeType::IN_PRED); + predicate.__set_in_predicate(ext_in_predicate); + disjuncts.push_back(std::move(predicate)); + return true; + } else if (TExprNodeType::COMPOUND_PRED == conjunct->node_type()) { + if (TExprOpcode::COMPOUND_OR != conjunct->op()) { + VLOG(1) << "get disjuncts fail: op is not COMPOUND_OR"; + return false; + } + if (!get_disjuncts(context, conjunct->get_child(0), disjuncts)) { + return false; + } + if (!get_disjuncts(context, conjunct->get_child(1), disjuncts)) { + return false; + } + return true; + } else { + VLOG(1) << "get disjuncts fail: node type is " << conjunct->node_type() + << ", should be BINARY_PRED or COMPOUND_PRED"; + return false; + } +} + +bool EsScanNode::is_match_func(Expr* conjunct) { + if (TExprNodeType::FUNCTION_CALL == conjunct->node_type() + && conjunct->fn().name.function_name == "esquery") { + return true; + } + return false; +} + +SlotDescriptor* EsScanNode::get_slot_desc(SlotRef* slotRef) { + std::vector slot_ids; + slotRef->get_slot_ids(&slot_ids); + SlotDescriptor* slot_desc = nullptr; + for (SlotDescriptor* slot : _tuple_desc->slots()) { + if (slot->id() == slot_ids[0]) { + slot_desc = slot; + break; + } + } + return slot_desc; +} + +bool EsScanNode::to_ext_literal(ExprContext* context, Expr* expr, TExtLiteral* literal) { + switch (expr->node_type()) { + case TExprNodeType::BOOL_LITERAL: + case TExprNodeType::INT_LITERAL: + case TExprNodeType::LARGE_INT_LITERAL: + case TExprNodeType::FLOAT_LITERAL: + case TExprNodeType::DECIMAL_LITERAL: + case TExprNodeType::STRING_LITERAL: + case TExprNodeType::DATE_LITERAL: + return to_ext_literal(expr->type().type, context->get_value(expr, NULL), literal); + default: + return false; + } +} + +bool EsScanNode::to_ext_literal(PrimitiveType slot_type, void* value, TExtLiteral* literal) { + TExprNodeType::type node_type; + switch (slot_type) { + case TYPE_BOOLEAN: { + node_type = (TExprNodeType::BOOL_LITERAL); + TBoolLiteral bool_literal; + 
bool_literal.__set_value(*reinterpret_cast(value)); + literal->__set_bool_literal(bool_literal); + break; + } + + case TYPE_TINYINT: { + node_type = (TExprNodeType::INT_LITERAL); + TIntLiteral int_literal; + int_literal.__set_value(*reinterpret_cast(value)); + literal->__set_int_literal(int_literal); + break; + } + case TYPE_SMALLINT: { + node_type = (TExprNodeType::INT_LITERAL); + TIntLiteral int_literal; + int_literal.__set_value(*reinterpret_cast(value)); + literal->__set_int_literal(int_literal); + break; + } + case TYPE_INT: { + node_type = (TExprNodeType::INT_LITERAL); + TIntLiteral int_literal; + int_literal.__set_value(*reinterpret_cast(value)); + literal->__set_int_literal(int_literal); + break; + } + case TYPE_BIGINT: { + node_type = (TExprNodeType::INT_LITERAL); + TIntLiteral int_literal; + int_literal.__set_value(*reinterpret_cast(value)); + literal->__set_int_literal(int_literal); + break; + } + + case TYPE_LARGEINT: { + node_type = (TExprNodeType::LARGE_INT_LITERAL); + char buf[48]; + int len = 48; + char* v = LargeIntValue::to_string(*reinterpret_cast<__int128*>(value), buf, &len); + TLargeIntLiteral large_int_literal; + large_int_literal.__set_value(v); + literal->__set_large_int_literal(large_int_literal); + break; + } + + case TYPE_FLOAT: { + node_type = (TExprNodeType::FLOAT_LITERAL); + TFloatLiteral float_literal; + float_literal.__set_value(*reinterpret_cast(value)); + literal->__set_float_literal(float_literal); + break; + } + case TYPE_DOUBLE: { + node_type = (TExprNodeType::FLOAT_LITERAL); + TFloatLiteral float_literal; + float_literal.__set_value(*reinterpret_cast(value)); + literal->__set_float_literal(float_literal); + break; + } + + case TYPE_DECIMAL: { + node_type = (TExprNodeType::DECIMAL_LITERAL); + TDecimalLiteral decimal_literal; + decimal_literal.__set_value(reinterpret_cast(value)->to_string()); + literal->__set_decimal_literal(decimal_literal); + break; + } + + case TYPE_DATE: + case TYPE_DATETIME: { + node_type = 
(TExprNodeType::DATE_LITERAL); + const DateTimeValue date_value = *reinterpret_cast(value); + char str[MAX_DTVALUE_STR_LEN]; + date_value.to_string(str); + TDateLiteral date_literal; + date_literal.__set_value(str); + literal->__set_date_literal(date_literal); + break; + } + + case TYPE_CHAR: + case TYPE_VARCHAR: { + node_type = (TExprNodeType::STRING_LITERAL); + TStringLiteral string_literal; + string_literal.__set_value((reinterpret_cast(value))->debug_string()); + literal->__set_string_literal(string_literal); + break; + } + + default: { + DCHECK(false) << "Invalid type."; + return false; + } + } + literal->__set_node_type(node_type); + return true; +} + +Status EsScanNode::get_next_from_es(TExtGetNextResult& result) { + TExtGetNextParams params; + params.__set_scan_handle(_scan_handles[_scan_range_idx]); + params.__set_offset(_offsets[_scan_range_idx]); + + // getNext + const TNetworkAddress &address = _addresses[_scan_range_idx]; +#ifndef BE_TEST + try { + Status create_client_status; + ExtDataSourceServiceClientCache *client_cache = _env->extdatasource_client_cache(); + ExtDataSourceServiceConnection client(client_cache, address, 10000, &create_client_status); + if (!create_client_status.ok()) { + LOG(WARNING) << "es create client error: scan_range_idx=" << _scan_range_idx + << ", address=" << address + << ", msg=" << create_client_status.get_error_msg(); + return create_client_status; + } + + try { + VLOG(1) << "es get_next param=" << apache::thrift::ThriftDebugString(params); + client->getNext(result, params); + } catch (apache::thrift::transport::TTransportException& e) { + std::stringstream ss; + ss << "es get_next error: scan_range_idx=" << _scan_range_idx + << ", msg=" << e.what(); + LOG(WARNING) << ss.str(); + RETURN_IF_ERROR(client.reopen()); + return Status::ThriftRpcError(ss.str()); + } + } catch (apache::thrift::TException &e) { + std::stringstream ss; + ss << "es get_next error: scan_range_idx=" << _scan_range_idx + << ", msg=" << e.what(); + 
LOG(WARNING) << ss.str(); + return Status::ThriftRpcError(ss.str()); + } +#else + TStatus status; + result.__set_status(status); + result.__set_eos(true); + TExtColumnData col_data; + std::vector is_null; + is_null.push_back(false); + col_data.__set_is_null(is_null); + std::vector int_vals; + int_vals.push_back(1); + int_vals.push_back(2); + col_data.__set_int_vals(int_vals); + std::vector cols; + cols.push_back(col_data); + TExtRowBatch rows; + rows.__set_cols(cols); + rows.__set_num_rows(2); + result.__set_rows(rows); + return Status(status); +#endif + + // check result + VLOG(1) << "es get_next result=" << apache::thrift::ThriftDebugString(result); + Status get_next_status(result.status); + if (!get_next_status.ok()) { + LOG(WARNING) << "es get_next error: scan_range_idx=" << _scan_range_idx + << ", address=" << address + << ", msg=" << get_next_status.get_error_msg(); + return get_next_status; + } + if (!result.__isset.rows || !result.rows.__isset.num_rows) { + std::stringstream ss; + ss << "es get_next error: scan_range_idx=" << _scan_range_idx + << ", msg=rows or num_rows not in result"; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + + return Status::OK(); +} + +Status EsScanNode::materialize_row(MemPool* tuple_pool, Tuple* tuple, + const vector& cols, int row_idx, + vector& cols_next_val_idx) { + tuple->init(_tuple_desc->byte_size()); + + for (int i = 0; i < _tuple_desc->slots().size(); ++i) { + const SlotDescriptor* slot_desc = _tuple_desc->slots()[i]; + + if (!slot_desc->is_materialized()) { + continue; + } + + void* slot = tuple->get_slot(slot_desc->tuple_offset()); + const TExtColumnData& col = cols[i]; + + if (col.is_null[row_idx]) { + tuple->set_null(slot_desc->null_indicator_offset()); + continue; + } else { + tuple->set_not_null(slot_desc->null_indicator_offset()); + } + + int val_idx = cols_next_val_idx[i]++; + switch (slot_desc->type().type) { + case TYPE_CHAR: + case TYPE_VARCHAR: { + if (val_idx >= 
col.string_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "STRING")); + } + const string& val = col.string_vals[val_idx]; + size_t val_size = val.size(); + char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); + if (UNLIKELY(buffer == NULL)) { + string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", + val_size, "string slot"); + return tuple_pool->mem_tracker()->MemLimitExceeded(NULL, details, val_size); + } + memcpy(buffer, val.data(), val_size); + reinterpret_cast(slot)->ptr = buffer; + reinterpret_cast(slot)->len = val_size; + break; + } + case TYPE_TINYINT: + if (val_idx >= col.byte_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TINYINT")); + } + *reinterpret_cast(slot) = col.byte_vals[val_idx]; + break; + case TYPE_SMALLINT: + if (val_idx >= col.short_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "SMALLINT")); + } + *reinterpret_cast(slot) = col.short_vals[val_idx]; + break; + case TYPE_INT: + if (val_idx >= col.int_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "INT")); + } + *reinterpret_cast(slot) = col.int_vals[val_idx]; + break; + case TYPE_BIGINT: + if (val_idx >= col.long_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "BIGINT")); + } + *reinterpret_cast(slot) = col.long_vals[val_idx]; + break; + case TYPE_LARGEINT: + if (val_idx >= col.long_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "LARGEINT")); + } + *reinterpret_cast(slot) = col.long_vals[val_idx]; + break; + case TYPE_DOUBLE: + if (val_idx >= col.double_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "DOUBLE")); + } + *reinterpret_cast(slot) = col.double_vals[val_idx]; + break; + case TYPE_FLOAT: + if (val_idx >= col.double_vals.size()) { + 
return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "FLOAT")); + } + *reinterpret_cast(slot) = col.double_vals[val_idx]; + break; + case TYPE_BOOLEAN: + if (val_idx >= col.bool_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "BOOLEAN")); + } + *reinterpret_cast(slot) = col.bool_vals[val_idx]; + break; + case TYPE_DATE: + if (val_idx >= col.long_vals.size() || + !reinterpret_cast(slot)->from_unixtime(col.long_vals[val_idx], "+08:00")) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TYPE_DATE")); + } + reinterpret_cast(slot)->cast_to_date(); + break; + case TYPE_DATETIME: { + if (val_idx >= col.long_vals.size() || + !reinterpret_cast(slot)->from_unixtime(col.long_vals[val_idx], "+08:00")) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TYPE_DATETIME")); + } + reinterpret_cast(slot)->set_type(TIME_DATETIME); + break; + } + case TYPE_DECIMAL: { + if (val_idx >= col.binary_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "DECIMAL")); + } + const string& val = col.binary_vals[val_idx]; + *reinterpret_cast(slot) = *reinterpret_cast(&val); + break; + } + default: + DCHECK(false); + } + } + return Status::OK(); +} + +} diff --git a/be/src/exec/es_scan_node.h b/be/src/exec/es_scan_node.h index 810917d9ff2f72..de871a873171ad 100644 --- a/be/src/exec/es_scan_node.h +++ b/be/src/exec/es_scan_node.h @@ -1,92 +1,92 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef BDG_PALO_BE_SRC_QUERY_EXEC_ES_SCAN_NODE_H -#define BDG_PALO_BE_SRC_QUERY_EXEC_ES_SCAN_NODE_H - -#include -#include - -#include "runtime/descriptors.h" -#include "runtime/tuple.h" -#include "exec/scan_node.h" -#include "exprs/slot_ref.h" -#include "runtime/exec_env.h" -#include "gen_cpp/TExtDataSourceService.h" -#include "gen_cpp/PaloExternalDataSourceService_types.h" - -namespace doris { - -class TupleDescriptor; -class RuntimeState; -class Status; - -class EsScanNode : public ScanNode { -public: - EsScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - ~EsScanNode(); - - virtual Status prepare(RuntimeState* state) override; - virtual Status open(RuntimeState* state) override; - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; - virtual Status close(RuntimeState* state) override; - virtual Status set_scan_ranges(const std::vector& scan_ranges) override; - -protected: - // Write debug string of this into out. 
- virtual void debug_string(int indentation_level, std::stringstream* out) const; - -private: - Status open_es(TNetworkAddress& address, TExtOpenResult& result, TExtOpenParams& params); - Status materialize_row(MemPool* tuple_pool, Tuple* tuple, - const vector& cols, int next_row_idx, - vector& cols_next_val_idx); - Status get_next_from_es(TExtGetNextResult& result); - - bool get_disjuncts(ExprContext* context, Expr* conjunct, vector& disjuncts); - bool to_ext_literal(ExprContext* context, Expr* expr, TExtLiteral* literal); - bool to_ext_literal(PrimitiveType node_type, void* value, TExtLiteral* literal); - bool ignore_cast(SlotDescriptor* slot, Expr* expr); - - bool is_match_func(Expr* conjunct); - - SlotDescriptor* get_slot_desc(SlotRef* slotRef); - - // check if open result meets condition - // 1. check if left conjuncts contain "match" function, since match function could only be executed on es - bool check_left_conjuncts(Expr* conjunct); - -private: - TupleId _tuple_id; - std::map _properties; - const TupleDescriptor* _tuple_desc; - ExecEnv* _env; - std::vector _scan_ranges; - - // scan range's iterator, used in get_next() - int _scan_range_idx; - - // store every scan range's netaddress/handle/offset - std::vector _addresses; - std::vector _scan_handles; - std::vector _offsets; - std::vector _pushdown_conjunct_ctxs; -}; - -} - -#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BDG_PALO_BE_SRC_QUERY_EXEC_ES_SCAN_NODE_H +#define BDG_PALO_BE_SRC_QUERY_EXEC_ES_SCAN_NODE_H + +#include +#include + +#include "runtime/descriptors.h" +#include "runtime/tuple.h" +#include "exec/scan_node.h" +#include "exprs/slot_ref.h" +#include "runtime/exec_env.h" +#include "gen_cpp/TExtDataSourceService.h" +#include "gen_cpp/PaloExternalDataSourceService_types.h" + +namespace doris { + +class TupleDescriptor; +class RuntimeState; +class Status; + +class EsScanNode : public ScanNode { +public: + EsScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); + ~EsScanNode(); + + virtual Status prepare(RuntimeState* state) override; + virtual Status open(RuntimeState* state) override; + virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; + virtual Status close(RuntimeState* state) override; + virtual Status set_scan_ranges(const std::vector& scan_ranges) override; + +protected: + // Write debug string of this into out. 
+ virtual void debug_string(int indentation_level, std::stringstream* out) const; + +private: + Status open_es(TNetworkAddress& address, TExtOpenResult& result, TExtOpenParams& params); + Status materialize_row(MemPool* tuple_pool, Tuple* tuple, + const vector& cols, int next_row_idx, + vector& cols_next_val_idx); + Status get_next_from_es(TExtGetNextResult& result); + + bool get_disjuncts(ExprContext* context, Expr* conjunct, vector& disjuncts); + bool to_ext_literal(ExprContext* context, Expr* expr, TExtLiteral* literal); + bool to_ext_literal(PrimitiveType node_type, void* value, TExtLiteral* literal); + bool ignore_cast(SlotDescriptor* slot, Expr* expr); + + bool is_match_func(Expr* conjunct); + + SlotDescriptor* get_slot_desc(SlotRef* slotRef); + + // check if open result meets condition + // 1. check if left conjuncts contain "match" function, since match function could only be executed on es + bool check_left_conjuncts(Expr* conjunct); + +private: + TupleId _tuple_id; + std::map _properties; + const TupleDescriptor* _tuple_desc; + ExecEnv* _env; + std::vector _scan_ranges; + + // scan range's iterator, used in get_next() + int _scan_range_idx; + + // store every scan range's netaddress/handle/offset + std::vector _addresses; + std::vector _scan_handles; + std::vector _offsets; + std::vector _pushdown_conjunct_ctxs; +}; + +} + +#endif diff --git a/be/src/gutil/cpu.cc b/be/src/gutil/cpu.cc index f43664aee78fbc..c02f5e5949ed43 100644 --- a/be/src/gutil/cpu.cc +++ b/be/src/gutil/cpu.cc @@ -1,286 +1,286 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#include "gutil/cpu.h" - -#include -#include - -#include "gutil/integral_types.h" - -#if defined(__x86_64__) -#if defined(_MSC_VER) -#include -#include // For _xgetbv() -#endif -#endif - -namespace base { - -CPU::CPU() - : signature_(0), - type_(0), - family_(0), - model_(0), - stepping_(0), - ext_model_(0), - ext_family_(0), - has_mmx_(false), - has_sse_(false), - has_sse2_(false), - has_sse3_(false), - has_ssse3_(false), - has_sse41_(false), - has_sse42_(false), - has_avx_(false), - has_avx2_(false), - has_aesni_(false), - has_non_stop_time_stamp_counter_(false), - has_broken_neon_(false), - cpu_vendor_("unknown") { - Initialize(); -} - -namespace { - -#if defined(__x86_64__) -#ifndef _MSC_VER - -#if defined(__pic__) && defined(__i386__) - -void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( - "mov %%ebx, %%edi\n" - "cpuid\n" - "xchg %%edi, %%ebx\n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type) - ); -} - -#else - -void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( - "cpuid\n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type), "c"(0) - ); -} - -#endif - -// _xgetbv returns the value of an Intel Extended Control Register (XCR). -// Currently only XCR0 is defined by Intel so |xcr| should always be zero. -uint64 _xgetbv(uint32 xcr) { - uint32 eax, edx; - - __asm__ volatile ( - "xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); - return (static_cast(edx) << 32) | eax; -} - -#endif // !_MSC_VER -#endif // __x86_64__ - -#if defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) -class LazyCpuInfoValue { - public: - LazyCpuInfoValue() : has_broken_neon_(false) { - // This function finds the value from /proc/cpuinfo under the key "model - // name" or "Processor". "model name" is used in Linux 3.8 and later (3.7 - // and later for arm64) and is shown once per CPU. 
"Processor" is used in - // earler versions and is shown only once at the top of /proc/cpuinfo - // regardless of the number CPUs. - const char kModelNamePrefix[] = "model name\t: "; - const char kProcessorPrefix[] = "Processor\t: "; - - // This function also calculates whether we believe that this CPU has a - // broken NEON unit based on these fields from cpuinfo: - unsigned implementer = 0, architecture = 0, variant = 0, part = 0, - revision = 0; - const struct { - const char key[17]; - unsigned int* result; - } kUnsignedValues[] = { - {"CPU implementer", &implementer}, - {"CPU architecture", &architecture}, - {"CPU variant", &variant}, - {"CPU part", &part}, - {"CPU revision", &revision}, - }; - - std::string contents; - ReadFileToString(FilePath("/proc/cpuinfo"), &contents); - DCHECK(!contents.empty()); - if (contents.empty()) { - return; - } - - std::istringstream iss(contents); - std::string line; - while (std::getline(iss, line)) { - if (brand_.empty() && - (line.compare(0, strlen(kModelNamePrefix), kModelNamePrefix) == 0 || - line.compare(0, strlen(kProcessorPrefix), kProcessorPrefix) == 0)) { - brand_.assign(line.substr(strlen(kModelNamePrefix))); - } - - for (size_t i = 0; i < arraysize(kUnsignedValues); i++) { - const char *key = kUnsignedValues[i].key; - const size_t len = strlen(key); - - if (line.compare(0, len, key) == 0 && - line.size() >= len + 1 && - (line[len] == '\t' || line[len] == ' ' || line[len] == ':')) { - size_t colon_pos = line.find(':', len); - if (colon_pos == std::string::npos) { - continue; - } - - const StringPiece line_sp(line); - StringPiece value_sp = line_sp.substr(colon_pos + 1); - while (!value_sp.empty() && - (value_sp[0] == ' ' || value_sp[0] == '\t')) { - value_sp = value_sp.substr(1); - } - - // The string may have leading "0x" or not, so we use strtoul to - // handle that. 
- char* endptr; - std::string value(value_sp.as_string()); - unsigned long int result = strtoul(value.c_str(), &endptr, 0); - if (*endptr == 0 && result <= UINT_MAX) { - *kUnsignedValues[i].result = result; - } - } - } - } - - has_broken_neon_ = - implementer == 0x51 && - architecture == 7 && - variant == 1 && - part == 0x4d && - revision == 0; - } - - const std::string& brand() const { return brand_; } - bool has_broken_neon() const { return has_broken_neon_; } - - private: - std::string brand_; - bool has_broken_neon_; - DISALLOW_COPY_AND_ASSIGN(LazyCpuInfoValue); -}; - -base::LazyInstance::Leaky g_lazy_cpuinfo = - LAZY_INSTANCE_INITIALIZER; - -#endif // defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || - // defined(OS_LINUX)) - -} // anonymous namespace - -void CPU::Initialize() { -#if defined(__x86_64__) - int cpu_info[4] = {-1}; - char cpu_string[48]; - - // __cpuid with an InfoType argument of 0 returns the number of - // valid Ids in CPUInfo[0] and the CPU identification string in - // the other three array elements. The CPU identification string is - // not in linear order. The code below arranges the information - // in a human readable form. The human readable order is CPUInfo[1] | - // CPUInfo[3] | CPUInfo[2]. CPUInfo[2] and CPUInfo[3] are swapped - // before using memcpy to copy these three array elements to cpu_string. - __cpuid(cpu_info, 0); - int num_ids = cpu_info[0]; - std::swap(cpu_info[2], cpu_info[3]); - memcpy(cpu_string, &cpu_info[1], 3 * sizeof(cpu_info[1])); - cpu_vendor_.assign(cpu_string, 3 * sizeof(cpu_info[1])); - - // Interpret CPU feature information. 
- if (num_ids > 0) { - int cpu_info7[4] = {0}; - __cpuid(cpu_info, 1); - if (num_ids >= 7) { - __cpuid(cpu_info7, 7); - } - signature_ = cpu_info[0]; - stepping_ = cpu_info[0] & 0xf; - model_ = ((cpu_info[0] >> 4) & 0xf) + ((cpu_info[0] >> 12) & 0xf0); - family_ = (cpu_info[0] >> 8) & 0xf; - type_ = (cpu_info[0] >> 12) & 0x3; - ext_model_ = (cpu_info[0] >> 16) & 0xf; - ext_family_ = (cpu_info[0] >> 20) & 0xff; - has_mmx_ = (cpu_info[3] & 0x00800000) != 0; - has_sse_ = (cpu_info[3] & 0x02000000) != 0; - has_sse2_ = (cpu_info[3] & 0x04000000) != 0; - has_sse3_ = (cpu_info[2] & 0x00000001) != 0; - has_ssse3_ = (cpu_info[2] & 0x00000200) != 0; - has_sse41_ = (cpu_info[2] & 0x00080000) != 0; - has_sse42_ = (cpu_info[2] & 0x00100000) != 0; - // AVX instructions will generate an illegal instruction exception unless - // a) they are supported by the CPU, - // b) XSAVE is supported by the CPU and - // c) XSAVE is enabled by the kernel. - // See http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled - // - // In addition, we have observed some crashes with the xgetbv instruction - // even after following Intel's example code. (See crbug.com/375968.) - // Because of that, we also test the XSAVE bit because its description in - // the CPUID documentation suggests that it signals xgetbv support. - has_avx_ = - (cpu_info[2] & 0x10000000) != 0 && - (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && - (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && - (_xgetbv(0) & 6) == 6 /* XSAVE enabled by kernel */; - has_aesni_ = (cpu_info[2] & 0x02000000) != 0; - has_avx2_ = has_avx_ && (cpu_info7[1] & 0x00000020) != 0; - } - - // Get the brand string of the cpu. 
- __cpuid(cpu_info, 0x80000000); - const int parameter_end = 0x80000004; - int max_parameter = cpu_info[0]; - - if (cpu_info[0] >= parameter_end) { - char* cpu_string_ptr = cpu_string; - - for (int parameter = 0x80000002; parameter <= parameter_end && - cpu_string_ptr < &cpu_string[sizeof(cpu_string)]; parameter++) { - __cpuid(cpu_info, parameter); - memcpy(cpu_string_ptr, cpu_info, sizeof(cpu_info)); - cpu_string_ptr += sizeof(cpu_info); - } - cpu_brand_.assign(cpu_string, cpu_string_ptr - cpu_string); - } - - const int parameter_containing_non_stop_time_stamp_counter = 0x80000007; - if (max_parameter >= parameter_containing_non_stop_time_stamp_counter) { - __cpuid(cpu_info, parameter_containing_non_stop_time_stamp_counter); - has_non_stop_time_stamp_counter_ = (cpu_info[3] & (1 << 8)) != 0; - } -#elif defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) - cpu_brand_.assign(g_lazy_cpuinfo.Get().brand()); - has_broken_neon_ = g_lazy_cpuinfo.Get().has_broken_neon(); -#else - #error unknown architecture -#endif -} - -CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const { - if (has_avx2()) return AVX2; - if (has_avx()) return AVX; - if (has_sse42()) return SSE42; - if (has_sse41()) return SSE41; - if (has_ssse3()) return SSSE3; - if (has_sse3()) return SSE3; - if (has_sse2()) return SSE2; - if (has_sse()) return SSE; - return PENTIUM; -} - -} // namespace base +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "gutil/cpu.h" + +#include +#include + +#include "gutil/integral_types.h" + +#if defined(__x86_64__) +#if defined(_MSC_VER) +#include +#include // For _xgetbv() +#endif +#endif + +namespace base { + +CPU::CPU() + : signature_(0), + type_(0), + family_(0), + model_(0), + stepping_(0), + ext_model_(0), + ext_family_(0), + has_mmx_(false), + has_sse_(false), + has_sse2_(false), + has_sse3_(false), + has_ssse3_(false), + has_sse41_(false), + has_sse42_(false), + has_avx_(false), + has_avx2_(false), + has_aesni_(false), + has_non_stop_time_stamp_counter_(false), + has_broken_neon_(false), + cpu_vendor_("unknown") { + Initialize(); +} + +namespace { + +#if defined(__x86_64__) +#ifndef _MSC_VER + +#if defined(__pic__) && defined(__i386__) + +void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} + +#else + +void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type), "c"(0) + ); +} + +#endif + +// _xgetbv returns the value of an Intel Extended Control Register (XCR). +// Currently only XCR0 is defined by Intel so |xcr| should always be zero. +uint64 _xgetbv(uint32 xcr) { + uint32 eax, edx; + + __asm__ volatile ( + "xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (static_cast(edx) << 32) | eax; +} + +#endif // !_MSC_VER +#endif // __x86_64__ + +#if defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) +class LazyCpuInfoValue { + public: + LazyCpuInfoValue() : has_broken_neon_(false) { + // This function finds the value from /proc/cpuinfo under the key "model + // name" or "Processor". "model name" is used in Linux 3.8 and later (3.7 + // and later for arm64) and is shown once per CPU. 
"Processor" is used in + // earler versions and is shown only once at the top of /proc/cpuinfo + // regardless of the number CPUs. + const char kModelNamePrefix[] = "model name\t: "; + const char kProcessorPrefix[] = "Processor\t: "; + + // This function also calculates whether we believe that this CPU has a + // broken NEON unit based on these fields from cpuinfo: + unsigned implementer = 0, architecture = 0, variant = 0, part = 0, + revision = 0; + const struct { + const char key[17]; + unsigned int* result; + } kUnsignedValues[] = { + {"CPU implementer", &implementer}, + {"CPU architecture", &architecture}, + {"CPU variant", &variant}, + {"CPU part", &part}, + {"CPU revision", &revision}, + }; + + std::string contents; + ReadFileToString(FilePath("/proc/cpuinfo"), &contents); + DCHECK(!contents.empty()); + if (contents.empty()) { + return; + } + + std::istringstream iss(contents); + std::string line; + while (std::getline(iss, line)) { + if (brand_.empty() && + (line.compare(0, strlen(kModelNamePrefix), kModelNamePrefix) == 0 || + line.compare(0, strlen(kProcessorPrefix), kProcessorPrefix) == 0)) { + brand_.assign(line.substr(strlen(kModelNamePrefix))); + } + + for (size_t i = 0; i < arraysize(kUnsignedValues); i++) { + const char *key = kUnsignedValues[i].key; + const size_t len = strlen(key); + + if (line.compare(0, len, key) == 0 && + line.size() >= len + 1 && + (line[len] == '\t' || line[len] == ' ' || line[len] == ':')) { + size_t colon_pos = line.find(':', len); + if (colon_pos == std::string::npos) { + continue; + } + + const StringPiece line_sp(line); + StringPiece value_sp = line_sp.substr(colon_pos + 1); + while (!value_sp.empty() && + (value_sp[0] == ' ' || value_sp[0] == '\t')) { + value_sp = value_sp.substr(1); + } + + // The string may have leading "0x" or not, so we use strtoul to + // handle that. 
+ char* endptr; + std::string value(value_sp.as_string()); + unsigned long int result = strtoul(value.c_str(), &endptr, 0); + if (*endptr == 0 && result <= UINT_MAX) { + *kUnsignedValues[i].result = result; + } + } + } + } + + has_broken_neon_ = + implementer == 0x51 && + architecture == 7 && + variant == 1 && + part == 0x4d && + revision == 0; + } + + const std::string& brand() const { return brand_; } + bool has_broken_neon() const { return has_broken_neon_; } + + private: + std::string brand_; + bool has_broken_neon_; + DISALLOW_COPY_AND_ASSIGN(LazyCpuInfoValue); +}; + +base::LazyInstance::Leaky g_lazy_cpuinfo = + LAZY_INSTANCE_INITIALIZER; + +#endif // defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || + // defined(OS_LINUX)) + +} // anonymous namespace + +void CPU::Initialize() { +#if defined(__x86_64__) + int cpu_info[4] = {-1}; + char cpu_string[48]; + + // __cpuid with an InfoType argument of 0 returns the number of + // valid Ids in CPUInfo[0] and the CPU identification string in + // the other three array elements. The CPU identification string is + // not in linear order. The code below arranges the information + // in a human readable form. The human readable order is CPUInfo[1] | + // CPUInfo[3] | CPUInfo[2]. CPUInfo[2] and CPUInfo[3] are swapped + // before using memcpy to copy these three array elements to cpu_string. + __cpuid(cpu_info, 0); + int num_ids = cpu_info[0]; + std::swap(cpu_info[2], cpu_info[3]); + memcpy(cpu_string, &cpu_info[1], 3 * sizeof(cpu_info[1])); + cpu_vendor_.assign(cpu_string, 3 * sizeof(cpu_info[1])); + + // Interpret CPU feature information. 
+ if (num_ids > 0) { + int cpu_info7[4] = {0}; + __cpuid(cpu_info, 1); + if (num_ids >= 7) { + __cpuid(cpu_info7, 7); + } + signature_ = cpu_info[0]; + stepping_ = cpu_info[0] & 0xf; + model_ = ((cpu_info[0] >> 4) & 0xf) + ((cpu_info[0] >> 12) & 0xf0); + family_ = (cpu_info[0] >> 8) & 0xf; + type_ = (cpu_info[0] >> 12) & 0x3; + ext_model_ = (cpu_info[0] >> 16) & 0xf; + ext_family_ = (cpu_info[0] >> 20) & 0xff; + has_mmx_ = (cpu_info[3] & 0x00800000) != 0; + has_sse_ = (cpu_info[3] & 0x02000000) != 0; + has_sse2_ = (cpu_info[3] & 0x04000000) != 0; + has_sse3_ = (cpu_info[2] & 0x00000001) != 0; + has_ssse3_ = (cpu_info[2] & 0x00000200) != 0; + has_sse41_ = (cpu_info[2] & 0x00080000) != 0; + has_sse42_ = (cpu_info[2] & 0x00100000) != 0; + // AVX instructions will generate an illegal instruction exception unless + // a) they are supported by the CPU, + // b) XSAVE is supported by the CPU and + // c) XSAVE is enabled by the kernel. + // See http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled + // + // In addition, we have observed some crashes with the xgetbv instruction + // even after following Intel's example code. (See crbug.com/375968.) + // Because of that, we also test the XSAVE bit because its description in + // the CPUID documentation suggests that it signals xgetbv support. + has_avx_ = + (cpu_info[2] & 0x10000000) != 0 && + (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && + (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && + (_xgetbv(0) & 6) == 6 /* XSAVE enabled by kernel */; + has_aesni_ = (cpu_info[2] & 0x02000000) != 0; + has_avx2_ = has_avx_ && (cpu_info7[1] & 0x00000020) != 0; + } + + // Get the brand string of the cpu. 
+ __cpuid(cpu_info, 0x80000000); + const int parameter_end = 0x80000004; + int max_parameter = cpu_info[0]; + + if (cpu_info[0] >= parameter_end) { + char* cpu_string_ptr = cpu_string; + + for (int parameter = 0x80000002; parameter <= parameter_end && + cpu_string_ptr < &cpu_string[sizeof(cpu_string)]; parameter++) { + __cpuid(cpu_info, parameter); + memcpy(cpu_string_ptr, cpu_info, sizeof(cpu_info)); + cpu_string_ptr += sizeof(cpu_info); + } + cpu_brand_.assign(cpu_string, cpu_string_ptr - cpu_string); + } + + const int parameter_containing_non_stop_time_stamp_counter = 0x80000007; + if (max_parameter >= parameter_containing_non_stop_time_stamp_counter) { + __cpuid(cpu_info, parameter_containing_non_stop_time_stamp_counter); + has_non_stop_time_stamp_counter_ = (cpu_info[3] & (1 << 8)) != 0; + } +#elif defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) + cpu_brand_.assign(g_lazy_cpuinfo.Get().brand()); + has_broken_neon_ = g_lazy_cpuinfo.Get().has_broken_neon(); +#else + #error unknown architecture +#endif +} + +CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const { + if (has_avx2()) return AVX2; + if (has_avx()) return AVX; + if (has_sse42()) return SSE42; + if (has_sse41()) return SSE41; + if (has_ssse3()) return SSSE3; + if (has_sse3()) return SSE3; + if (has_sse2()) return SSE2; + if (has_sse()) return SSE; + return PENTIUM; +} + +} // namespace base diff --git a/be/src/gutil/cpu.h b/be/src/gutil/cpu.h index b401867c3cee34..65498140d172ba 100644 --- a/be/src/gutil/cpu.h +++ b/be/src/gutil/cpu.h @@ -1,90 +1,90 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef BASE_CPU_H_ -#define BASE_CPU_H_ - -#include - -namespace base { - -// Query information about the processor. 
-class CPU { - public: - // Constructor - CPU(); - - enum IntelMicroArchitecture { - PENTIUM, - SSE, - SSE2, - SSE3, - SSSE3, - SSE41, - SSE42, - AVX, - AVX2, - MAX_INTEL_MICRO_ARCHITECTURE - }; - - // Accessors for CPU information. - const std::string& vendor_name() const { return cpu_vendor_; } - int signature() const { return signature_; } - int stepping() const { return stepping_; } - int model() const { return model_; } - int family() const { return family_; } - int type() const { return type_; } - int extended_model() const { return ext_model_; } - int extended_family() const { return ext_family_; } - bool has_mmx() const { return has_mmx_; } - bool has_sse() const { return has_sse_; } - bool has_sse2() const { return has_sse2_; } - bool has_sse3() const { return has_sse3_; } - bool has_ssse3() const { return has_ssse3_; } - bool has_sse41() const { return has_sse41_; } - bool has_sse42() const { return has_sse42_; } - bool has_avx() const { return has_avx_; } - bool has_avx2() const { return has_avx2_; } - bool has_aesni() const { return has_aesni_; } - bool has_non_stop_time_stamp_counter() const { - return has_non_stop_time_stamp_counter_; - } - // has_broken_neon is only valid on ARM chips. If true, it indicates that we - // believe that the NEON unit on the current CPU is flawed and cannot execute - // some code. See https://code.google.com/p/chromium/issues/detail?id=341598 - bool has_broken_neon() const { return has_broken_neon_; } - - IntelMicroArchitecture GetIntelMicroArchitecture() const; - const std::string& cpu_brand() const { return cpu_brand_; } - - private: - // Query the processor for CPUID information. 
- void Initialize(); - - int signature_; // raw form of type, family, model, and stepping - int type_; // process type - int family_; // family of the processor - int model_; // model of processor - int stepping_; // processor revision number - int ext_model_; - int ext_family_; - bool has_mmx_; - bool has_sse_; - bool has_sse2_; - bool has_sse3_; - bool has_ssse3_; - bool has_sse41_; - bool has_sse42_; - bool has_avx_; - bool has_avx2_; - bool has_aesni_; - bool has_non_stop_time_stamp_counter_; - bool has_broken_neon_; - std::string cpu_vendor_; - std::string cpu_brand_; -}; - -} // namespace base - -#endif // BASE_CPU_H_ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_CPU_H_ +#define BASE_CPU_H_ + +#include + +namespace base { + +// Query information about the processor. +class CPU { + public: + // Constructor + CPU(); + + enum IntelMicroArchitecture { + PENTIUM, + SSE, + SSE2, + SSE3, + SSSE3, + SSE41, + SSE42, + AVX, + AVX2, + MAX_INTEL_MICRO_ARCHITECTURE + }; + + // Accessors for CPU information. 
+ const std::string& vendor_name() const { return cpu_vendor_; } + int signature() const { return signature_; } + int stepping() const { return stepping_; } + int model() const { return model_; } + int family() const { return family_; } + int type() const { return type_; } + int extended_model() const { return ext_model_; } + int extended_family() const { return ext_family_; } + bool has_mmx() const { return has_mmx_; } + bool has_sse() const { return has_sse_; } + bool has_sse2() const { return has_sse2_; } + bool has_sse3() const { return has_sse3_; } + bool has_ssse3() const { return has_ssse3_; } + bool has_sse41() const { return has_sse41_; } + bool has_sse42() const { return has_sse42_; } + bool has_avx() const { return has_avx_; } + bool has_avx2() const { return has_avx2_; } + bool has_aesni() const { return has_aesni_; } + bool has_non_stop_time_stamp_counter() const { + return has_non_stop_time_stamp_counter_; + } + // has_broken_neon is only valid on ARM chips. If true, it indicates that we + // believe that the NEON unit on the current CPU is flawed and cannot execute + // some code. See https://code.google.com/p/chromium/issues/detail?id=341598 + bool has_broken_neon() const { return has_broken_neon_; } + + IntelMicroArchitecture GetIntelMicroArchitecture() const; + const std::string& cpu_brand() const { return cpu_brand_; } + + private: + // Query the processor for CPUID information. 
+ void Initialize(); + + int signature_; // raw form of type, family, model, and stepping + int type_; // process type + int family_; // family of the processor + int model_; // model of processor + int stepping_; // processor revision number + int ext_model_; + int ext_family_; + bool has_mmx_; + bool has_sse_; + bool has_sse2_; + bool has_sse3_; + bool has_ssse3_; + bool has_sse41_; + bool has_sse42_; + bool has_avx_; + bool has_avx2_; + bool has_aesni_; + bool has_non_stop_time_stamp_counter_; + bool has_broken_neon_; + std::string cpu_vendor_; + std::string cpu_brand_; +}; + +} // namespace base + +#endif // BASE_CPU_H_ diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index e4737e402b3222..5bfa28c34813fa 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -1,74 +1,74 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H -#define DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H - -#include "gen_cpp/olap_file.pb.h" -#include "olap/data_dir.h" -#include "olap/tablet_schema.h" - -namespace doris { - -class RowsetWriterContextBuilder; -using RowsetWriterContextBuilderSharedPtr = std::shared_ptr; - -struct RowsetWriterContext { - RowsetWriterContext() : - tablet_id(0), - tablet_schema_hash(0), - partition_id(0), - rowset_type(ALPHA_ROWSET), - rowset_path_prefix(""), - tablet_schema(nullptr), - rowset_state(PREPARED), - data_dir(nullptr), - version(Version(0, 0)), - version_hash(0), - txn_id(0), - tablet_uid(0, 0) { - load_id.set_hi(0); - load_id.set_lo(0); - } - RowsetId rowset_id; - int64_t tablet_id; - int64_t tablet_schema_hash; - int64_t partition_id; - RowsetTypePB rowset_type; - std::string rowset_path_prefix; - const TabletSchema* tablet_schema; - // PREPARED/COMMITTED for pending rowset - // VISIBLE for non-pending rowset - RowsetStatePB rowset_state; - DataDir* data_dir; - // properties for non-pending rowset - Version version; - VersionHash version_hash; - - // properties for pending rowset - int64_t txn_id; - PUniqueId load_id; - TabletUid tablet_uid; - // segment file use uint32 to represent row number, therefore the maximum is UINT32_MAX. - // the default is set to INT32_MAX to avoid overflow issue when casting from uint32_t to int. - // test cases can change this value to control flush timing - uint32_t max_rows_per_segment = INT32_MAX; -}; - -} // namespace doris - -#endif // DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H +#define DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H + +#include "gen_cpp/olap_file.pb.h" +#include "olap/data_dir.h" +#include "olap/tablet_schema.h" + +namespace doris { + +class RowsetWriterContextBuilder; +using RowsetWriterContextBuilderSharedPtr = std::shared_ptr; + +struct RowsetWriterContext { + RowsetWriterContext() : + tablet_id(0), + tablet_schema_hash(0), + partition_id(0), + rowset_type(ALPHA_ROWSET), + rowset_path_prefix(""), + tablet_schema(nullptr), + rowset_state(PREPARED), + data_dir(nullptr), + version(Version(0, 0)), + version_hash(0), + txn_id(0), + tablet_uid(0, 0) { + load_id.set_hi(0); + load_id.set_lo(0); + } + RowsetId rowset_id; + int64_t tablet_id; + int64_t tablet_schema_hash; + int64_t partition_id; + RowsetTypePB rowset_type; + std::string rowset_path_prefix; + const TabletSchema* tablet_schema; + // PREPARED/COMMITTED for pending rowset + // VISIBLE for non-pending rowset + RowsetStatePB rowset_state; + DataDir* data_dir; + // properties for non-pending rowset + Version version; + VersionHash version_hash; + + // properties for pending rowset + int64_t txn_id; + PUniqueId load_id; + TabletUid tablet_uid; + // segment file use uint32 to represent row number, therefore the maximum is UINT32_MAX. + // the default is set to INT32_MAX to avoid overflow issue when casting from uint32_t to int. 
+ // test cases can change this value to control flush timing + uint32_t max_rows_per_segment = INT32_MAX; +}; + +} // namespace doris + +#endif // DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_page.h b/be/src/olap/rowset/segment_v2/bitshuffle_page.h index 6a9bd1ae98fbcc..1573d681f008ed 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_page.h +++ b/be/src/olap/rowset/segment_v2/bitshuffle_page.h @@ -1,342 +1,342 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "util/coding.h" -#include "util/faststring.h" -#include "gutil/port.h" -#include "olap/olap_common.h" -#include "olap/types.h" -#include "olap/rowset/segment_v2/options.h" -#include "olap/rowset/segment_v2/page_builder.h" -#include "olap/rowset/segment_v2/page_decoder.h" -#include "olap/rowset/segment_v2/common.h" -#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" - -namespace doris { -namespace segment_v2 { - -enum { - BITSHUFFLE_PAGE_HEADER_SIZE = 16 -}; - -void warn_with_bitshuffle_error(int64_t val); - -// BitshufflePageBuilder bitshuffles and compresses the bits of fixed -// size type blocks with lz4. 
-// -// The page format is as follows: -// -// 1. Header: (16 bytes total) -// -// [32-bit] -// The number of elements encoded in the page. -// -// [32-bit] -// The post-compression size of the page, including this header. -// -// [32-bit] -// Padding is needed to meet the requirements of the bitshuffle -// library such that the input/output is a multiple of 8. Some -// ignored elements are appended to the end of the page if necessary -// to meet this requirement. -// -// This header field is the post-padding element count. -// -// [32-bit] -// The size of the elements, in bytes, as actually encoded. In the -// case that all of the data in a page can fit into a smaller -// integer type, then we may choose to encode that smaller type -// to save CPU costs. -// -// This is currently only implemented in the UINT32 page type. -// -// NOTE: all on-disk ints are encoded little-endian -// -// 2. Element data -// -// The header is followed by the bitshuffle-compressed element data. -// -template -class BitshufflePageBuilder : public PageBuilder { -public: - BitshufflePageBuilder(const PageBuilderOptions& options) : - _options(options), - _count(0), - _remain_element_capacity(0), - _finished(false) { - reset(); - } - - bool is_page_full() override { - return _remain_element_capacity == 0; - } - - Status add(const uint8_t* vals, size_t* count) override { - DCHECK(!_finished); - int to_add = std::min(_remain_element_capacity, *count); - _data.append(vals, to_add * SIZE_OF_TYPE); - _count += to_add; - _remain_element_capacity -= to_add; - // return added number through count - *count = to_add; - return Status::OK(); - } - - Slice finish() override { - return _finish(SIZE_OF_TYPE); - } - - void reset() override { - auto block_size = _options.data_page_size; - _count = 0; - _data.clear(); - _data.reserve(block_size); - DCHECK_EQ(reinterpret_cast(_data.data()) & (alignof(CppType) - 1), 0) - << "buffer must be naturally-aligned"; - _buffer.clear(); - 
_buffer.resize(BITSHUFFLE_PAGE_HEADER_SIZE); - _finished = false; - _remain_element_capacity = block_size / SIZE_OF_TYPE; - } - - size_t count() const { - return _count; - } - - uint64_t size() const override { - return _buffer.size(); - } - - // this api will release the memory ownership of encoded data - // Note: - // release() should be called after finish - // reset() should be called after this function before reuse the builder - void release() override { - uint8_t* ret = _buffer.release(); - (void)ret; - } - -private: - Slice _finish(int final_size_of_type) { - _data.resize(final_size_of_type * _count); - - // Do padding so that the input num of element is multiple of 8. - int num_elems_after_padding = ALIGN_UP(_count, 8); - int padding_elems = num_elems_after_padding - _count; - int padding_bytes = padding_elems * final_size_of_type; - for (int i = 0; i < padding_bytes; i++) { - _data.push_back(0); - } - - _buffer.resize(BITSHUFFLE_PAGE_HEADER_SIZE + - bitshuffle::compress_lz4_bound(num_elems_after_padding, final_size_of_type, 0)); - - encode_fixed32_le(&_buffer[0], _count); - int64_t bytes = bitshuffle::compress_lz4(_data.data(), &_buffer[BITSHUFFLE_PAGE_HEADER_SIZE], - num_elems_after_padding, final_size_of_type, 0); - if (PREDICT_FALSE(bytes < 0)) { - // This means the bitshuffle function fails. - // Ideally, this should not happen. - warn_with_bitshuffle_error(bytes); - // It does not matter what will be returned here, - // since we have logged fatal in warn_with_bitshuffle_error(). 
- return Slice(); - } - encode_fixed32_le(&_buffer[4], BITSHUFFLE_PAGE_HEADER_SIZE + bytes); - encode_fixed32_le(&_buffer[8], num_elems_after_padding); - encode_fixed32_le(&_buffer[12], final_size_of_type); - _finished = true; - return Slice(_buffer.data(), BITSHUFFLE_PAGE_HEADER_SIZE + bytes); - } - - typedef typename TypeTraits::CppType CppType; - - CppType cell(int idx) const { - DCHECK_GE(idx, 0); - CppType ret; - memcpy(&ret, &_data[idx * SIZE_OF_TYPE], sizeof(CppType)); - return ret; - } - - enum { - SIZE_OF_TYPE = TypeTraits::size - }; - PageBuilderOptions _options; - uint32_t _count; - int _remain_element_capacity; - bool _finished; - faststring _data; - faststring _buffer; -}; - -template -class BitShufflePageDecoder : public PageDecoder { -public: - BitShufflePageDecoder(Slice data, const PageDecoderOptions& options) : _data(data), - _options(options), - _parsed(false), - _num_elements(0), - _compressed_size(0), - _num_element_after_padding(0), - _size_of_element(0), - _cur_index(0) { } - - Status init() override { - CHECK(!_parsed); - if (_data.size < BITSHUFFLE_PAGE_HEADER_SIZE) { - std::stringstream ss; - ss << "file corrupton: invalid data size:" << _data.size << ", header size:" << BITSHUFFLE_PAGE_HEADER_SIZE; - return Status::InternalError(ss.str()); - } - _num_elements = decode_fixed32_le((const uint8_t*)&_data[0]); - _compressed_size = decode_fixed32_le((const uint8_t*)&_data[4]); - if (_compressed_size != _data.size) { - std::stringstream ss; - ss << "Size information unmatched, _compressed_size:" << _compressed_size - << ", _num_elements:" << _num_elements - << ", data size:" << _data.size; - return Status::InternalError(ss.str()); - } - _num_element_after_padding = decode_fixed32_le((const uint8_t*)&_data[8]); - if (_num_element_after_padding != ALIGN_UP(_num_elements, 8)) { - std::stringstream ss; - ss << "num of element information corrupted," - << " _num_element_after_padding:" << _num_element_after_padding - << ", _num_elements:" << 
_num_elements; - return Status::InternalError(ss.str()); - } - _size_of_element = decode_fixed32_le((const uint8_t*)&_data[12]); - switch (_size_of_element) { - case 1: - case 2: - case 3: - case 4: - case 8: - case 12: - case 16: - break; - default: - std::stringstream ss; - ss << "invalid size_of_elem:" << _size_of_element; - return Status::InternalError(ss.str()); - } - - // Currently, only the UINT32 block encoder supports expanding size: - if (UNLIKELY(Type != OLAP_FIELD_TYPE_UNSIGNED_INT && _size_of_element != SIZE_OF_TYPE)) { - std::stringstream ss; - ss << "invalid size info. size of element:" << _size_of_element - << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE - << ", type:" << Type; - return Status::InternalError(ss.str()); - } - if (UNLIKELY(_size_of_element > SIZE_OF_TYPE)) { - std::stringstream ss; - ss << "invalid size info. size of element:" << _size_of_element - << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE; - return Status::InternalError(ss.str()); - } - - RETURN_IF_ERROR(_decode()); - _parsed = true; - return Status::OK(); - } - - Status seek_to_position_in_page(size_t pos) override { - DCHECK(_parsed) << "Must call init()"; - if (PREDICT_FALSE(_num_elements == 0)) { - DCHECK_EQ(0, pos); - return Status::InvalidArgument("invalid pos"); - } - - DCHECK_LE(pos, _num_elements); - _cur_index = pos; - return Status::OK(); - } - - Status next_batch(size_t* n, ColumnBlockView* dst) override { - DCHECK(_parsed); - if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { - *n = 0; - return Status::OK(); - } - - size_t max_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); - _copy_next_values(max_fetch, dst->data()); - *n = max_fetch; - _cur_index += max_fetch; - - return Status::OK(); - } - - size_t count() const override { - return _num_elements; - } - - size_t current_index() const override { - return _cur_index; - } - -private: - void _copy_next_values(size_t n, void* data) { - memcpy(data, &_decoded[_cur_index * SIZE_OF_TYPE], n * SIZE_OF_TYPE); - } - - 
Status _decode() { - if (_num_elements > 0) { - int64_t bytes; - _decoded.resize(_num_element_after_padding * _size_of_element); - char* in = const_cast(&_data[BITSHUFFLE_PAGE_HEADER_SIZE]); - bytes = bitshuffle::decompress_lz4(in, _decoded.data(), _num_element_after_padding, - _size_of_element, 0); - if (PREDICT_FALSE(bytes < 0)) { - // Ideally, this should not happen. - warn_with_bitshuffle_error(bytes); - return Status::RuntimeError("Unshuffle Process failed"); - } - } - return Status::OK(); - } - - typedef typename TypeTraits::CppType CppType; - - enum { - SIZE_OF_TYPE = TypeTraits::size - }; - - Slice _data; - PageDecoderOptions _options; - bool _parsed; - size_t _num_elements; - size_t _compressed_size; - size_t _num_element_after_padding; - - int _size_of_element; - size_t _cur_index; - faststring _decoded; -}; - -} // namespace segment_v2 -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "util/coding.h" +#include "util/faststring.h" +#include "gutil/port.h" +#include "olap/olap_common.h" +#include "olap/types.h" +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" + +namespace doris { +namespace segment_v2 { + +enum { + BITSHUFFLE_PAGE_HEADER_SIZE = 16 +}; + +void warn_with_bitshuffle_error(int64_t val); + +// BitshufflePageBuilder bitshuffles and compresses the bits of fixed +// size type blocks with lz4. +// +// The page format is as follows: +// +// 1. Header: (16 bytes total) +// +// [32-bit] +// The number of elements encoded in the page. +// +// [32-bit] +// The post-compression size of the page, including this header. +// +// [32-bit] +// Padding is needed to meet the requirements of the bitshuffle +// library such that the input/output is a multiple of 8. Some +// ignored elements are appended to the end of the page if necessary +// to meet this requirement. +// +// This header field is the post-padding element count. +// +// [32-bit] +// The size of the elements, in bytes, as actually encoded. In the +// case that all of the data in a page can fit into a smaller +// integer type, then we may choose to encode that smaller type +// to save CPU costs. +// +// This is currently only implemented in the UINT32 page type. +// +// NOTE: all on-disk ints are encoded little-endian +// +// 2. Element data +// +// The header is followed by the bitshuffle-compressed element data. 
+// +template +class BitshufflePageBuilder : public PageBuilder { +public: + BitshufflePageBuilder(const PageBuilderOptions& options) : + _options(options), + _count(0), + _remain_element_capacity(0), + _finished(false) { + reset(); + } + + bool is_page_full() override { + return _remain_element_capacity == 0; + } + + Status add(const uint8_t* vals, size_t* count) override { + DCHECK(!_finished); + int to_add = std::min(_remain_element_capacity, *count); + _data.append(vals, to_add * SIZE_OF_TYPE); + _count += to_add; + _remain_element_capacity -= to_add; + // return added number through count + *count = to_add; + return Status::OK(); + } + + Slice finish() override { + return _finish(SIZE_OF_TYPE); + } + + void reset() override { + auto block_size = _options.data_page_size; + _count = 0; + _data.clear(); + _data.reserve(block_size); + DCHECK_EQ(reinterpret_cast(_data.data()) & (alignof(CppType) - 1), 0) + << "buffer must be naturally-aligned"; + _buffer.clear(); + _buffer.resize(BITSHUFFLE_PAGE_HEADER_SIZE); + _finished = false; + _remain_element_capacity = block_size / SIZE_OF_TYPE; + } + + size_t count() const { + return _count; + } + + uint64_t size() const override { + return _buffer.size(); + } + + // this api will release the memory ownership of encoded data + // Note: + // release() should be called after finish + // reset() should be called after this function before reuse the builder + void release() override { + uint8_t* ret = _buffer.release(); + (void)ret; + } + +private: + Slice _finish(int final_size_of_type) { + _data.resize(final_size_of_type * _count); + + // Do padding so that the input num of element is multiple of 8. 
+ int num_elems_after_padding = ALIGN_UP(_count, 8); + int padding_elems = num_elems_after_padding - _count; + int padding_bytes = padding_elems * final_size_of_type; + for (int i = 0; i < padding_bytes; i++) { + _data.push_back(0); + } + + _buffer.resize(BITSHUFFLE_PAGE_HEADER_SIZE + + bitshuffle::compress_lz4_bound(num_elems_after_padding, final_size_of_type, 0)); + + encode_fixed32_le(&_buffer[0], _count); + int64_t bytes = bitshuffle::compress_lz4(_data.data(), &_buffer[BITSHUFFLE_PAGE_HEADER_SIZE], + num_elems_after_padding, final_size_of_type, 0); + if (PREDICT_FALSE(bytes < 0)) { + // This means the bitshuffle function fails. + // Ideally, this should not happen. + warn_with_bitshuffle_error(bytes); + // It does not matter what will be returned here, + // since we have logged fatal in warn_with_bitshuffle_error(). + return Slice(); + } + encode_fixed32_le(&_buffer[4], BITSHUFFLE_PAGE_HEADER_SIZE + bytes); + encode_fixed32_le(&_buffer[8], num_elems_after_padding); + encode_fixed32_le(&_buffer[12], final_size_of_type); + _finished = true; + return Slice(_buffer.data(), BITSHUFFLE_PAGE_HEADER_SIZE + bytes); + } + + typedef typename TypeTraits::CppType CppType; + + CppType cell(int idx) const { + DCHECK_GE(idx, 0); + CppType ret; + memcpy(&ret, &_data[idx * SIZE_OF_TYPE], sizeof(CppType)); + return ret; + } + + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + PageBuilderOptions _options; + uint32_t _count; + int _remain_element_capacity; + bool _finished; + faststring _data; + faststring _buffer; +}; + +template +class BitShufflePageDecoder : public PageDecoder { +public: + BitShufflePageDecoder(Slice data, const PageDecoderOptions& options) : _data(data), + _options(options), + _parsed(false), + _num_elements(0), + _compressed_size(0), + _num_element_after_padding(0), + _size_of_element(0), + _cur_index(0) { } + + Status init() override { + CHECK(!_parsed); + if (_data.size < BITSHUFFLE_PAGE_HEADER_SIZE) { + std::stringstream ss; + ss << "file corrupton: invalid 
data size:" << _data.size << ", header size:" << BITSHUFFLE_PAGE_HEADER_SIZE; + return Status::InternalError(ss.str()); + } + _num_elements = decode_fixed32_le((const uint8_t*)&_data[0]); + _compressed_size = decode_fixed32_le((const uint8_t*)&_data[4]); + if (_compressed_size != _data.size) { + std::stringstream ss; + ss << "Size information unmatched, _compressed_size:" << _compressed_size + << ", _num_elements:" << _num_elements + << ", data size:" << _data.size; + return Status::InternalError(ss.str()); + } + _num_element_after_padding = decode_fixed32_le((const uint8_t*)&_data[8]); + if (_num_element_after_padding != ALIGN_UP(_num_elements, 8)) { + std::stringstream ss; + ss << "num of element information corrupted," + << " _num_element_after_padding:" << _num_element_after_padding + << ", _num_elements:" << _num_elements; + return Status::InternalError(ss.str()); + } + _size_of_element = decode_fixed32_le((const uint8_t*)&_data[12]); + switch (_size_of_element) { + case 1: + case 2: + case 3: + case 4: + case 8: + case 12: + case 16: + break; + default: + std::stringstream ss; + ss << "invalid size_of_elem:" << _size_of_element; + return Status::InternalError(ss.str()); + } + + // Currently, only the UINT32 block encoder supports expanding size: + if (UNLIKELY(Type != OLAP_FIELD_TYPE_UNSIGNED_INT && _size_of_element != SIZE_OF_TYPE)) { + std::stringstream ss; + ss << "invalid size info. size of element:" << _size_of_element + << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE + << ", type:" << Type; + return Status::InternalError(ss.str()); + } + if (UNLIKELY(_size_of_element > SIZE_OF_TYPE)) { + std::stringstream ss; + ss << "invalid size info. 
size of element:" << _size_of_element + << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE; + return Status::InternalError(ss.str()); + } + + RETURN_IF_ERROR(_decode()); + _parsed = true; + return Status::OK(); + } + + Status seek_to_position_in_page(size_t pos) override { + DCHECK(_parsed) << "Must call init()"; + if (PREDICT_FALSE(_num_elements == 0)) { + DCHECK_EQ(0, pos); + return Status::InvalidArgument("invalid pos"); + } + + DCHECK_LE(pos, _num_elements); + _cur_index = pos; + return Status::OK(); + } + + Status next_batch(size_t* n, ColumnBlockView* dst) override { + DCHECK(_parsed); + if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { + *n = 0; + return Status::OK(); + } + + size_t max_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); + _copy_next_values(max_fetch, dst->data()); + *n = max_fetch; + _cur_index += max_fetch; + + return Status::OK(); + } + + size_t count() const override { + return _num_elements; + } + + size_t current_index() const override { + return _cur_index; + } + +private: + void _copy_next_values(size_t n, void* data) { + memcpy(data, &_decoded[_cur_index * SIZE_OF_TYPE], n * SIZE_OF_TYPE); + } + + Status _decode() { + if (_num_elements > 0) { + int64_t bytes; + _decoded.resize(_num_element_after_padding * _size_of_element); + char* in = const_cast(&_data[BITSHUFFLE_PAGE_HEADER_SIZE]); + bytes = bitshuffle::decompress_lz4(in, _decoded.data(), _num_element_after_padding, + _size_of_element, 0); + if (PREDICT_FALSE(bytes < 0)) { + // Ideally, this should not happen. 
+ warn_with_bitshuffle_error(bytes); + return Status::RuntimeError("Unshuffle Process failed"); + } + } + return Status::OK(); + } + + typedef typename TypeTraits::CppType CppType; + + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + + Slice _data; + PageDecoderOptions _options; + bool _parsed; + size_t _num_elements; + size_t _compressed_size; + size_t _num_element_after_padding; + + int _size_of_element; + size_t _cur_index; + faststring _decoded; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp index 36ceb8ce392e9b..22c280ae1a8de8 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp @@ -1,81 +1,81 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" - -// Include the bitshuffle header once to get the default (non-AVX2) -// symbols. -#include - -#include "gutil/cpu.h" - -// Include the bitshuffle header again, but this time importing the -// AVX2-compiled symbols by defining some macros. 
-#undef BITSHUFFLE_H -#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx2 -#define bshuf_compress_lz4 bshuf_compress_lz4_avx2 -#define bshuf_decompress_lz4 bshuf_decompress_lz4_avx2 -#include // NOLINT(*) -#undef bshuf_compress_lz4_bound -#undef bshuf_compress_lz4 -#undef bshuf_decompress_lz4 - -using base::CPU; - -namespace doris { -namespace bitshuffle { - -// Function pointers which will be assigned the correct implementation -// for the runtime architecture. -namespace { -decltype(&bshuf_compress_lz4_bound) g_bshuf_compress_lz4_bound; -decltype(&bshuf_compress_lz4) g_bshuf_compress_lz4; -decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4; -} // anonymous namespace - -// When this translation unit is initialized, figure out the current CPU and -// assign the correct function for this architecture. -// -// This avoids an expensive 'cpuid' call in the hot path, and also avoids -// the cost of a 'std::once' call. -__attribute__((constructor)) -void SelectBitshuffleFunctions() { - if (CPU().has_avx2()) { - g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2; - g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2; - g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx2; - } else { - g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; - g_bshuf_compress_lz4 = bshuf_compress_lz4; - g_bshuf_decompress_lz4 = bshuf_decompress_lz4; - } -} - -int64_t compress_lz4(void* in, void* out, size_t size, - size_t elem_size, size_t block_size) { - return g_bshuf_compress_lz4(in, out, size, elem_size, block_size); -} -int64_t decompress_lz4(void* in, void* out, size_t size, - size_t elem_size, size_t block_size) { - return g_bshuf_decompress_lz4(in, out, size, elem_size, block_size); -} -size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size) { - return g_bshuf_compress_lz4_bound(size, elem_size, block_size); -} - -} // namespace bitshuffle -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" + +// Include the bitshuffle header once to get the default (non-AVX2) +// symbols. +#include + +#include "gutil/cpu.h" + +// Include the bitshuffle header again, but this time importing the +// AVX2-compiled symbols by defining some macros. +#undef BITSHUFFLE_H +#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx2 +#define bshuf_compress_lz4 bshuf_compress_lz4_avx2 +#define bshuf_decompress_lz4 bshuf_decompress_lz4_avx2 +#include // NOLINT(*) +#undef bshuf_compress_lz4_bound +#undef bshuf_compress_lz4 +#undef bshuf_decompress_lz4 + +using base::CPU; + +namespace doris { +namespace bitshuffle { + +// Function pointers which will be assigned the correct implementation +// for the runtime architecture. +namespace { +decltype(&bshuf_compress_lz4_bound) g_bshuf_compress_lz4_bound; +decltype(&bshuf_compress_lz4) g_bshuf_compress_lz4; +decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4; +} // anonymous namespace + +// When this translation unit is initialized, figure out the current CPU and +// assign the correct function for this architecture. +// +// This avoids an expensive 'cpuid' call in the hot path, and also avoids +// the cost of a 'std::once' call. 
+__attribute__((constructor)) +void SelectBitshuffleFunctions() { + if (CPU().has_avx2()) { + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2; + g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx2; + } else { + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; + g_bshuf_compress_lz4 = bshuf_compress_lz4; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4; + } +} + +int64_t compress_lz4(void* in, void* out, size_t size, + size_t elem_size, size_t block_size) { + return g_bshuf_compress_lz4(in, out, size, elem_size, block_size); +} +int64_t decompress_lz4(void* in, void* out, size_t size, + size_t elem_size, size_t block_size) { + return g_bshuf_decompress_lz4(in, out, size, elem_size, block_size); +} +size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size) { + return g_bshuf_compress_lz4_bound(size, elem_size, block_size); +} + +} // namespace bitshuffle +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h index 38c1e7231f947c..4846438130baeb 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h @@ -1,34 +1,34 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -// This namespace has wrappers for the Bitshuffle library which do runtime dispatch to -// either AVX2-accelerated or regular SSE2 implementations based on the available CPU. -namespace doris { -namespace bitshuffle { - -// See for documentation on these functions. -size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size); -int64_t compress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); -int64_t decompress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); - -} // namespace bitshuffle -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +// This namespace has wrappers for the Bitshuffle library which do runtime dispatch to +// either AVX2-accelerated or regular SSE2 implementations based on the available CPU. +namespace doris { +namespace bitshuffle { + +// See for documentation on these functions. 
+size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size); +int64_t compress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); +int64_t decompress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); + +} // namespace bitshuffle +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/page_builder.h b/be/src/olap/rowset/segment_v2/page_builder.h index 4ef0701588767b..c2cc0eb8136089 100644 --- a/be/src/olap/rowset/segment_v2/page_builder.h +++ b/be/src/olap/rowset/segment_v2/page_builder.h @@ -1,87 +1,87 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "gutil/macros.h" -#include "util/slice.h" -#include "common/status.h" -#include "olap/rowset/segment_v2/common.h" - -namespace doris { -namespace segment_v2 { - -// PageBuilder is used to build page -// Page is a data management unit, including: -// 1. Data Page: store encoded and compressed data -// 2. BloomFilter Page: store bloom filter of data -// 3. Ordinal Index Page: store ordinal index of data -// 4. Short Key Index Page: store short key index of data -// 5. 
Bitmap Index Page: store bitmap index of data -class PageBuilder { -public: - PageBuilder() { } - - virtual ~PageBuilder() { } - - // Used by column writer to determine whether the current page is full. - // Column writer depends on the result to decide whether to flush current page. - virtual bool is_page_full() = 0; - - // Add a sequence of values to the page. - // The number of values actually added will be returned through count, which may be less - // than requested if the page is full. - // - // vals size should be decided according to the page build type - virtual doris::Status add(const uint8_t* vals, size_t* count) = 0; - - // Get the dictionary page for dictionary encoding mode column. - virtual Status get_dictionary_page(Slice* dictionary_page) { - return Status::NotSupported("get_dictionary_page not implemented"); - } - - // Return a Slice which represents the encoded data of current page. - // - // This Slice points to internal data of this builder. - virtual Slice finish() = 0; - - // Reset the internal state of the page builder. - // - // Any data previously returned by finish may be invalidated by this call. - virtual void reset() = 0; - - // Return the number of entries that have been added to the page. - virtual size_t count() const = 0; - - // Return the total bytes of pageBuilder that have been added to the page. - virtual uint64_t size() const = 0; - - // This api is for release the resource owned by builder - // It means it will transfer the ownership of some resource to other. - // This api is always called after finish - // and should be followed by reset() before reuse the builder - virtual void release() = 0; - -private: - DISALLOW_COPY_AND_ASSIGN(PageBuilder); -}; - -} // namespace segment_v2 -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "gutil/macros.h" +#include "util/slice.h" +#include "common/status.h" +#include "olap/rowset/segment_v2/common.h" + +namespace doris { +namespace segment_v2 { + +// PageBuilder is used to build page +// Page is a data management unit, including: +// 1. Data Page: store encoded and compressed data +// 2. BloomFilter Page: store bloom filter of data +// 3. Ordinal Index Page: store ordinal index of data +// 4. Short Key Index Page: store short key index of data +// 5. Bitmap Index Page: store bitmap index of data +class PageBuilder { +public: + PageBuilder() { } + + virtual ~PageBuilder() { } + + // Used by column writer to determine whether the current page is full. + // Column writer depends on the result to decide whether to flush current page. + virtual bool is_page_full() = 0; + + // Add a sequence of values to the page. + // The number of values actually added will be returned through count, which may be less + // than requested if the page is full. + // + // vals size should be decided according to the page build type + virtual doris::Status add(const uint8_t* vals, size_t* count) = 0; + + // Get the dictionary page for dictionary encoding mode column. 
+ virtual Status get_dictionary_page(Slice* dictionary_page) { + return Status::NotSupported("get_dictionary_page not implemented"); + } + + // Return a Slice which represents the encoded data of current page. + // + // This Slice points to internal data of this builder. + virtual Slice finish() = 0; + + // Reset the internal state of the page builder. + // + // Any data previously returned by finish may be invalidated by this call. + virtual void reset() = 0; + + // Return the number of entries that have been added to the page. + virtual size_t count() const = 0; + + // Return the total bytes of pageBuilder that have been added to the page. + virtual uint64_t size() const = 0; + + // This api is for release the resource owned by builder + // It means it will transfer the ownership of some resource to other. + // This api is always called after finish + // and should be followed by reset() before reuse the builder + virtual void release() = 0; + +private: + DISALLOW_COPY_AND_ASSIGN(PageBuilder); +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/page_decoder.h b/be/src/olap/rowset/segment_v2/page_decoder.h index a6e4c47ef2e672..490f45f4983f7f 100644 --- a/be/src/olap/rowset/segment_v2/page_decoder.h +++ b/be/src/olap/rowset/segment_v2/page_decoder.h @@ -1,79 +1,79 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "olap/column_block.h" // for ColumnBlockView -#include "olap/rowset/segment_v2/common.h" // for rowid_t -#include "common/status.h" // for Status - -namespace doris { -namespace segment_v2 { - -// PageDecoder is used to decode page. -class PageDecoder { -public: - PageDecoder() { } - - virtual ~PageDecoder() { } - - // Call this to do some preparation for decoder. - // eg: parse data page header - virtual Status init() = 0; - - // Seek the decoder to the given positional index of the page. - // For example, seek_to_position_in_page(0) seeks to the first - // stored entry. - // - // It is an error to call this with a value larger than Count(). - // Doing so has undefined results. - virtual Status seek_to_position_in_page(size_t pos) = 0; - - // Seek the decoder forward by a given number of rows, or to the end - // of the page. This is primarily used to skip over data. - // - // Return the step skipped. - virtual size_t seek_forward(size_t n) { - size_t step = std::min(n, count() - current_index()); - DCHECK_GE(step, 0); - seek_to_position_in_page(current_index() + step); - return step; - } - - // Fetch the next vector of values from the page into 'column_vector_view'. - // The output vector must have space for up to n cells. - // - // Return the size of read entries . - // - // In the case that the values are themselves references - // to other memory (eg Slices), the referred-to memory is - // allocated in the column_vector_view's mem_pool. 
- virtual Status next_batch(size_t* n, ColumnBlockView* dst) = 0; - - // Return the number of elements in this page. - virtual size_t count() const = 0; - - // Return the position within the page of the currently seeked - // entry (ie the entry that will next be returned by next_vector()) - virtual size_t current_index() const = 0; - -private: - DISALLOW_COPY_AND_ASSIGN(PageDecoder); -}; - -} // namespace segment_v2 -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "olap/column_block.h" // for ColumnBlockView +#include "olap/rowset/segment_v2/common.h" // for rowid_t +#include "common/status.h" // for Status + +namespace doris { +namespace segment_v2 { + +// PageDecoder is used to decode page. +class PageDecoder { +public: + PageDecoder() { } + + virtual ~PageDecoder() { } + + // Call this to do some preparation for decoder. + // eg: parse data page header + virtual Status init() = 0; + + // Seek the decoder to the given positional index of the page. + // For example, seek_to_position_in_page(0) seeks to the first + // stored entry. + // + // It is an error to call this with a value larger than Count(). + // Doing so has undefined results. 
+ virtual Status seek_to_position_in_page(size_t pos) = 0; + + // Seek the decoder forward by a given number of rows, or to the end + // of the page. This is primarily used to skip over data. + // + // Return the step skipped. + virtual size_t seek_forward(size_t n) { + size_t step = std::min(n, count() - current_index()); + DCHECK_GE(step, 0); + seek_to_position_in_page(current_index() + step); + return step; + } + + // Fetch the next vector of values from the page into 'column_vector_view'. + // The output vector must have space for up to n cells. + // + // Return the size of read entries . + // + // In the case that the values are themselves references + // to other memory (eg Slices), the referred-to memory is + // allocated in the column_vector_view's mem_pool. + virtual Status next_batch(size_t* n, ColumnBlockView* dst) = 0; + + // Return the number of elements in this page. + virtual size_t count() const = 0; + + // Return the position within the page of the currently seeked + // entry (ie the entry that will next be returned by next_vector()) + virtual size_t current_index() const = 0; + +private: + DISALLOW_COPY_AND_ASSIGN(PageDecoder); +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/rle_page.h b/be/src/olap/rowset/segment_v2/rle_page.h index 36817b7cfd9edf..46fb197312a09d 100644 --- a/be/src/olap/rowset/segment_v2/rle_page.h +++ b/be/src/olap/rowset/segment_v2/rle_page.h @@ -1,256 +1,256 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder -#include "olap/rowset/segment_v2/page_decoder.h" // for PageDecoder -#include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions/PageDecoderOptions -#include "olap/rowset/segment_v2/common.h" // for rowid_t -#include "util/rle_encoding.h" // for RleEncoder/RleDecoder -#include "util/coding.h" // for encode_fixed32_le/decode_fixed32_le - -namespace doris { -namespace segment_v2 { - -enum { - RLE_PAGE_HEADER_SIZE = 4 -}; - -// RLE builder for generic integer and bool types. What is missing is some way -// to enforce that this can only be instantiated for INT and BOOL types. -// -// The page format is as follows: -// -// 1. Header: (4 bytes total) -// -// [32-bit] -// The number of elements encoded in the page. -// -// NOTE: all on-disk ints are encoded little-endian -// -// 2. Element data -// -// The header is followed by the rle-encoded element data. -// -// This Rle encoding algorithm is only effective for repeated INT type and bool type, -// It is not good for sequence number or random number. BitshufflePage is recommended -// for these case. 
-// -// TODO(hkp): optimize rle algorithm -template -class RlePageBuilder : public PageBuilder { -public: - RlePageBuilder(const PageBuilderOptions& options) : - _options(options), - _count(0), - _finished(false), - _bit_width(0), - _rle_encoder(nullptr) { - switch(Type) { - case OLAP_FIELD_TYPE_BOOL: { - _bit_width = 1; - break; - } - default: { - _bit_width = SIZE_OF_TYPE * 8; - break; - } - } - _rle_encoder = new RleEncoder(&_buf, _bit_width); - reset(); - } - - ~RlePageBuilder() { - delete _rle_encoder; - } - - bool is_page_full() override { - return _rle_encoder->len() >= _options.data_page_size; - } - - Status add(const uint8_t* vals, size_t* count) override { - DCHECK(!_finished); - DCHECK_EQ(reinterpret_cast(vals) & (alignof(CppType) - 1), 0) - << "Pointer passed to Add() must be naturally-aligned"; - - const CppType* new_vals = reinterpret_cast(vals); - for (int i = 0; i < *count; ++i) { - _rle_encoder->Put(new_vals[i]); - } - - _count += *count; - return Status::OK(); - } - - Slice finish() override { - _finished = true; - // here should Flush first and then encode the count header - // or it will lead to a bug if the header is less than 8 byte and the data is small - _rle_encoder->Flush(); - encode_fixed32_le(&_buf[0], _count); - return Slice(_buf.data(), _buf.size()); - } - - void reset() override { - _count = 0; - _rle_encoder->Clear(); - _rle_encoder->Reserve(RLE_PAGE_HEADER_SIZE, 0); - } - - size_t count() const override { - return _count; - } - - uint64_t size() const override { - return _rle_encoder->len(); - } - - // this api will release the memory ownership of encoded data - // Note: - // release() should be called after finish - // reset() should be called after this function before reuse the builder - void release() override { - uint8_t* ret = _buf.release(); - (void)ret; - } - -private: - typedef typename TypeTraits::CppType CppType; - enum { - SIZE_OF_TYPE = TypeTraits::size - }; - - PageBuilderOptions _options; - size_t _count; - bool 
_finished; - int _bit_width; - RleEncoder* _rle_encoder; - faststring _buf; -}; - -template -class RlePageDecoder : public PageDecoder { -public: - RlePageDecoder(Slice slice, const PageDecoderOptions& options) : - _data(slice), - _options(options), - _parsed(false), - _num_elements(0), - _cur_index(0), - _bit_width(0) { } - - Status init() override { - CHECK(!_parsed); - - if (_data.size < RLE_PAGE_HEADER_SIZE) { - return Status::Corruption( - "not enough bytes for header in RleBitMapBlockDecoder"); - } - _num_elements = decode_fixed32_le((const uint8_t*)&_data[0]); - - _parsed = true; - - switch(Type) { - case OLAP_FIELD_TYPE_BOOL: { - _bit_width = 1; - break; - } - default: { - _bit_width = SIZE_OF_TYPE * 8; - break; - } - } - - _rle_decoder = RleDecoder((uint8_t*)_data.data + RLE_PAGE_HEADER_SIZE, - _data.size - RLE_PAGE_HEADER_SIZE, _bit_width); - - seek_to_position_in_page(0); - return Status::OK(); - } - - Status seek_to_position_in_page(size_t pos) override { - DCHECK(_parsed) << "Must call init()"; - DCHECK_LE(pos, _num_elements) << "Tried to seek to " << pos << " which is > number of elements (" - << _num_elements << ") in the block!"; - // If the block is empty (e.g. the column is filled with nulls), there is no data to seek. - if (PREDICT_FALSE(_num_elements == 0)) { - return Status::OK(); - } - if (_cur_index == pos) { - // No need to seek. 
- return Status::OK(); - } else if (_cur_index < pos) { - uint nskip = pos - _cur_index; - _rle_decoder.Skip(nskip); - } else { - _rle_decoder = RleDecoder((uint8_t*)_data.data + RLE_PAGE_HEADER_SIZE, - _data.size - RLE_PAGE_HEADER_SIZE, _bit_width); - _rle_decoder.Skip(pos); - } - _cur_index = pos; - return Status::OK(); - } - - Status next_batch(size_t* n, ColumnBlockView* dst) override { - DCHECK(_parsed); - if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { - *n = 0; - return Status::OK(); - } - - size_t to_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); - size_t remaining = to_fetch; - uint8_t* data_ptr = dst->data(); - bool result = false; - while (remaining > 0) { - result = _rle_decoder.Get(reinterpret_cast(data_ptr)); - DCHECK(result); - remaining--; - data_ptr += SIZE_OF_TYPE; - } - - _cur_index += to_fetch; - *n = to_fetch; - return Status::OK(); - } - - size_t count() const override { - return _num_elements; - } - - size_t current_index() const override { - return _cur_index; - } - -private: - typedef typename TypeTraits::CppType CppType; - enum { - SIZE_OF_TYPE = TypeTraits::size - }; - - Slice _data; - PageDecoderOptions _options; - bool _parsed; - uint32_t _num_elements; - size_t _cur_index; - int _bit_width; - RleDecoder _rle_decoder; -}; - -} // namespace segment_v2 -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder +#include "olap/rowset/segment_v2/page_decoder.h" // for PageDecoder +#include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions/PageDecoderOptions +#include "olap/rowset/segment_v2/common.h" // for rowid_t +#include "util/rle_encoding.h" // for RleEncoder/RleDecoder +#include "util/coding.h" // for encode_fixed32_le/decode_fixed32_le + +namespace doris { +namespace segment_v2 { + +enum { + RLE_PAGE_HEADER_SIZE = 4 +}; + +// RLE builder for generic integer and bool types. What is missing is some way +// to enforce that this can only be instantiated for INT and BOOL types. +// +// The page format is as follows: +// +// 1. Header: (4 bytes total) +// +// [32-bit] +// The number of elements encoded in the page. +// +// NOTE: all on-disk ints are encoded little-endian +// +// 2. Element data +// +// The header is followed by the rle-encoded element data. +// +// This Rle encoding algorithm is only effective for repeated INT type and bool type, +// It is not good for sequence number or random number. BitshufflePage is recommended +// for these case. 
+// +// TODO(hkp): optimize rle algorithm +template +class RlePageBuilder : public PageBuilder { +public: + RlePageBuilder(const PageBuilderOptions& options) : + _options(options), + _count(0), + _finished(false), + _bit_width(0), + _rle_encoder(nullptr) { + switch(Type) { + case OLAP_FIELD_TYPE_BOOL: { + _bit_width = 1; + break; + } + default: { + _bit_width = SIZE_OF_TYPE * 8; + break; + } + } + _rle_encoder = new RleEncoder(&_buf, _bit_width); + reset(); + } + + ~RlePageBuilder() { + delete _rle_encoder; + } + + bool is_page_full() override { + return _rle_encoder->len() >= _options.data_page_size; + } + + Status add(const uint8_t* vals, size_t* count) override { + DCHECK(!_finished); + DCHECK_EQ(reinterpret_cast(vals) & (alignof(CppType) - 1), 0) + << "Pointer passed to Add() must be naturally-aligned"; + + const CppType* new_vals = reinterpret_cast(vals); + for (int i = 0; i < *count; ++i) { + _rle_encoder->Put(new_vals[i]); + } + + _count += *count; + return Status::OK(); + } + + Slice finish() override { + _finished = true; + // here should Flush first and then encode the count header + // or it will lead to a bug if the header is less than 8 byte and the data is small + _rle_encoder->Flush(); + encode_fixed32_le(&_buf[0], _count); + return Slice(_buf.data(), _buf.size()); + } + + void reset() override { + _count = 0; + _rle_encoder->Clear(); + _rle_encoder->Reserve(RLE_PAGE_HEADER_SIZE, 0); + } + + size_t count() const override { + return _count; + } + + uint64_t size() const override { + return _rle_encoder->len(); + } + + // this api will release the memory ownership of encoded data + // Note: + // release() should be called after finish + // reset() should be called after this function before reuse the builder + void release() override { + uint8_t* ret = _buf.release(); + (void)ret; + } + +private: + typedef typename TypeTraits::CppType CppType; + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + + PageBuilderOptions _options; + size_t _count; + bool 
_finished; + int _bit_width; + RleEncoder* _rle_encoder; + faststring _buf; +}; + +template +class RlePageDecoder : public PageDecoder { +public: + RlePageDecoder(Slice slice, const PageDecoderOptions& options) : + _data(slice), + _options(options), + _parsed(false), + _num_elements(0), + _cur_index(0), + _bit_width(0) { } + + Status init() override { + CHECK(!_parsed); + + if (_data.size < RLE_PAGE_HEADER_SIZE) { + return Status::Corruption( + "not enough bytes for header in RleBitMapBlockDecoder"); + } + _num_elements = decode_fixed32_le((const uint8_t*)&_data[0]); + + _parsed = true; + + switch(Type) { + case OLAP_FIELD_TYPE_BOOL: { + _bit_width = 1; + break; + } + default: { + _bit_width = SIZE_OF_TYPE * 8; + break; + } + } + + _rle_decoder = RleDecoder((uint8_t*)_data.data + RLE_PAGE_HEADER_SIZE, + _data.size - RLE_PAGE_HEADER_SIZE, _bit_width); + + seek_to_position_in_page(0); + return Status::OK(); + } + + Status seek_to_position_in_page(size_t pos) override { + DCHECK(_parsed) << "Must call init()"; + DCHECK_LE(pos, _num_elements) << "Tried to seek to " << pos << " which is > number of elements (" + << _num_elements << ") in the block!"; + // If the block is empty (e.g. the column is filled with nulls), there is no data to seek. + if (PREDICT_FALSE(_num_elements == 0)) { + return Status::OK(); + } + if (_cur_index == pos) { + // No need to seek. 
+ return Status::OK(); + } else if (_cur_index < pos) { + uint nskip = pos - _cur_index; + _rle_decoder.Skip(nskip); + } else { + _rle_decoder = RleDecoder((uint8_t*)_data.data + RLE_PAGE_HEADER_SIZE, + _data.size - RLE_PAGE_HEADER_SIZE, _bit_width); + _rle_decoder.Skip(pos); + } + _cur_index = pos; + return Status::OK(); + } + + Status next_batch(size_t* n, ColumnBlockView* dst) override { + DCHECK(_parsed); + if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { + *n = 0; + return Status::OK(); + } + + size_t to_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); + size_t remaining = to_fetch; + uint8_t* data_ptr = dst->data(); + bool result = false; + while (remaining > 0) { + result = _rle_decoder.Get(reinterpret_cast(data_ptr)); + DCHECK(result); + remaining--; + data_ptr += SIZE_OF_TYPE; + } + + _cur_index += to_fetch; + *n = to_fetch; + return Status::OK(); + } + + size_t count() const override { + return _num_elements; + } + + size_t current_index() const override { + return _cur_index; + } + +private: + typedef typename TypeTraits::CppType CppType; + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + + Slice _data; + PageDecoderOptions _options; + bool _parsed; + uint32_t _num_elements; + size_t _cur_index; + int _bit_width; + RleDecoder _rle_decoder; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/udf/CMakeLists.txt b/be/src/udf/CMakeLists.txt index 1587d0176f6ead..c8a5b05d67904e 100755 --- a/be/src/udf/CMakeLists.txt +++ b/be/src/udf/CMakeLists.txt @@ -15,43 +15,43 @@ # specific language governing permissions and limitations # under the License. -# where to put generated libraries +# where to put generated libraries set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/udf") - -# where to put generated binaries -set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/udf") - -# Build this library twice. Once to be linked into the main Doris. 
This version -# can have dependencies on our other libs. The second version is shipped as part -# of the UDF sdk, which can't use other libs. -add_library(Udf udf.cpp udf_ir.cpp) -add_library(DorisUdf udf.cpp udf_ir.cpp) -set_target_properties(DorisUdf PROPERTIES COMPILE_FLAGS "-DDORIS_UDF_SDK_BUILD") - -# We can't use the normal link list since we want to pick up libDorisUdf (the external -# library) rather than the interal libUdf. -set (UDF_TEST_LINK_LIBS - -Wl,--start-group - Common - GlobalFlags - DorisUdf - Runtime - Util - -Wl,--end-group -# Below are all external dependencies. They should some after the doris libs. - ${Boost_LIBRARIES} - glogstatic - gflagsstatic - -lboost_date_time - gtest) - +set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/udf") + +# where to put generated binaries +set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/udf") + +# Build this library twice. Once to be linked into the main Doris. This version +# can have dependencies on our other libs. The second version is shipped as part +# of the UDF sdk, which can't use other libs. +add_library(Udf udf.cpp udf_ir.cpp) +add_library(DorisUdf udf.cpp udf_ir.cpp) +set_target_properties(DorisUdf PROPERTIES COMPILE_FLAGS "-DDORIS_UDF_SDK_BUILD") + +# We can't use the normal link list since we want to pick up libDorisUdf (the external +# library) rather than the interal libUdf. +set (UDF_TEST_LINK_LIBS + -Wl,--start-group + Common + GlobalFlags + DorisUdf + Runtime + Util + -Wl,--end-group +# Below are all external dependencies. They should some after the doris libs. 
+ ${Boost_LIBRARIES} + glogstatic + gflagsstatic + -lboost_date_time + gtest) + set_target_properties(DorisUdf PROPERTIES PUBLIC_HEADER "udf.h;uda_test_harness.h") INSTALL(TARGETS DorisUdf ARCHIVE DESTINATION ${OUTPUT_DIR}/udf LIBRARY DESTINATION ${OUTPUT_DIR}/udf/lib PUBLIC_HEADER DESTINATION ${OUTPUT_DIR}/udf/include) -#ADD_BE_TEST(udf_test) -#ADD_BE_TEST(uda_test) +#ADD_BE_TEST(udf_test) +#ADD_BE_TEST(uda_test) diff --git a/be/src/util/alignment.h b/be/src/util/alignment.h index e1cc759d71099b..43802805fb80e9 100644 --- a/be/src/util/alignment.h +++ b/be/src/util/alignment.h @@ -1,26 +1,26 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// -// Macros for dealing with memory alignment. -#pragma once - -// Round down 'x' to the nearest 'align' boundary -#define ALIGN_DOWN(x, align) ((x) & (~(align) + 1)) - -// Round up 'x' to the nearest 'align' boundary -#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~(align) + 1)) - +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Macros for dealing with memory alignment. +#pragma once + +// Round down 'x' to the nearest 'align' boundary +#define ALIGN_DOWN(x, align) ((x) & (~(align) + 1)) + +// Round up 'x' to the nearest 'align' boundary +#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~(align) + 1)) + diff --git a/be/src/util/bit_stream_utils.h b/be/src/util/bit_stream_utils.h index 220c8cb4f1f00e..cc463c346ffa5d 100644 --- a/be/src/util/bit_stream_utils.h +++ b/be/src/util/bit_stream_utils.h @@ -1,149 +1,149 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-#pragma once - -#include "gutil/port.h" -#include "util/bit_util.h" -#include "util/faststring.h" - -using doris::BitUtil; - -namespace doris { - -// Utility class to write bit/byte streams. This class can write data to either be -// bit packed or byte aligned (and a single stream that has a mix of both). -class BitWriter { - public: - // buffer: buffer to write bits to. - explicit BitWriter(faststring *buffer) - : buffer_(buffer) { - Clear(); - } - - void Clear() { - buffered_values_ = 0; - byte_offset_ = 0; - bit_offset_ = 0; - buffer_->clear(); - } - - // Returns a pointer to the underlying buffer - faststring *buffer() const { return buffer_; } - - // The number of current bytes written, including the current byte (i.e. may include a - // fraction of a byte). Includes buffered values. - int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } - - // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit - // packed. - void PutValue(uint64_t v, int num_bits); - - // Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the - // extra high-order bits will be ignored. - template - void PutAligned(T v, int num_bits); - - // Write a Vlq encoded int to the buffer. The value is written byte aligned. - // For more details on vlq: en.wikipedia.org/wiki/Variable-length_quantity - void PutVlqInt(int32_t v); - - // Get the index to the next aligned byte and advance the underlying buffer by num_bytes. - size_t GetByteIndexAndAdvance(int num_bytes) { - uint8_t* ptr = GetNextBytePtr(num_bytes); - return ptr - buffer_->data(); - } - - // Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes. - uint8_t* GetNextBytePtr(int num_bytes); - - // Flushes all buffered values to the buffer. Call this when done writing to the buffer. - // If 'align' is true, buffered_values_ is reset and any future writes will be written - // to the next byte boundary. 
- void Flush(bool align = false); - - private: - // Bit-packed values are initially written to this variable before being memcpy'd to - // buffer_. This is faster than writing values byte by byte directly to buffer_. - uint64_t buffered_values_; - - faststring *buffer_; - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ -}; - -// Utility class to read bit/byte stream. This class can read bits or bytes -// that are either byte aligned or not. It also has utilities to read multiple -// bytes in one read (e.g. encoded int). -class BitReader { - public: - // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. - BitReader(const uint8_t* buffer, int buffer_len); - - BitReader() : buffer_(NULL), max_bytes_(0) {} - - // Gets the next value from the buffer. Returns true if 'v' could be read or false if - // there are not enough bytes left. num_bits must be <= 32. - template - bool GetValue(int num_bits, T* v); - - // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a - // little-endian native type and big enough to store 'num_bytes'. The value is assumed - // to be byte-aligned so the stream will be advanced to the start of the next byte - // before 'v' is read. Returns false if there are not enough bytes left. - template - bool GetAligned(int num_bytes, T* v); - - // Reads a vlq encoded int from the stream. The encoded int must start at the - // beginning of a byte. Return false if there were not enough bytes in the buffer. - bool GetVlqInt(int32_t* v); - - // Returns the number of bytes left in the stream, not including the current byte (i.e., - // there may be an additional fraction of a byte). - int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } - - // Current position in the stream, by bit. 
- int position() const { return byte_offset_ * 8 + bit_offset_; } - - // Rewind the stream by 'num_bits' bits - void Rewind(int num_bits); - - // Seek to a specific bit in the buffer - void SeekToBit(uint stream_position); - - // Maximum byte length of a vlq encoded int - static const int MAX_VLQ_BYTE_LEN = 5; - - bool is_initialized() const { return buffer_ != NULL; } - - private: - // Used by SeekToBit() and GetValue() to fetch the - // the next word into buffer_. - void BufferValues(); - - const uint8_t* buffer_; - int max_bytes_; - - // Bytes are memcpy'd from buffer_ and values are read from this variable. This is - // faster than reading values byte by byte directly from buffer_. - uint64_t buffered_values_; - - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ -}; - -} // namespace doris - +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include "gutil/port.h" +#include "util/bit_util.h" +#include "util/faststring.h" + +using doris::BitUtil; + +namespace doris { + +// Utility class to write bit/byte streams. This class can write data to either be +// bit packed or byte aligned (and a single stream that has a mix of both). 
+class BitWriter { + public: + // buffer: buffer to write bits to. + explicit BitWriter(faststring *buffer) + : buffer_(buffer) { + Clear(); + } + + void Clear() { + buffered_values_ = 0; + byte_offset_ = 0; + bit_offset_ = 0; + buffer_->clear(); + } + + // Returns a pointer to the underlying buffer + faststring *buffer() const { return buffer_; } + + // The number of current bytes written, including the current byte (i.e. may include a + // fraction of a byte). Includes buffered values. + int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } + + // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit + // packed. + void PutValue(uint64_t v, int num_bits); + + // Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the + // extra high-order bits will be ignored. + template + void PutAligned(T v, int num_bits); + + // Write a Vlq encoded int to the buffer. The value is written byte aligned. + // For more details on vlq: en.wikipedia.org/wiki/Variable-length_quantity + void PutVlqInt(int32_t v); + + // Get the index to the next aligned byte and advance the underlying buffer by num_bytes. + size_t GetByteIndexAndAdvance(int num_bytes) { + uint8_t* ptr = GetNextBytePtr(num_bytes); + return ptr - buffer_->data(); + } + + // Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes. + uint8_t* GetNextBytePtr(int num_bytes); + + // Flushes all buffered values to the buffer. Call this when done writing to the buffer. + // If 'align' is true, buffered_values_ is reset and any future writes will be written + // to the next byte boundary. + void Flush(bool align = false); + + private: + // Bit-packed values are initially written to this variable before being memcpy'd to + // buffer_. This is faster than writing values byte by byte directly to buffer_. 
+ uint64_t buffered_values_; + + faststring *buffer_; + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ +}; + +// Utility class to read bit/byte stream. This class can read bits or bytes +// that are either byte aligned or not. It also has utilities to read multiple +// bytes in one read (e.g. encoded int). +class BitReader { + public: + // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. + BitReader(const uint8_t* buffer, int buffer_len); + + BitReader() : buffer_(NULL), max_bytes_(0) {} + + // Gets the next value from the buffer. Returns true if 'v' could be read or false if + // there are not enough bytes left. num_bits must be <= 32. + template + bool GetValue(int num_bits, T* v); + + // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a + // little-endian native type and big enough to store 'num_bytes'. The value is assumed + // to be byte-aligned so the stream will be advanced to the start of the next byte + // before 'v' is read. Returns false if there are not enough bytes left. + template + bool GetAligned(int num_bytes, T* v); + + // Reads a vlq encoded int from the stream. The encoded int must start at the + // beginning of a byte. Return false if there were not enough bytes in the buffer. + bool GetVlqInt(int32_t* v); + + // Returns the number of bytes left in the stream, not including the current byte (i.e., + // there may be an additional fraction of a byte). + int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } + + // Current position in the stream, by bit. 
+ int position() const { return byte_offset_ * 8 + bit_offset_; } + + // Rewind the stream by 'num_bits' bits + void Rewind(int num_bits); + + // Seek to a specific bit in the buffer + void SeekToBit(uint stream_position); + + // Maximum byte length of a vlq encoded int + static const int MAX_VLQ_BYTE_LEN = 5; + + bool is_initialized() const { return buffer_ != NULL; } + + private: + // Used by SeekToBit() and GetValue() to fetch the + // the next word into buffer_. + void BufferValues(); + + const uint8_t* buffer_; + int max_bytes_; + + // Bytes are memcpy'd from buffer_ and values are read from this variable. This is + // faster than reading values byte by byte directly from buffer_. + uint64_t buffered_values_; + + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ +}; + +} // namespace doris + diff --git a/be/src/util/bit_stream_utils.inline.h b/be/src/util/bit_stream_utils.inline.h index 1bbabd789b016e..deac875ce2e3ad 100644 --- a/be/src/util/bit_stream_utils.inline.h +++ b/be/src/util/bit_stream_utils.inline.h @@ -1,213 +1,213 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-#ifndef IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H -#define IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H - -#include - -#include "glog/logging.h" -#include "util/bit_stream_utils.h" -#include "util/alignment.h" - -using doris::BitUtil; - -namespace doris { - -inline void BitWriter::PutValue(uint64_t v, int num_bits) { - DCHECK_LE(num_bits, 64); - // Truncate the higher-order bits. This is necessary to - // support signed values. - v &= ~0ULL >> (64 - num_bits); - - - buffered_values_ |= v << bit_offset_; - bit_offset_ += num_bits; - - if (PREDICT_FALSE(bit_offset_ >= 64)) { - // Flush buffered_values_ and write out bits of v that did not fit - buffer_->reserve(ALIGN_UP(byte_offset_ + 8, 8)); - buffer_->resize(byte_offset_ + 8); - DCHECK_LE(byte_offset_ + 8, buffer_->capacity()); - memcpy(buffer_->data() + byte_offset_, &buffered_values_, 8); - buffered_values_ = 0; - byte_offset_ += 8; - bit_offset_ -= 64; - buffered_values_ = BitUtil::ShiftRightZeroOnOverflow(v, (num_bits - bit_offset_)); - } - DCHECK_LT(bit_offset_, 64); -} - -inline void BitWriter::Flush(bool align) { - int num_bytes = BitUtil::Ceil(bit_offset_, 8); - buffer_->reserve(ALIGN_UP(byte_offset_ + num_bytes, 8)); - buffer_->resize(byte_offset_ + num_bytes); - DCHECK_LE(byte_offset_ + num_bytes, buffer_->capacity()); - memcpy(buffer_->data() + byte_offset_, &buffered_values_, num_bytes); - - if (align) { - buffered_values_ = 0; - byte_offset_ += num_bytes; - bit_offset_ = 0; - } -} - -inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { - Flush(/* align */ true); - buffer_->reserve(ALIGN_UP(byte_offset_ + num_bytes, 8)); - buffer_->resize(byte_offset_ + num_bytes); - uint8_t* ptr = buffer_->data() + byte_offset_; - byte_offset_ += num_bytes; - DCHECK_LE(byte_offset_, buffer_->capacity()); - return ptr; -} - -template -inline void BitWriter::PutAligned(T val, int num_bytes) { - DCHECK_LE(num_bytes, sizeof(T)); - uint8_t* ptr = GetNextBytePtr(num_bytes); - memcpy(ptr, &val, num_bytes); -} - -inline void 
BitWriter::PutVlqInt(int32_t v) { - while ((v & 0xFFFFFF80) != 0L) { - PutAligned((v & 0x7F) | 0x80, 1); - v >>= 7; - } - PutAligned(v & 0x7F, 1); -} - - -inline BitReader::BitReader(const uint8_t* buffer, int buffer_len) - : buffer_(buffer), - max_bytes_(buffer_len), - buffered_values_(0), - byte_offset_(0), - bit_offset_(0) { - int num_bytes = std::min(8, max_bytes_); - memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); -} - -inline void BitReader::BufferValues() { - int bytes_remaining = max_bytes_ - byte_offset_; - if (PREDICT_TRUE(bytes_remaining >= 8)) { - memcpy(&buffered_values_, buffer_ + byte_offset_, 8); - } else { - memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); - } -} - -template -inline bool BitReader::GetValue(int num_bits, T* v) { - DCHECK_LE(num_bits, 64); - DCHECK_LE(num_bits, sizeof(T) * 8); - - if (PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false; - - *v = BitUtil::TrailingBits(buffered_values_, bit_offset_ + num_bits) >> bit_offset_; - - bit_offset_ += num_bits; - if (bit_offset_ >= 64) { - byte_offset_ += 8; - bit_offset_ -= 64; - BufferValues(); - // Read bits of v that crossed into new buffered_values_ - *v |= BitUtil::ShiftLeftZeroOnOverflow( - BitUtil::TrailingBits(buffered_values_, bit_offset_), - (num_bits - bit_offset_)); - } - DCHECK_LE(bit_offset_, 64); - return true; -} - -inline void BitReader::Rewind(int num_bits) { - bit_offset_ -= num_bits; - if (bit_offset_ >= 0) { - return; - } - while (bit_offset_ < 0) { - int seek_back = std::min(byte_offset_, 8); - byte_offset_ -= seek_back; - bit_offset_ += seek_back * 8; - } - // This should only be executed *if* rewinding by 'num_bits' - // make the existing buffered_values_ invalid - DCHECK_GE(byte_offset_, 0); // Check for underflow - memcpy(&buffered_values_, buffer_ + byte_offset_, 8); -} - -inline void BitReader::SeekToBit(uint stream_position) { - DCHECK_LE(stream_position, max_bytes_ * 8); - - int delta = 
static_cast(stream_position) - position(); - if (delta == 0) { - return; - } else if (delta < 0) { - Rewind(position() - stream_position); - } else { - bit_offset_ += delta; - while (bit_offset_ >= 64) { - byte_offset_ +=8; - bit_offset_ -= 64; - if (bit_offset_ < 64) { - // This should only be executed if seeking to - // 'stream_position' makes the existing buffered_values_ - // invalid. - BufferValues(); - } - } - } -} - -template -inline bool BitReader::GetAligned(int num_bytes, T* v) { - DCHECK_LE(num_bytes, sizeof(T)); - int bytes_read = BitUtil::Ceil(bit_offset_, 8); - if (PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false; - - // Advance byte_offset to next unread byte and read num_bytes - byte_offset_ += bytes_read; - memcpy(v, buffer_ + byte_offset_, num_bytes); - byte_offset_ += num_bytes; - - // Reset buffered_values_ - bit_offset_ = 0; - int bytes_remaining = max_bytes_ - byte_offset_; - if (PREDICT_TRUE(bytes_remaining >= 8)) { - memcpy(&buffered_values_, buffer_ + byte_offset_, 8); - } else { - memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); - } - return true; -} - -inline bool BitReader::GetVlqInt(int32_t* v) { - *v = 0; - int shift = 0; - int num_bytes = 0; - uint8_t byte = 0; - do { - if (!GetAligned(1, &byte)) return false; - *v |= (byte & 0x7F) << shift; - shift += 7; - DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN); - } while ((byte & 0x80) != 0); - return true; -} - -} // namespace doris - -#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H +#define IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H + +#include + +#include "glog/logging.h" +#include "util/bit_stream_utils.h" +#include "util/alignment.h" + +using doris::BitUtil; + +namespace doris { + +inline void BitWriter::PutValue(uint64_t v, int num_bits) { + DCHECK_LE(num_bits, 64); + // Truncate the higher-order bits. This is necessary to + // support signed values. + v &= ~0ULL >> (64 - num_bits); + + + buffered_values_ |= v << bit_offset_; + bit_offset_ += num_bits; + + if (PREDICT_FALSE(bit_offset_ >= 64)) { + // Flush buffered_values_ and write out bits of v that did not fit + buffer_->reserve(ALIGN_UP(byte_offset_ + 8, 8)); + buffer_->resize(byte_offset_ + 8); + DCHECK_LE(byte_offset_ + 8, buffer_->capacity()); + memcpy(buffer_->data() + byte_offset_, &buffered_values_, 8); + buffered_values_ = 0; + byte_offset_ += 8; + bit_offset_ -= 64; + buffered_values_ = BitUtil::ShiftRightZeroOnOverflow(v, (num_bits - bit_offset_)); + } + DCHECK_LT(bit_offset_, 64); +} + +inline void BitWriter::Flush(bool align) { + int num_bytes = BitUtil::Ceil(bit_offset_, 8); + buffer_->reserve(ALIGN_UP(byte_offset_ + num_bytes, 8)); + buffer_->resize(byte_offset_ + num_bytes); + DCHECK_LE(byte_offset_ + num_bytes, buffer_->capacity()); + memcpy(buffer_->data() + byte_offset_, &buffered_values_, num_bytes); + + if (align) { + buffered_values_ = 0; + byte_offset_ += num_bytes; + bit_offset_ = 0; + } +} + +inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { + Flush(/* align */ true); + 
buffer_->reserve(ALIGN_UP(byte_offset_ + num_bytes, 8)); + buffer_->resize(byte_offset_ + num_bytes); + uint8_t* ptr = buffer_->data() + byte_offset_; + byte_offset_ += num_bytes; + DCHECK_LE(byte_offset_, buffer_->capacity()); + return ptr; +} + +template +inline void BitWriter::PutAligned(T val, int num_bytes) { + DCHECK_LE(num_bytes, sizeof(T)); + uint8_t* ptr = GetNextBytePtr(num_bytes); + memcpy(ptr, &val, num_bytes); +} + +inline void BitWriter::PutVlqInt(int32_t v) { + while ((v & 0xFFFFFF80) != 0L) { + PutAligned((v & 0x7F) | 0x80, 1); + v >>= 7; + } + PutAligned(v & 0x7F, 1); +} + + +inline BitReader::BitReader(const uint8_t* buffer, int buffer_len) + : buffer_(buffer), + max_bytes_(buffer_len), + buffered_values_(0), + byte_offset_(0), + bit_offset_(0) { + int num_bytes = std::min(8, max_bytes_); + memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); +} + +inline void BitReader::BufferValues() { + int bytes_remaining = max_bytes_ - byte_offset_; + if (PREDICT_TRUE(bytes_remaining >= 8)) { + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); + } else { + memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); + } +} + +template +inline bool BitReader::GetValue(int num_bits, T* v) { + DCHECK_LE(num_bits, 64); + DCHECK_LE(num_bits, sizeof(T) * 8); + + if (PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false; + + *v = BitUtil::TrailingBits(buffered_values_, bit_offset_ + num_bits) >> bit_offset_; + + bit_offset_ += num_bits; + if (bit_offset_ >= 64) { + byte_offset_ += 8; + bit_offset_ -= 64; + BufferValues(); + // Read bits of v that crossed into new buffered_values_ + *v |= BitUtil::ShiftLeftZeroOnOverflow( + BitUtil::TrailingBits(buffered_values_, bit_offset_), + (num_bits - bit_offset_)); + } + DCHECK_LE(bit_offset_, 64); + return true; +} + +inline void BitReader::Rewind(int num_bits) { + bit_offset_ -= num_bits; + if (bit_offset_ >= 0) { + return; + } + while (bit_offset_ < 0) { + int seek_back 
= std::min(byte_offset_, 8); + byte_offset_ -= seek_back; + bit_offset_ += seek_back * 8; + } + // This should only be executed *if* rewinding by 'num_bits' + // make the existing buffered_values_ invalid + DCHECK_GE(byte_offset_, 0); // Check for underflow + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); +} + +inline void BitReader::SeekToBit(uint stream_position) { + DCHECK_LE(stream_position, max_bytes_ * 8); + + int delta = static_cast(stream_position) - position(); + if (delta == 0) { + return; + } else if (delta < 0) { + Rewind(position() - stream_position); + } else { + bit_offset_ += delta; + while (bit_offset_ >= 64) { + byte_offset_ +=8; + bit_offset_ -= 64; + if (bit_offset_ < 64) { + // This should only be executed if seeking to + // 'stream_position' makes the existing buffered_values_ + // invalid. + BufferValues(); + } + } + } +} + +template +inline bool BitReader::GetAligned(int num_bytes, T* v) { + DCHECK_LE(num_bytes, sizeof(T)); + int bytes_read = BitUtil::Ceil(bit_offset_, 8); + if (PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false; + + // Advance byte_offset to next unread byte and read num_bytes + byte_offset_ += bytes_read; + memcpy(v, buffer_ + byte_offset_, num_bytes); + byte_offset_ += num_bytes; + + // Reset buffered_values_ + bit_offset_ = 0; + int bytes_remaining = max_bytes_ - byte_offset_; + if (PREDICT_TRUE(bytes_remaining >= 8)) { + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); + } else { + memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); + } + return true; +} + +inline bool BitReader::GetVlqInt(int32_t* v) { + *v = 0; + int shift = 0; + int num_bytes = 0; + uint8_t byte = 0; + do { + if (!GetAligned(1, &byte)) return false; + *v |= (byte & 0x7F) << shift; + shift += 7; + DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN); + } while ((byte & 0x80) != 0); + return true; +} + +} // namespace doris + +#endif diff --git a/be/src/util/faststring.cc b/be/src/util/faststring.cc index 
30febe970589dc..49f868704c0b47 100644 --- a/be/src/util/faststring.cc +++ b/be/src/util/faststring.cc @@ -1,72 +1,72 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "util/faststring.h" - -#include -#include - -namespace doris { - -void faststring::GrowByAtLeast(size_t count) { - // Not enough space, need to reserve more. - // Don't reserve exactly enough space for the new string -- that makes it - // too easy to write perf bugs where you get O(n^2) append. - // Instead, alwayhs expand by at least 50%. 
- - size_t to_reserve = len_ + count; - if (len_ + count < len_ * 3 / 2) { - to_reserve = len_ * 3 / 2; - } - GrowArray(to_reserve); -} - -void faststring::GrowArray(size_t newcapacity) { - DCHECK_GE(newcapacity, capacity_); - std::unique_ptr newdata(new uint8_t[newcapacity]); - if (len_ > 0) { - memcpy(&newdata[0], &data_[0], len_); - } - capacity_ = newcapacity; - if (data_ != initial_data_) { - delete[] data_; - } else { - ASAN_POISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); - } - - data_ = newdata.release(); - ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); -} - -void faststring::ShrinkToFitInternal() { - DCHECK_NE(data_, initial_data_); - if (len_ <= kInitialCapacity) { - ASAN_UNPOISON_MEMORY_REGION(initial_data_, len_); - memcpy(initial_data_, &data_[0], len_); - delete[] data_; - data_ = initial_data_; - capacity_ = kInitialCapacity; - } else { - std::unique_ptr newdata(new uint8_t[len_]); - memcpy(&newdata[0], &data_[0], len_); - delete[] data_; - data_ = newdata.release(); - capacity_ = len_; - } -} - -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "util/faststring.h" + +#include +#include + +namespace doris { + +void faststring::GrowByAtLeast(size_t count) { + // Not enough space, need to reserve more. + // Don't reserve exactly enough space for the new string -- that makes it + // too easy to write perf bugs where you get O(n^2) append. + // Instead, alwayhs expand by at least 50%. + + size_t to_reserve = len_ + count; + if (len_ + count < len_ * 3 / 2) { + to_reserve = len_ * 3 / 2; + } + GrowArray(to_reserve); +} + +void faststring::GrowArray(size_t newcapacity) { + DCHECK_GE(newcapacity, capacity_); + std::unique_ptr newdata(new uint8_t[newcapacity]); + if (len_ > 0) { + memcpy(&newdata[0], &data_[0], len_); + } + capacity_ = newcapacity; + if (data_ != initial_data_) { + delete[] data_; + } else { + ASAN_POISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); + } + + data_ = newdata.release(); + ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); +} + +void faststring::ShrinkToFitInternal() { + DCHECK_NE(data_, initial_data_); + if (len_ <= kInitialCapacity) { + ASAN_UNPOISON_MEMORY_REGION(initial_data_, len_); + memcpy(initial_data_, &data_[0], len_); + delete[] data_; + data_ = initial_data_; + capacity_ = kInitialCapacity; + } else { + std::unique_ptr newdata(new uint8_t[len_]); + memcpy(&newdata[0], &data_[0], len_); + delete[] data_; + data_ = newdata.release(); + capacity_ = len_; + } +} + +} // namespace doris diff --git a/be/src/util/faststring.h b/be/src/util/faststring.h index f3892f41709d55..98dfbb7828a841 100644 --- a/be/src/util/faststring.h +++ b/be/src/util/faststring.h @@ -1,257 +1,257 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "gutil/dynamic_annotations.h" -#include "gutil/macros.h" -#include "gutil/port.h" -#include "gutil/strings/fastmem.h" - -namespace doris { - -// A faststring is similar to a std::string, except that it is faster for many -// common use cases (in particular, resize() will fill with uninitialized data -// instead of memsetting to \0) -class faststring { - public: - enum { - kInitialCapacity = 32 - }; - - faststring() : - data_(initial_data_), - len_(0), - capacity_(kInitialCapacity) { - } - - // Construct a string with the given capacity, in bytes. - explicit faststring(size_t capacity) - : data_(initial_data_), - len_(0), - capacity_(kInitialCapacity) { - if (capacity > capacity_) { - data_ = new uint8_t[capacity]; - capacity_ = capacity; - } - ASAN_POISON_MEMORY_REGION(data_, capacity_); - } - - ~faststring() { - ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); - if (data_ != initial_data_) { - delete[] data_; - } - } - - // Reset the valid length of the string to 0. - // - // This does not free up any memory. The capacity of the string remains unchanged. - void clear() { - resize(0); - ASAN_POISON_MEMORY_REGION(data_, capacity_); - } - - // Resize the string to the given length. - // If the new length is larger than the old length, the capacity is expanded as necessary. 
- // - // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are - // not cleared. - void resize(size_t newsize) { - if (newsize > capacity_) { - reserve(newsize); - } - len_ = newsize; - ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); - ASAN_UNPOISON_MEMORY_REGION(data_, len_); - } - - // Releases the underlying array; after this, the buffer is left empty. - // - // NOTE: the data pointer returned by release() is not necessarily the pointer - uint8_t *release() { - uint8_t *ret = data_; - if (ret == initial_data_) { - ret = new uint8_t[len_]; - memcpy(ret, data_, len_); - } - len_ = 0; - capacity_ = kInitialCapacity; - data_ = initial_data_; - ASAN_POISON_MEMORY_REGION(data_, capacity_); - return ret; - } - - // Reserve space for the given total amount of data. If the current capacity is already - // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). - // - // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory - // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. - void reserve(size_t newcapacity) { - if (PREDICT_TRUE(newcapacity <= capacity_)) return; - GrowArray(newcapacity); - } - - // Append the given data to the string, resizing capacity as necessary. - void append(const void *src_v, size_t count) { - const uint8_t *src = reinterpret_cast(src_v); - EnsureRoomForAppend(count); - ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); - - // appending short values is common enough that this - // actually helps, according to benchmarks. 
In theory - // memcpy_inlined should already be just as good, but this - // was ~20% faster for reading a large prefix-coded string file - // where each string was only a few chars different - if (count <= 4) { - uint8_t *p = &data_[len_]; - for (int i = 0; i < count; i++) { - *p++ = *src++; - } - } else { - strings::memcpy_inlined(&data_[len_], src, count); - } - len_ += count; - } - - // Append the given string to this string. - void append(const std::string &str) { - append(str.data(), str.size()); - } - - // Append the given character to this string. - void push_back(const char byte) { - EnsureRoomForAppend(1); - ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); - data_[len_] = byte; - len_++; - } - - // Return the valid length of this string. - size_t length() const { - return len_; - } - - // Return the valid length of this string (identical to length()) - size_t size() const { - return len_; - } - - // Return the allocated capacity of this string. - size_t capacity() const { - return capacity_; - } - - // Return a pointer to the data in this string. Note that this pointer - // may be invalidated by any later non-const operation. - const uint8_t *data() const { - return &data_[0]; - } - - // Return a pointer to the data in this string. Note that this pointer - // may be invalidated by any later non-const operation. - uint8_t *data() { - return &data_[0]; - } - - // Return the given element of this string. Note that this does not perform - // any bounds checking. - const uint8_t &at(size_t i) const { - return data_[i]; - } - - // Return the given element of this string. Note that this does not perform - // any bounds checking. - const uint8_t &operator[](size_t i) const { - return data_[i]; - } - - // Return the given element of this string. Note that this does not perform - // any bounds checking. - uint8_t &operator[](size_t i) { - return data_[i]; - } - - // Reset the contents of this string by copying 'len' bytes from 'src'. 
- void assign_copy(const uint8_t *src, size_t len) { - // Reset length so that the first resize doesn't need to copy the current - // contents of the array. - len_ = 0; - resize(len); - memcpy(data(), src, len); - } - - // Reset the contents of this string by copying from the given std::string. - void assign_copy(const std::string &str) { - assign_copy(reinterpret_cast(str.c_str()), - str.size()); - } - - // Reallocates the internal storage to fit only the current data. - // - // This may revert to using internal storage if the current length is shorter than - // kInitialCapacity. Note that, in that case, after this call, capacity() will return - // a capacity larger than the data length. - // - // Any pointers within this instance are invalidated. - void shrink_to_fit() { - if (data_ == initial_data_ || capacity_ == len_) return; - ShrinkToFitInternal(); - } - - // Return a copy of this string as a std::string. - std::string ToString() const { - return std::string(reinterpret_cast(data()), - len_); - } - - private: - DISALLOW_COPY_AND_ASSIGN(faststring); - - // If necessary, expand the buffer to fit at least 'count' more bytes. - // If the array has to be grown, it is grown by at least 50%. - void EnsureRoomForAppend(size_t count) { - if (PREDICT_TRUE(len_ + count <= capacity_)) { - return; - } - - // Call the non-inline slow path - this reduces the number of instructions - // on the hot path. - GrowByAtLeast(count); - } - - // The slow path of MakeRoomFor. Grows the buffer by either - // 'count' bytes, or 50%, whichever is more. - void GrowByAtLeast(size_t count); - - // Grow the array to the given capacity, which must be more than - // the current capacity. - void GrowArray(size_t newcapacity); - - void ShrinkToFitInternal(); - - uint8_t* data_; - uint8_t initial_data_[kInitialCapacity]; - size_t len_; - size_t capacity_; -}; - -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "gutil/dynamic_annotations.h" +#include "gutil/macros.h" +#include "gutil/port.h" +#include "gutil/strings/fastmem.h" + +namespace doris { + +// A faststring is similar to a std::string, except that it is faster for many +// common use cases (in particular, resize() will fill with uninitialized data +// instead of memsetting to \0) +class faststring { + public: + enum { + kInitialCapacity = 32 + }; + + faststring() : + data_(initial_data_), + len_(0), + capacity_(kInitialCapacity) { + } + + // Construct a string with the given capacity, in bytes. + explicit faststring(size_t capacity) + : data_(initial_data_), + len_(0), + capacity_(kInitialCapacity) { + if (capacity > capacity_) { + data_ = new uint8_t[capacity]; + capacity_ = capacity; + } + ASAN_POISON_MEMORY_REGION(data_, capacity_); + } + + ~faststring() { + ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); + if (data_ != initial_data_) { + delete[] data_; + } + } + + // Reset the valid length of the string to 0. + // + // This does not free up any memory. The capacity of the string remains unchanged. + void clear() { + resize(0); + ASAN_POISON_MEMORY_REGION(data_, capacity_); + } + + // Resize the string to the given length. 
+ // If the new length is larger than the old length, the capacity is expanded as necessary. + // + // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are + // not cleared. + void resize(size_t newsize) { + if (newsize > capacity_) { + reserve(newsize); + } + len_ = newsize; + ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); + ASAN_UNPOISON_MEMORY_REGION(data_, len_); + } + + // Releases the underlying array; after this, the buffer is left empty. + // + // NOTE: the data pointer returned by release() is not necessarily the pointer + uint8_t *release() { + uint8_t *ret = data_; + if (ret == initial_data_) { + ret = new uint8_t[len_]; + memcpy(ret, data_, len_); + } + len_ = 0; + capacity_ = kInitialCapacity; + data_ = initial_data_; + ASAN_POISON_MEMORY_REGION(data_, capacity_); + return ret; + } + + // Reserve space for the given total amount of data. If the current capacity is already + // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). + // + // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory + // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. + void reserve(size_t newcapacity) { + if (PREDICT_TRUE(newcapacity <= capacity_)) return; + GrowArray(newcapacity); + } + + // Append the given data to the string, resizing capacity as necessary. + void append(const void *src_v, size_t count) { + const uint8_t *src = reinterpret_cast(src_v); + EnsureRoomForAppend(count); + ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); + + // appending short values is common enough that this + // actually helps, according to benchmarks. 
In theory + // memcpy_inlined should already be just as good, but this + // was ~20% faster for reading a large prefix-coded string file + // where each string was only a few chars different + if (count <= 4) { + uint8_t *p = &data_[len_]; + for (int i = 0; i < count; i++) { + *p++ = *src++; + } + } else { + strings::memcpy_inlined(&data_[len_], src, count); + } + len_ += count; + } + + // Append the given string to this string. + void append(const std::string &str) { + append(str.data(), str.size()); + } + + // Append the given character to this string. + void push_back(const char byte) { + EnsureRoomForAppend(1); + ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); + data_[len_] = byte; + len_++; + } + + // Return the valid length of this string. + size_t length() const { + return len_; + } + + // Return the valid length of this string (identical to length()) + size_t size() const { + return len_; + } + + // Return the allocated capacity of this string. + size_t capacity() const { + return capacity_; + } + + // Return a pointer to the data in this string. Note that this pointer + // may be invalidated by any later non-const operation. + const uint8_t *data() const { + return &data_[0]; + } + + // Return a pointer to the data in this string. Note that this pointer + // may be invalidated by any later non-const operation. + uint8_t *data() { + return &data_[0]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + const uint8_t &at(size_t i) const { + return data_[i]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + const uint8_t &operator[](size_t i) const { + return data_[i]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + uint8_t &operator[](size_t i) { + return data_[i]; + } + + // Reset the contents of this string by copying 'len' bytes from 'src'. 
+ void assign_copy(const uint8_t *src, size_t len) { + // Reset length so that the first resize doesn't need to copy the current + // contents of the array. + len_ = 0; + resize(len); + memcpy(data(), src, len); + } + + // Reset the contents of this string by copying from the given std::string. + void assign_copy(const std::string &str) { + assign_copy(reinterpret_cast(str.c_str()), + str.size()); + } + + // Reallocates the internal storage to fit only the current data. + // + // This may revert to using internal storage if the current length is shorter than + // kInitialCapacity. Note that, in that case, after this call, capacity() will return + // a capacity larger than the data length. + // + // Any pointers within this instance are invalidated. + void shrink_to_fit() { + if (data_ == initial_data_ || capacity_ == len_) return; + ShrinkToFitInternal(); + } + + // Return a copy of this string as a std::string. + std::string ToString() const { + return std::string(reinterpret_cast(data()), + len_); + } + + private: + DISALLOW_COPY_AND_ASSIGN(faststring); + + // If necessary, expand the buffer to fit at least 'count' more bytes. + // If the array has to be grown, it is grown by at least 50%. + void EnsureRoomForAppend(size_t count) { + if (PREDICT_TRUE(len_ + count <= capacity_)) { + return; + } + + // Call the non-inline slow path - this reduces the number of instructions + // on the hot path. + GrowByAtLeast(count); + } + + // The slow path of MakeRoomFor. Grows the buffer by either + // 'count' bytes, or 50%, whichever is more. + void GrowByAtLeast(size_t count); + + // Grow the array to the given capacity, which must be more than + // the current capacity. 
+ void GrowArray(size_t newcapacity); + + void ShrinkToFitInternal(); + + uint8_t* data_; + uint8_t initial_data_[kInitialCapacity]; + size_t len_; + size_t capacity_; +}; + +} // namespace doris diff --git a/be/src/util/rle_encoding.h b/be/src/util/rle_encoding.h index 8538bc64605d8f..26b03e1b1f08bc 100644 --- a/be/src/util/rle_encoding.h +++ b/be/src/util/rle_encoding.h @@ -1,521 +1,521 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#pragma once - -#include - -#include "gutil/port.h" -#include "util/bit_stream_utils.inline.h" -#include "util/bit_util.h" - -namespace doris { - -// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs -// are sufficiently long, RLE is used, otherwise, the values are just bit-packed -// (literal encoding). -// For both types of runs, there is a byte-aligned indicator which encodes the length -// of the run and the type of the run. -// This encoding has the benefit that when there aren't any long enough runs, values -// are always decoded at fixed (can be precomputed) bit offsets OR both the value and -// the run length are byte aligned. This allows for very efficient decoding -// implementations. 
-// The encoding is: -// encoded-block := run* -// run := literal-run | repeated-run -// literal-run := literal-indicator < literal bytes > -// repeated-run := repeated-indicator < repeated value. padded to byte boundary > -// literal-indicator := varint_encode( number_of_groups << 1 | 1) -// repeated-indicator := varint_encode( number_of_repetitions << 1 ) -// -// Each run is preceded by a varint. The varint's least significant bit is -// used to indicate whether the run is a literal run or a repeated run. The rest -// of the varint is used to determine the length of the run (eg how many times the -// value repeats). -// -// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode -// in groups of 8), so that no matter the bit-width of the value, the sequence will end -// on a byte boundary without padding. -// Given that we know it is a multiple of 8, we store the number of 8-groups rather than -// the actual number of encoded ints. (This means that the total number of encoded values -// can not be determined from the encoded data, since the number of values in the last -// group may not be a multiple of 8). -// There is a break-even point when it is more storage efficient to do run length -// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes -// for both the repeated encoding or the literal encoding. This value can always -// be computed based on the bit-width. -// TODO: think about how to use this for strings. The bit packing isn't quite the same. -// -// Examples with bit-width 1 (eg encoding booleans): -// ---------------------------------------- -// 100 1s followed by 100 0s: -// <1, padded to 1 byte> <0, padded to 1 byte> -// - (total 4 bytes) -// -// alternating 1s and 0s (200 total): -// 200 ints = 25 groups of 8 -// <25 bytes of values, bitpacked> -// (total 26 bytes, 1 byte overhead) -// - -// Decoder class for RLE encoded data. 
-// -// NOTE: the encoded format does not have any length prefix or any other way of -// indicating that the encoded sequence ends at a certain point, so the Decoder -// methods may return some extra bits at the end before the read methods start -// to return 0/false. -template -class RleDecoder { - public: - // Create a decoder object. buffer/buffer_len is the decoded data. - // bit_width is the width of each value (before encoding). - RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) - : bit_reader_(buffer, buffer_len), - bit_width_(bit_width), - current_value_(0), - repeat_count_(0), - literal_count_(0), - rewind_state_(CANT_REWIND) { - DCHECK_GE(bit_width_, 1); - DCHECK_LE(bit_width_, 64); - } - - RleDecoder() {} - - // Skip n values, and returns the number of non-zero entries skipped. - size_t Skip(size_t to_skip); - - // Gets the next value. Returns false if there are no more. - bool Get(T* val); - - // Seek to the previous value. - void RewindOne(); - - // Gets the next run of the same 'val'. Returns 0 if there is no - // more data to be decoded. Will return a run of at most 'max_run' - // values. If there are more values than this, the next call to - // GetNextRun will return more from the same run. - size_t GetNextRun(T* val, size_t max_run); - - private: - bool ReadHeader(); - - enum RewindState { - REWIND_LITERAL, - REWIND_RUN, - CANT_REWIND - }; - - BitReader bit_reader_; - int bit_width_; - uint64_t current_value_; - uint32_t repeat_count_; - uint32_t literal_count_; - RewindState rewind_state_; -}; - -// Class to incrementally build the rle data. -// The encoding has two modes: encoding repeated runs and literal runs. -// If the run is sufficiently short, it is more efficient to encode as a literal run. -// This class does so by buffering 8 values at a time. If they are not all the same -// they are added to the literal run. If they are the same, they are added to the -// repeated run. 
When we switch modes, the previous run is flushed out. -template -class RleEncoder { - public: - // buffer: buffer to write bits to. - // bit_width: max number of bits for value. - // TODO: consider adding a min_repeated_run_length so the caller can control - // when values should be encoded as repeated runs. Currently this is derived - // based on the bit_width, which can determine a storage optimal choice. - explicit RleEncoder(faststring *buffer, int bit_width) - : bit_width_(bit_width), - bit_writer_(buffer) { - DCHECK_GE(bit_width_, 1); - DCHECK_LE(bit_width_, 64); - Clear(); - } - - // Reserve 'num_bytes' bytes for a plain encoded header, set each - // byte with 'val': this is used for the RLE-encoded data blocks in - // order to be able to able to store the initial ordinal position - // and number of elements. This is a part of RleEncoder in order to - // maintain the correct offset in 'buffer'. - void Reserve(int num_bytes, uint8_t val); - - // Encode value. This value must be representable with bit_width_ bits. - void Put(T value, size_t run_length = 1); - - // Flushes any pending values to the underlying buffer. - // Returns the total number of bytes written - int Flush(); - - // Resets all the state in the encoder. - void Clear(); - - int32_t len() const { return bit_writer_.bytes_written(); } - - private: - // Flushes any buffered values. If this is part of a repeated run, this is largely - // a no-op. - // If it is part of a literal run, this will call FlushLiteralRun, which writes - // out the buffered literal values. - // If 'done' is true, the current run would be written even if it would normally - // have been buffered more. This should only be called at the end, when the - // encoder has received all values even if it would normally continue to be - // buffered. - void FlushBufferedValues(bool done); - - // Flushes literal values to the underlying buffer. 
If update_indicator_byte, - // then the current literal run is complete and the indicator byte is updated. - void FlushLiteralRun(bool update_indicator_byte); - - // Flushes a repeated run to the underlying buffer. - void FlushRepeatedRun(); - - // Number of bits needed to encode the value. - const int bit_width_; - - // Underlying buffer. - BitWriter bit_writer_; - - // We need to buffer at most 8 values for literals. This happens when the - // bit_width is 1 (so 8 values fit in one byte). - // TODO: generalize this to other bit widths - uint64_t buffered_values_[8]; - - // Number of values in buffered_values_ - int num_buffered_values_; - - // The current (also last) value that was written and the count of how - // many times in a row that value has been seen. This is maintained even - // if we are in a literal run. If the repeat_count_ get high enough, we switch - // to encoding repeated runs. - uint64_t current_value_; - int repeat_count_; - - // Number of literals in the current run. This does not include the literals - // that might be in buffered_values_. Only after we've got a group big enough - // can we decide if they should part of the literal_count_ or repeat_count_ - int literal_count_; - - // Index of a byte in the underlying buffer that stores the indicator byte. - // This is reserved as soon as we need a literal run but the value is written - // when the literal run is complete. We maintain an index rather than a pointer - // into the underlying buffer because the pointer value may become invalid if - // the underlying buffer is resized. - int literal_indicator_byte_idx_; -}; - -template -inline bool RleDecoder::ReadHeader() { - DCHECK(bit_reader_.is_initialized()); - if (PREDICT_FALSE(literal_count_ == 0 && repeat_count_ == 0)) { - // Read the next run's indicator int, it could be a literal or repeated run - // The int is encoded as a vlq-encoded value. 
- int32_t indicator_value = 0; - bool result = bit_reader_.GetVlqInt(&indicator_value); - if (PREDICT_FALSE(!result)) { - return false; - } - - // lsb indicates if it is a literal run or repeated run - bool is_literal = indicator_value & 1; - if (is_literal) { - literal_count_ = (indicator_value >> 1) * 8; - DCHECK_GT(literal_count_, 0); - } else { - repeat_count_ = indicator_value >> 1; - DCHECK_GT(repeat_count_, 0); - bool result = bit_reader_.GetAligned( - BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); - DCHECK(result); - } - } - return true; -} - -template -inline bool RleDecoder::Get(T* val) { - DCHECK(bit_reader_.is_initialized()); - if (PREDICT_FALSE(!ReadHeader())) { - return false; - } - - if (PREDICT_TRUE(repeat_count_ > 0)) { - *val = current_value_; - --repeat_count_; - rewind_state_ = REWIND_RUN; - } else { - DCHECK(literal_count_ > 0); - bool result = bit_reader_.GetValue(bit_width_, val); - DCHECK(result); - --literal_count_; - rewind_state_ = REWIND_LITERAL; - } - - return true; -} - -template -inline void RleDecoder::RewindOne() { - DCHECK(bit_reader_.is_initialized()); - - switch (rewind_state_) { - case CANT_REWIND: - LOG(FATAL) << "Can't rewind more than once after each read!"; - break; - case REWIND_RUN: - ++repeat_count_; - break; - case REWIND_LITERAL: - { - bit_reader_.Rewind(bit_width_); - ++literal_count_; - break; - } - } - - rewind_state_ = CANT_REWIND; -} - -template -inline size_t RleDecoder::GetNextRun(T* val, size_t max_run) { - DCHECK(bit_reader_.is_initialized()); - DCHECK_GT(max_run, 0); - size_t ret = 0; - size_t rem = max_run; - while (ReadHeader()) { - if (PREDICT_TRUE(repeat_count_ > 0)) { - if (PREDICT_FALSE(ret > 0 && *val != current_value_)) { - return ret; - } - *val = current_value_; - if (repeat_count_ >= rem) { - // The next run is longer than the amount of remaining data - // that the caller wants to read. Only consume it partially. 
- repeat_count_ -= rem; - ret += rem; - return ret; - } - ret += repeat_count_; - rem -= repeat_count_; - repeat_count_ = 0; - } else { - DCHECK(literal_count_ > 0); - if (ret == 0) { - bool has_more = bit_reader_.GetValue(bit_width_, val); - DCHECK(has_more); - literal_count_--; - ret++; - rem--; - } - - while (literal_count_ > 0) { - bool result = bit_reader_.GetValue(bit_width_, ¤t_value_); - DCHECK(result); - if (current_value_ != *val || rem == 0) { - bit_reader_.Rewind(bit_width_); - return ret; - } - ret++; - rem--; - literal_count_--; - } - } - } - return ret; - } - -template -inline size_t RleDecoder::Skip(size_t to_skip) { - DCHECK(bit_reader_.is_initialized()); - - size_t set_count = 0; - while (to_skip > 0) { - bool result = ReadHeader(); - DCHECK(result); - - if (PREDICT_TRUE(repeat_count_ > 0)) { - size_t nskip = (repeat_count_ < to_skip) ? repeat_count_ : to_skip; - repeat_count_ -= nskip; - to_skip -= nskip; - if (current_value_ != 0) { - set_count += nskip; - } - } else { - DCHECK(literal_count_ > 0); - size_t nskip = (literal_count_ < to_skip) ? literal_count_ : to_skip; - literal_count_ -= nskip; - to_skip -= nskip; - for (; nskip > 0; nskip--) { - T value = 0; - bool result = bit_reader_.GetValue(bit_width_, &value); - DCHECK(result); - if (value != 0) { - set_count++; - } - } - } - } - return set_count; -} - -// This function buffers input values 8 at a time. After seeing all 8 values, -// it decides whether they should be encoded as a literal or repeated run. -template -inline void RleEncoder::Put(T value, size_t run_length) { - DCHECK(bit_width_ == 64 || value < (1LL << bit_width_)); - - // TODO(perf): remove the loop and use the repeat_count_ - for (; run_length > 0; run_length--) { - if (PREDICT_TRUE(current_value_ == value)) { - ++repeat_count_; - if (repeat_count_ > 8) { - // This is just a continuation of the current run, no need to buffer the - // values. - // Note that this is the fast path for long repeated runs. 
- continue; - } - } else { - if (repeat_count_ >= 8) { - // We had a run that was long enough but it has ended. Flush the - // current repeated run. - DCHECK_EQ(literal_count_, 0); - FlushRepeatedRun(); - } - repeat_count_ = 1; - current_value_ = value; - } - - buffered_values_[num_buffered_values_] = value; - if (++num_buffered_values_ == 8) { - DCHECK_EQ(literal_count_ % 8, 0); - FlushBufferedValues(false); - } - } -} - -template -inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { - if (literal_indicator_byte_idx_ < 0) { - // The literal indicator byte has not been reserved yet, get one now. - literal_indicator_byte_idx_ = bit_writer_.GetByteIndexAndAdvance(1); - DCHECK_GE(literal_indicator_byte_idx_, 0); - } - - // Write all the buffered values as bit packed literals - for (int i = 0; i < num_buffered_values_; ++i) { - bit_writer_.PutValue(buffered_values_[i], bit_width_); - } - num_buffered_values_ = 0; - - if (update_indicator_byte) { - // At this point we need to write the indicator byte for the literal run. - // We only reserve one byte, to allow for streaming writes of literal values. - // The logic makes sure we flush literal runs often enough to not overrun - // the 1 byte. - int num_groups = BitUtil::Ceil(literal_count_, 8); - int32_t indicator_value = (num_groups << 1) | 1; - DCHECK_EQ(indicator_value & 0xFFFFFF00, 0); - bit_writer_.buffer()->data()[literal_indicator_byte_idx_] = indicator_value; - literal_indicator_byte_idx_ = -1; - literal_count_ = 0; - } -} - -template -inline void RleEncoder::FlushRepeatedRun() { - DCHECK_GT(repeat_count_, 0); - // The lsb of 0 indicates this is a repeated run - int32_t indicator_value = repeat_count_ << 1 | 0; - bit_writer_.PutVlqInt(indicator_value); - bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 8)); - num_buffered_values_ = 0; - repeat_count_ = 0; -} - -// Flush the values that have been buffered. 
At this point we decide whether -// we need to switch between the run types or continue the current one. -template -inline void RleEncoder::FlushBufferedValues(bool done) { - if (repeat_count_ >= 8) { - // Clear the buffered values. They are part of the repeated run now and we - // don't want to flush them out as literals. - num_buffered_values_ = 0; - if (literal_count_ != 0) { - // There was a current literal run. All the values in it have been flushed - // but we still need to update the indicator byte. - DCHECK_EQ(literal_count_ % 8, 0); - DCHECK_EQ(repeat_count_, 8); - FlushLiteralRun(true); - } - DCHECK_EQ(literal_count_, 0); - return; - } - - literal_count_ += num_buffered_values_; - int num_groups = BitUtil::Ceil(literal_count_, 8); - if (num_groups + 1 >= (1 << 6)) { - // We need to start a new literal run because the indicator byte we've reserved - // cannot store more values. - DCHECK_GE(literal_indicator_byte_idx_, 0); - FlushLiteralRun(true); - } else { - FlushLiteralRun(done); - } - repeat_count_ = 0; -} - -template -inline void RleEncoder::Reserve(int num_bytes, uint8_t val) { - for (int i = 0; i < num_bytes; ++i) { - bit_writer_.PutValue(val, 8); - } -} - -template -inline int RleEncoder::Flush() { - if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { - bool all_repeat = literal_count_ == 0 && - (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0); - // There is something pending, figure out if it's a repeated or literal run - if (repeat_count_ > 0 && all_repeat) { - FlushRepeatedRun(); - } else { - literal_count_ += num_buffered_values_; - FlushLiteralRun(true); - repeat_count_ = 0; - } - } - bit_writer_.Flush(); - DCHECK_EQ(num_buffered_values_, 0); - DCHECK_EQ(literal_count_, 0); - DCHECK_EQ(repeat_count_, 0); - return bit_writer_.bytes_written(); -} - -template -inline void RleEncoder::Clear() { - current_value_ = 0; - repeat_count_ = 0; - num_buffered_values_ = 0; - literal_count_ = 0; - 
literal_indicator_byte_idx_ = -1; - bit_writer_.Clear(); -} - -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include + +#include "gutil/port.h" +#include "util/bit_stream_utils.inline.h" +#include "util/bit_util.h" + +namespace doris { + +// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs +// are sufficiently long, RLE is used, otherwise, the values are just bit-packed +// (literal encoding). +// For both types of runs, there is a byte-aligned indicator which encodes the length +// of the run and the type of the run. +// This encoding has the benefit that when there aren't any long enough runs, values +// are always decoded at fixed (can be precomputed) bit offsets OR both the value and +// the run length are byte aligned. This allows for very efficient decoding +// implementations. +// The encoding is: +// encoded-block := run* +// run := literal-run | repeated-run +// literal-run := literal-indicator < literal bytes > +// repeated-run := repeated-indicator < repeated value. 
padded to byte boundary > +// literal-indicator := varint_encode( number_of_groups << 1 | 1) +// repeated-indicator := varint_encode( number_of_repetitions << 1 ) +// +// Each run is preceded by a varint. The varint's least significant bit is +// used to indicate whether the run is a literal run or a repeated run. The rest +// of the varint is used to determine the length of the run (eg how many times the +// value repeats). +// +// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode +// in groups of 8), so that no matter the bit-width of the value, the sequence will end +// on a byte boundary without padding. +// Given that we know it is a multiple of 8, we store the number of 8-groups rather than +// the actual number of encoded ints. (This means that the total number of encoded values +// can not be determined from the encoded data, since the number of values in the last +// group may not be a multiple of 8). +// There is a break-even point when it is more storage efficient to do run length +// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes +// for both the repeated encoding or the literal encoding. This value can always +// be computed based on the bit-width. +// TODO: think about how to use this for strings. The bit packing isn't quite the same. +// +// Examples with bit-width 1 (eg encoding booleans): +// ---------------------------------------- +// 100 1s followed by 100 0s: +// <1, padded to 1 byte> <0, padded to 1 byte> +// - (total 4 bytes) +// +// alternating 1s and 0s (200 total): +// 200 ints = 25 groups of 8 +// <25 bytes of values, bitpacked> +// (total 26 bytes, 1 byte overhead) +// + +// Decoder class for RLE encoded data. +// +// NOTE: the encoded format does not have any length prefix or any other way of +// indicating that the encoded sequence ends at a certain point, so the Decoder +// methods may return some extra bits at the end before the read methods start +// to return 0/false. 
+template +class RleDecoder { + public: + // Create a decoder object. buffer/buffer_len is the decoded data. + // bit_width is the width of each value (before encoding). + RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) + : bit_reader_(buffer, buffer_len), + bit_width_(bit_width), + current_value_(0), + repeat_count_(0), + literal_count_(0), + rewind_state_(CANT_REWIND) { + DCHECK_GE(bit_width_, 1); + DCHECK_LE(bit_width_, 64); + } + + RleDecoder() {} + + // Skip n values, and returns the number of non-zero entries skipped. + size_t Skip(size_t to_skip); + + // Gets the next value. Returns false if there are no more. + bool Get(T* val); + + // Seek to the previous value. + void RewindOne(); + + // Gets the next run of the same 'val'. Returns 0 if there is no + // more data to be decoded. Will return a run of at most 'max_run' + // values. If there are more values than this, the next call to + // GetNextRun will return more from the same run. + size_t GetNextRun(T* val, size_t max_run); + + private: + bool ReadHeader(); + + enum RewindState { + REWIND_LITERAL, + REWIND_RUN, + CANT_REWIND + }; + + BitReader bit_reader_; + int bit_width_; + uint64_t current_value_; + uint32_t repeat_count_; + uint32_t literal_count_; + RewindState rewind_state_; +}; + +// Class to incrementally build the rle data. +// The encoding has two modes: encoding repeated runs and literal runs. +// If the run is sufficiently short, it is more efficient to encode as a literal run. +// This class does so by buffering 8 values at a time. If they are not all the same +// they are added to the literal run. If they are the same, they are added to the +// repeated run. When we switch modes, the previous run is flushed out. +template +class RleEncoder { + public: + // buffer: buffer to write bits to. + // bit_width: max number of bits for value. + // TODO: consider adding a min_repeated_run_length so the caller can control + // when values should be encoded as repeated runs. 
Currently this is derived + // based on the bit_width, which can determine a storage optimal choice. + explicit RleEncoder(faststring *buffer, int bit_width) + : bit_width_(bit_width), + bit_writer_(buffer) { + DCHECK_GE(bit_width_, 1); + DCHECK_LE(bit_width_, 64); + Clear(); + } + + // Reserve 'num_bytes' bytes for a plain encoded header, set each + // byte with 'val': this is used for the RLE-encoded data blocks in + // order to be able to able to store the initial ordinal position + // and number of elements. This is a part of RleEncoder in order to + // maintain the correct offset in 'buffer'. + void Reserve(int num_bytes, uint8_t val); + + // Encode value. This value must be representable with bit_width_ bits. + void Put(T value, size_t run_length = 1); + + // Flushes any pending values to the underlying buffer. + // Returns the total number of bytes written + int Flush(); + + // Resets all the state in the encoder. + void Clear(); + + int32_t len() const { return bit_writer_.bytes_written(); } + + private: + // Flushes any buffered values. If this is part of a repeated run, this is largely + // a no-op. + // If it is part of a literal run, this will call FlushLiteralRun, which writes + // out the buffered literal values. + // If 'done' is true, the current run would be written even if it would normally + // have been buffered more. This should only be called at the end, when the + // encoder has received all values even if it would normally continue to be + // buffered. + void FlushBufferedValues(bool done); + + // Flushes literal values to the underlying buffer. If update_indicator_byte, + // then the current literal run is complete and the indicator byte is updated. + void FlushLiteralRun(bool update_indicator_byte); + + // Flushes a repeated run to the underlying buffer. + void FlushRepeatedRun(); + + // Number of bits needed to encode the value. + const int bit_width_; + + // Underlying buffer. 
+ BitWriter bit_writer_; + + // We need to buffer at most 8 values for literals. This happens when the + // bit_width is 1 (so 8 values fit in one byte). + // TODO: generalize this to other bit widths + uint64_t buffered_values_[8]; + + // Number of values in buffered_values_ + int num_buffered_values_; + + // The current (also last) value that was written and the count of how + // many times in a row that value has been seen. This is maintained even + // if we are in a literal run. If the repeat_count_ get high enough, we switch + // to encoding repeated runs. + uint64_t current_value_; + int repeat_count_; + + // Number of literals in the current run. This does not include the literals + // that might be in buffered_values_. Only after we've got a group big enough + // can we decide if they should part of the literal_count_ or repeat_count_ + int literal_count_; + + // Index of a byte in the underlying buffer that stores the indicator byte. + // This is reserved as soon as we need a literal run but the value is written + // when the literal run is complete. We maintain an index rather than a pointer + // into the underlying buffer because the pointer value may become invalid if + // the underlying buffer is resized. + int literal_indicator_byte_idx_; +}; + +template +inline bool RleDecoder::ReadHeader() { + DCHECK(bit_reader_.is_initialized()); + if (PREDICT_FALSE(literal_count_ == 0 && repeat_count_ == 0)) { + // Read the next run's indicator int, it could be a literal or repeated run + // The int is encoded as a vlq-encoded value. 
+ int32_t indicator_value = 0; + bool result = bit_reader_.GetVlqInt(&indicator_value); + if (PREDICT_FALSE(!result)) { + return false; + } + + // lsb indicates if it is a literal run or repeated run + bool is_literal = indicator_value & 1; + if (is_literal) { + literal_count_ = (indicator_value >> 1) * 8; + DCHECK_GT(literal_count_, 0); + } else { + repeat_count_ = indicator_value >> 1; + DCHECK_GT(repeat_count_, 0); + bool result = bit_reader_.GetAligned( + BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); + DCHECK(result); + } + } + return true; +} + +template +inline bool RleDecoder::Get(T* val) { + DCHECK(bit_reader_.is_initialized()); + if (PREDICT_FALSE(!ReadHeader())) { + return false; + } + + if (PREDICT_TRUE(repeat_count_ > 0)) { + *val = current_value_; + --repeat_count_; + rewind_state_ = REWIND_RUN; + } else { + DCHECK(literal_count_ > 0); + bool result = bit_reader_.GetValue(bit_width_, val); + DCHECK(result); + --literal_count_; + rewind_state_ = REWIND_LITERAL; + } + + return true; +} + +template +inline void RleDecoder::RewindOne() { + DCHECK(bit_reader_.is_initialized()); + + switch (rewind_state_) { + case CANT_REWIND: + LOG(FATAL) << "Can't rewind more than once after each read!"; + break; + case REWIND_RUN: + ++repeat_count_; + break; + case REWIND_LITERAL: + { + bit_reader_.Rewind(bit_width_); + ++literal_count_; + break; + } + } + + rewind_state_ = CANT_REWIND; +} + +template +inline size_t RleDecoder::GetNextRun(T* val, size_t max_run) { + DCHECK(bit_reader_.is_initialized()); + DCHECK_GT(max_run, 0); + size_t ret = 0; + size_t rem = max_run; + while (ReadHeader()) { + if (PREDICT_TRUE(repeat_count_ > 0)) { + if (PREDICT_FALSE(ret > 0 && *val != current_value_)) { + return ret; + } + *val = current_value_; + if (repeat_count_ >= rem) { + // The next run is longer than the amount of remaining data + // that the caller wants to read. Only consume it partially. 
+ repeat_count_ -= rem; + ret += rem; + return ret; + } + ret += repeat_count_; + rem -= repeat_count_; + repeat_count_ = 0; + } else { + DCHECK(literal_count_ > 0); + if (ret == 0) { + bool has_more = bit_reader_.GetValue(bit_width_, val); + DCHECK(has_more); + literal_count_--; + ret++; + rem--; + } + + while (literal_count_ > 0) { + bool result = bit_reader_.GetValue(bit_width_, ¤t_value_); + DCHECK(result); + if (current_value_ != *val || rem == 0) { + bit_reader_.Rewind(bit_width_); + return ret; + } + ret++; + rem--; + literal_count_--; + } + } + } + return ret; + } + +template +inline size_t RleDecoder::Skip(size_t to_skip) { + DCHECK(bit_reader_.is_initialized()); + + size_t set_count = 0; + while (to_skip > 0) { + bool result = ReadHeader(); + DCHECK(result); + + if (PREDICT_TRUE(repeat_count_ > 0)) { + size_t nskip = (repeat_count_ < to_skip) ? repeat_count_ : to_skip; + repeat_count_ -= nskip; + to_skip -= nskip; + if (current_value_ != 0) { + set_count += nskip; + } + } else { + DCHECK(literal_count_ > 0); + size_t nskip = (literal_count_ < to_skip) ? literal_count_ : to_skip; + literal_count_ -= nskip; + to_skip -= nskip; + for (; nskip > 0; nskip--) { + T value = 0; + bool result = bit_reader_.GetValue(bit_width_, &value); + DCHECK(result); + if (value != 0) { + set_count++; + } + } + } + } + return set_count; +} + +// This function buffers input values 8 at a time. After seeing all 8 values, +// it decides whether they should be encoded as a literal or repeated run. +template +inline void RleEncoder::Put(T value, size_t run_length) { + DCHECK(bit_width_ == 64 || value < (1LL << bit_width_)); + + // TODO(perf): remove the loop and use the repeat_count_ + for (; run_length > 0; run_length--) { + if (PREDICT_TRUE(current_value_ == value)) { + ++repeat_count_; + if (repeat_count_ > 8) { + // This is just a continuation of the current run, no need to buffer the + // values. + // Note that this is the fast path for long repeated runs. 
+ continue; + } + } else { + if (repeat_count_ >= 8) { + // We had a run that was long enough but it has ended. Flush the + // current repeated run. + DCHECK_EQ(literal_count_, 0); + FlushRepeatedRun(); + } + repeat_count_ = 1; + current_value_ = value; + } + + buffered_values_[num_buffered_values_] = value; + if (++num_buffered_values_ == 8) { + DCHECK_EQ(literal_count_ % 8, 0); + FlushBufferedValues(false); + } + } +} + +template +inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { + if (literal_indicator_byte_idx_ < 0) { + // The literal indicator byte has not been reserved yet, get one now. + literal_indicator_byte_idx_ = bit_writer_.GetByteIndexAndAdvance(1); + DCHECK_GE(literal_indicator_byte_idx_, 0); + } + + // Write all the buffered values as bit packed literals + for (int i = 0; i < num_buffered_values_; ++i) { + bit_writer_.PutValue(buffered_values_[i], bit_width_); + } + num_buffered_values_ = 0; + + if (update_indicator_byte) { + // At this point we need to write the indicator byte for the literal run. + // We only reserve one byte, to allow for streaming writes of literal values. + // The logic makes sure we flush literal runs often enough to not overrun + // the 1 byte. + int num_groups = BitUtil::Ceil(literal_count_, 8); + int32_t indicator_value = (num_groups << 1) | 1; + DCHECK_EQ(indicator_value & 0xFFFFFF00, 0); + bit_writer_.buffer()->data()[literal_indicator_byte_idx_] = indicator_value; + literal_indicator_byte_idx_ = -1; + literal_count_ = 0; + } +} + +template +inline void RleEncoder::FlushRepeatedRun() { + DCHECK_GT(repeat_count_, 0); + // The lsb of 0 indicates this is a repeated run + int32_t indicator_value = repeat_count_ << 1 | 0; + bit_writer_.PutVlqInt(indicator_value); + bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 8)); + num_buffered_values_ = 0; + repeat_count_ = 0; +} + +// Flush the values that have been buffered. 
At this point we decide whether +// we need to switch between the run types or continue the current one. +template +inline void RleEncoder::FlushBufferedValues(bool done) { + if (repeat_count_ >= 8) { + // Clear the buffered values. They are part of the repeated run now and we + // don't want to flush them out as literals. + num_buffered_values_ = 0; + if (literal_count_ != 0) { + // There was a current literal run. All the values in it have been flushed + // but we still need to update the indicator byte. + DCHECK_EQ(literal_count_ % 8, 0); + DCHECK_EQ(repeat_count_, 8); + FlushLiteralRun(true); + } + DCHECK_EQ(literal_count_, 0); + return; + } + + literal_count_ += num_buffered_values_; + int num_groups = BitUtil::Ceil(literal_count_, 8); + if (num_groups + 1 >= (1 << 6)) { + // We need to start a new literal run because the indicator byte we've reserved + // cannot store more values. + DCHECK_GE(literal_indicator_byte_idx_, 0); + FlushLiteralRun(true); + } else { + FlushLiteralRun(done); + } + repeat_count_ = 0; +} + +template +inline void RleEncoder::Reserve(int num_bytes, uint8_t val) { + for (int i = 0; i < num_bytes; ++i) { + bit_writer_.PutValue(val, 8); + } +} + +template +inline int RleEncoder::Flush() { + if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { + bool all_repeat = literal_count_ == 0 && + (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0); + // There is something pending, figure out if it's a repeated or literal run + if (repeat_count_ > 0 && all_repeat) { + FlushRepeatedRun(); + } else { + literal_count_ += num_buffered_values_; + FlushLiteralRun(true); + repeat_count_ = 0; + } + } + bit_writer_.Flush(); + DCHECK_EQ(num_buffered_values_, 0); + DCHECK_EQ(literal_count_, 0); + DCHECK_EQ(repeat_count_, 0); + return bit_writer_.bytes_written(); +} + +template +inline void RleEncoder::Clear() { + current_value_ = 0; + repeat_count_ = 0; + num_buffered_values_ = 0; + literal_count_ = 0; + 
literal_indicator_byte_idx_ = -1; + bit_writer_.Clear(); +} + +} // namespace doris diff --git a/be/test/exec/es_scan_node_test.cpp b/be/test/exec/es_scan_node_test.cpp index 77f2cb7cf69454..0f6eab51aba1f6 100644 --- a/be/test/exec/es_scan_node_test.cpp +++ b/be/test/exec/es_scan_node_test.cpp @@ -1,154 +1,154 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include - -#include "common/object_pool.h" -#include "exec/es_scan_node.h" -#include "gen_cpp/PlanNodes_types.h" -#include "runtime/mem_pool.h" -#include "runtime/descriptors.h" -#include "runtime/runtime_state.h" -#include "runtime/row_batch.h" -#include "runtime/string_value.h" -#include "runtime/tuple_row.h" -#include "util/runtime_profile.h" -#include "util/debug_util.h" - -using std::vector; - -namespace doris { - -// mock -class EsScanNodeTest : public testing::Test { -public: - EsScanNodeTest() : _runtime_state(TQueryGlobals()) { - _runtime_state._instance_mem_tracker.reset(new MemTracker()); - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::ES_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_table_desc.__isset.esTable = true; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - // TSlotDescriptor - int offset = 1; - int i = 0; - // id - { - TSlotDescriptor t_slot_desc; - t_slot_desc.__set_slotType(TypeDescriptor(TYPE_INT).to_thrift()); - t_slot_desc.__set_columnPos(i); - t_slot_desc.__set_byteOffset(offset); - t_slot_desc.__set_nullIndicatorByte(0); - t_slot_desc.__set_nullIndicatorBit(-1); - t_slot_desc.__set_slotIdx(i); - t_slot_desc.__set_isMaterialized(true); - t_desc_table.slotDescriptors.push_back(t_slot_desc); - offset += sizeof(int); - } - - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = 0; - t_tuple_desc.byteSize = offset; - t_tuple_desc.numNullBytes = 1; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.__isset.slotDescriptors = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - _runtime_state.set_desc_tbl(_desc_tbl); - - // Node Id - _tnode.node_id = 0; - _tnode.node_type = TPlanNodeType::SCHEMA_SCAN_NODE; - _tnode.num_children = 0; - 
_tnode.limit = -1; - _tnode.row_tuples.push_back(0); - _tnode.nullable_tuples.push_back(false); - _tnode.es_scan_node.tuple_id = 0; - std::map properties; - _tnode.es_scan_node.__set_properties(properties); - _tnode.__isset.es_scan_node = true; - } - -protected: - virtual void SetUp() { - } - virtual void TearDown() { - } - TPlanNode _tnode; - ObjectPool _obj_pool; - DescriptorTbl* _desc_tbl; - RuntimeState _runtime_state; -}; - - -TEST_F(EsScanNodeTest, normal_use) { - EsScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - Status status = scan_node.prepare(&_runtime_state); - ASSERT_TRUE(status.ok()); - TEsScanRange es_scan_range; - es_scan_range.__set_index("index1"); - es_scan_range.__set_type("docs"); - es_scan_range.__set_shard_id(0); - TNetworkAddress es_host; - es_host.__set_hostname("host"); - es_host.__set_port(8200); - std::vector es_hosts; - es_hosts.push_back(es_host); - es_scan_range.__set_es_hosts(es_hosts); - TScanRange scan_range; - scan_range.__set_es_scan_range(es_scan_range); - TScanRangeParams scan_range_params; - scan_range_params.__set_scan_range(scan_range); - std::vector scan_ranges; - scan_ranges.push_back(scan_range_params); - - status = scan_node.set_scan_ranges(scan_ranges); - ASSERT_TRUE(status.ok()); - std::stringstream out; - scan_node.debug_string(1, &out); - LOG(WARNING) << out.str(); - - status = scan_node.open(&_runtime_state); - ASSERT_TRUE(status.ok()); - RowBatch row_batch(scan_node._row_descriptor, _runtime_state.batch_size(), new MemTracker(-1)); - bool eos = false; - status = scan_node.get_next(&_runtime_state, &row_batch, &eos); - ASSERT_TRUE(status.ok()); - ASSERT_EQ(2, row_batch.num_rows()); - ASSERT_TRUE(eos); - - status = scan_node.close(&_runtime_state); - ASSERT_TRUE(status.ok()); -} - -} - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "common/object_pool.h" +#include "exec/es_scan_node.h" +#include "gen_cpp/PlanNodes_types.h" +#include "runtime/mem_pool.h" +#include "runtime/descriptors.h" +#include "runtime/runtime_state.h" +#include "runtime/row_batch.h" +#include "runtime/string_value.h" +#include "runtime/tuple_row.h" +#include "util/runtime_profile.h" +#include "util/debug_util.h" + +using std::vector; + +namespace doris { + +// mock +class EsScanNodeTest : public testing::Test { +public: + EsScanNodeTest() : _runtime_state(TQueryGlobals()) { + _runtime_state._instance_mem_tracker.reset(new MemTracker()); + TDescriptorTable t_desc_table; + + // table descriptors + TTableDescriptor t_table_desc; + + t_table_desc.id = 0; + t_table_desc.tableType = TTableType::ES_TABLE; + t_table_desc.numCols = 0; + t_table_desc.numClusteringCols = 0; + t_table_desc.__isset.esTable = true; + t_desc_table.tableDescriptors.push_back(t_table_desc); + t_desc_table.__isset.tableDescriptors = true; + // TSlotDescriptor + int offset = 1; + int i = 0; + // id + { + TSlotDescriptor t_slot_desc; + t_slot_desc.__set_slotType(TypeDescriptor(TYPE_INT).to_thrift()); + t_slot_desc.__set_columnPos(i); + t_slot_desc.__set_byteOffset(offset); + t_slot_desc.__set_nullIndicatorByte(0); + 
t_slot_desc.__set_nullIndicatorBit(-1); + t_slot_desc.__set_slotIdx(i); + t_slot_desc.__set_isMaterialized(true); + t_desc_table.slotDescriptors.push_back(t_slot_desc); + offset += sizeof(int); + } + + TTupleDescriptor t_tuple_desc; + t_tuple_desc.id = 0; + t_tuple_desc.byteSize = offset; + t_tuple_desc.numNullBytes = 1; + t_tuple_desc.tableId = 0; + t_tuple_desc.__isset.tableId = true; + t_desc_table.__isset.slotDescriptors = true; + t_desc_table.tupleDescriptors.push_back(t_tuple_desc); + + DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); + _runtime_state.set_desc_tbl(_desc_tbl); + + // Node Id + _tnode.node_id = 0; + _tnode.node_type = TPlanNodeType::SCHEMA_SCAN_NODE; + _tnode.num_children = 0; + _tnode.limit = -1; + _tnode.row_tuples.push_back(0); + _tnode.nullable_tuples.push_back(false); + _tnode.es_scan_node.tuple_id = 0; + std::map properties; + _tnode.es_scan_node.__set_properties(properties); + _tnode.__isset.es_scan_node = true; + } + +protected: + virtual void SetUp() { + } + virtual void TearDown() { + } + TPlanNode _tnode; + ObjectPool _obj_pool; + DescriptorTbl* _desc_tbl; + RuntimeState _runtime_state; +}; + + +TEST_F(EsScanNodeTest, normal_use) { + EsScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); + Status status = scan_node.prepare(&_runtime_state); + ASSERT_TRUE(status.ok()); + TEsScanRange es_scan_range; + es_scan_range.__set_index("index1"); + es_scan_range.__set_type("docs"); + es_scan_range.__set_shard_id(0); + TNetworkAddress es_host; + es_host.__set_hostname("host"); + es_host.__set_port(8200); + std::vector es_hosts; + es_hosts.push_back(es_host); + es_scan_range.__set_es_hosts(es_hosts); + TScanRange scan_range; + scan_range.__set_es_scan_range(es_scan_range); + TScanRangeParams scan_range_params; + scan_range_params.__set_scan_range(scan_range); + std::vector scan_ranges; + scan_ranges.push_back(scan_range_params); + + status = scan_node.set_scan_ranges(scan_ranges); + ASSERT_TRUE(status.ok()); + std::stringstream out; + 
scan_node.debug_string(1, &out); + LOG(WARNING) << out.str(); + + status = scan_node.open(&_runtime_state); + ASSERT_TRUE(status.ok()); + RowBatch row_batch(scan_node._row_descriptor, _runtime_state.batch_size(), new MemTracker(-1)); + bool eos = false; + status = scan_node.get_next(&_runtime_state, &row_batch, &eos); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(2, row_batch.num_rows()); + ASSERT_TRUE(eos); + + status = scan_node.close(&_runtime_state); + ASSERT_TRUE(status.ok()); +} + +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp b/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp index f2f4d9383e87f3..c99347119f1913 100644 --- a/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp @@ -1,229 +1,229 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include - -#include "olap/rowset/segment_v2/options.h" -#include "olap/rowset/segment_v2/page_builder.h" -#include "olap/rowset/segment_v2/page_decoder.h" -#include "olap/rowset/segment_v2/bitshuffle_page.h" -#include "util/arena.h" -#include "util/logging.h" - -using doris::segment_v2::PageBuilderOptions; - -namespace doris { - -class BitShufflePageTest : public testing::Test { -public: - virtual ~BitShufflePageTest() {} - - template - void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { - Arena arena; - uint8_t null_bitmap = 0; - ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, &arena); - ColumnBlockView column_block_view(&block); - - size_t n = 1; - decoder->_copy_next_values(n, column_block_view.data()); - ASSERT_EQ(1, n); - } - - template - void test_encode_decode_page_template(typename TypeTraits::CppType* src, - size_t size) { - typedef typename TypeTraits::CppType CppType; - PageBuilderOptions options; - options.data_page_size = 256 * 1024; - PageBuilderType page_builder(options); - - page_builder.add(reinterpret_cast(src), &size); - Slice s = page_builder.finish(); - LOG(INFO) << "RLE Encoded size for 10k values: " << s.size - << ", original size:" << size * sizeof(CppType); - - segment_v2::PageDecoderOptions decoder_options; - PageDecoderType page_decoder(s, decoder_options); - Status status = page_decoder.init(); - ASSERT_TRUE(status.ok()); - ASSERT_EQ(0, page_decoder.current_index()); - - Arena arena; - - CppType* values = reinterpret_cast(arena.Allocate(size * sizeof(CppType))); - uint8_t* null_bitmap = reinterpret_cast(arena.Allocate(BitmapSize(size))); - ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, &arena); - ColumnBlockView column_block_view(&block); - - status = page_decoder.next_batch(&size, &column_block_view); - ASSERT_TRUE(status.ok()); - - CppType* decoded = (CppType*)values; - for (uint i = 0; i < size; i++) { - if (src[i] != decoded[i]) { - FAIL() << "Fail at 
index " << i << - " inserted=" << src[i] << " got=" << decoded[i]; - } - } - - // Test Seek within block by ordinal - for (int i = 0; i < 100; i++) { - int seek_off = random() % size; - page_decoder.seek_to_position_in_page(seek_off); - EXPECT_EQ((int32_t )(seek_off), page_decoder.current_index()); - CppType ret; - copy_one(&page_decoder, &ret); - EXPECT_EQ(decoded[seek_off], ret); - } - } -}; - -// Test for bitshuffle block, for INT32, INT64, FLOAT, DOUBLE -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = random(); - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt64BlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int64_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = random(); - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleFloatBlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr floats(new float[size]); - for (int i = 0; i < size; i++) { - floats.get()[i] = random() + static_cast(random())/INT_MAX; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(floats.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr doubles(new double[size]); - for (int i = 0; i < size; i++) { - doubles.get()[i] = random() + static_cast(random())/INT_MAX; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(doubles.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderEqual) { - const uint32_t size = 10000; - - std::unique_ptr doubles(new double[size]); - for (int i = 0; i < size; i++) { - doubles.get()[i] = 
19880217.19890323; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(doubles.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderSequence) { - const uint32_t size = 10000; - - double base = 19880217.19890323; - double delta = 13.14; - std::unique_ptr doubles(new double[size]); - for (int i = 0; i < size; i++) { - base = base + delta; - doubles.get()[i] = base; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(doubles.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderEqual) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 12345; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderMaxNumberEqual) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 1234567890; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderSequence) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - int32_t number = 0; - for (int i = 0; i < size; i++) { - ints.get()[i] = ++number; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderMaxNumberSequence) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - int32_t number = 0; - for (int i = 0; i < size; i++) { - ints.get()[i] = 1234567890 + number; - ++number; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/bitshuffle_page.h" +#include "util/arena.h" +#include "util/logging.h" + +using doris::segment_v2::PageBuilderOptions; + +namespace doris { + +class BitShufflePageTest : public testing::Test { +public: + virtual ~BitShufflePageTest() {} + + template + void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { + Arena arena; + uint8_t null_bitmap = 0; + ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, &arena); + ColumnBlockView column_block_view(&block); + + size_t n = 1; + decoder->_copy_next_values(n, column_block_view.data()); + ASSERT_EQ(1, n); + } + + template + void test_encode_decode_page_template(typename TypeTraits::CppType* src, + size_t size) { + typedef typename TypeTraits::CppType CppType; + PageBuilderOptions options; + options.data_page_size = 256 * 1024; + PageBuilderType page_builder(options); + + page_builder.add(reinterpret_cast(src), &size); + Slice s = page_builder.finish(); + LOG(INFO) << "RLE Encoded size for 10k values: " << s.size + << 
", original size:" << size * sizeof(CppType); + + segment_v2::PageDecoderOptions decoder_options; + PageDecoderType page_decoder(s, decoder_options); + Status status = page_decoder.init(); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(0, page_decoder.current_index()); + + Arena arena; + + CppType* values = reinterpret_cast(arena.Allocate(size * sizeof(CppType))); + uint8_t* null_bitmap = reinterpret_cast(arena.Allocate(BitmapSize(size))); + ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, &arena); + ColumnBlockView column_block_view(&block); + + status = page_decoder.next_batch(&size, &column_block_view); + ASSERT_TRUE(status.ok()); + + CppType* decoded = (CppType*)values; + for (uint i = 0; i < size; i++) { + if (src[i] != decoded[i]) { + FAIL() << "Fail at index " << i << + " inserted=" << src[i] << " got=" << decoded[i]; + } + } + + // Test Seek within block by ordinal + for (int i = 0; i < 100; i++) { + int seek_off = random() % size; + page_decoder.seek_to_position_in_page(seek_off); + EXPECT_EQ((int32_t )(seek_off), page_decoder.current_index()); + CppType ret; + copy_one(&page_decoder, &ret); + EXPECT_EQ(decoded[seek_off], ret); + } + } +}; + +// Test for bitshuffle block, for INT32, INT64, FLOAT, DOUBLE +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = random(); + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt64BlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int64_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = random(); + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleFloatBlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr 
floats(new float[size]); + for (int i = 0; i < size; i++) { + floats.get()[i] = random() + static_cast(random())/INT_MAX; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(floats.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + doubles.get()[i] = random() + static_cast(random())/INT_MAX; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderEqual) { + const uint32_t size = 10000; + + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + doubles.get()[i] = 19880217.19890323; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderSequence) { + const uint32_t size = 10000; + + double base = 19880217.19890323; + double delta = 13.14; + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + base = base + delta; + doubles.get()[i] = base; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderEqual) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 12345; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderMaxNumberEqual) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 1234567890; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, 
TestBitShuffleInt32BlockEncoderSequence) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + int32_t number = 0; + for (int i = 0; i < size; i++) { + ints.get()[i] = ++number; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderMaxNumberSequence) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + int32_t number = 0; + for (int i = 0; i < size; i++) { + ints.get()[i] = 1234567890 + number; + ++number; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/olap/rowset/segment_v2/rle_page_test.cpp b/be/test/olap/rowset/segment_v2/rle_page_test.cpp index e30e45df5ec2c5..97015950ab8d69 100644 --- a/be/test/olap/rowset/segment_v2/rle_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/rle_page_test.cpp @@ -1,193 +1,193 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include - -#include "olap/rowset/segment_v2/options.h" -#include "olap/rowset/segment_v2/page_builder.h" -#include "olap/rowset/segment_v2/page_decoder.h" -#include "olap/rowset/segment_v2/rle_page.h" -#include "util/arena.h" -#include "util/logging.h" - -using doris::segment_v2::PageBuilderOptions; -using doris::segment_v2::PageDecoderOptions; - -namespace doris { - -class RlePageTest : public testing::Test { -public: - virtual ~RlePageTest() { } - - template - void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { - Arena arena; - uint8_t null_bitmap = 0; - ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, &arena); - ColumnBlockView column_block_view(&block); - - size_t n = 1; - decoder->next_batch(&n, &column_block_view); - ASSERT_EQ(1, n); - } - - template - void test_encode_decode_page_template(typename TypeTraits::CppType* src, - size_t size) { - typedef typename TypeTraits::CppType CppType; - PageBuilderOptions builder_options; - builder_options.data_page_size = 256 * 1024; - PageBuilderType rle_page_builder(builder_options); - rle_page_builder.add(reinterpret_cast(src), &size); - Slice s = rle_page_builder.finish(); - ASSERT_EQ(size, rle_page_builder.count()); - LOG(INFO) << "RLE Encoded size for 10k values: " << s.size - << ", original size:" << size * sizeof(CppType); - - PageDecoderOptions decodeder_options; - PageDecoderType rle_page_decoder(s, decodeder_options); - Status status = rle_page_decoder.init(); - ASSERT_TRUE(status.ok()); - ASSERT_EQ(0, rle_page_decoder.current_index()); - ASSERT_EQ(size, rle_page_decoder.count()); - - Arena arena; - - CppType* values = reinterpret_cast(arena.Allocate(size * sizeof(CppType))); - uint8_t* null_bitmap = reinterpret_cast(arena.Allocate(BitmapSize(size))); - ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, &arena); - ColumnBlockView column_block_view(&block); - size_t size_to_fetch = size; - status = 
rle_page_decoder.next_batch(&size_to_fetch, &column_block_view); - ASSERT_TRUE(status.ok()); - ASSERT_EQ(size, size_to_fetch); - - for (uint i = 0; i < size; i++) { - if (src[i] != values[i]) { - FAIL() << "Fail at index " << i << - " inserted=" << src[i] << " got=" << values[i]; - } - } - - // Test Seek within block by ordinal - for (int i = 0; i < 100; i++) { - int seek_off = random() % size; - rle_page_decoder.seek_to_position_in_page(seek_off); - EXPECT_EQ((int32_t )(seek_off), rle_page_decoder.current_index()); - CppType ret; - copy_one(&rle_page_decoder, &ret); - EXPECT_EQ(values[seek_off], ret); - } - } -}; - -// Test for rle block, for INT32, BOOL -TEST_F(RlePageTest, TestRleInt32BlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = random(); - } - - test_encode_decode_page_template, - segment_v2::RlePageDecoder >(ints.get(), size); -} - -TEST_F(RlePageTest, TestRleInt32BlockEncoderEqual) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 12345; - } - - test_encode_decode_page_template, - segment_v2::RlePageDecoder >(ints.get(), size); -} - -TEST_F(RlePageTest, TestRleInt32BlockEncoderSequence) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 12345 + i; - } - - test_encode_decode_page_template, - segment_v2::RlePageDecoder >(ints.get(), size); -} - -TEST_F(RlePageTest, TestRleInt32BlockEncoderSize) { - size_t size = 100; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 0; - } - PageBuilderOptions builder_options; - builder_options.data_page_size = 256 * 1024; - segment_v2::RlePageBuilder rle_page_builder(builder_options); - rle_page_builder.add(reinterpret_cast(ints.get()), &size); - Slice s = rle_page_builder.finish(); - // 4 bytes header - // 2 bytes 
indicate_value(): 0x64 << 1 | 1 = 201 - // 4 bytes values - ASSERT_EQ(10, s.size); -} - -TEST_F(RlePageTest, TestRleBoolBlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr bools(new bool[size]); - for (int i = 0; i < size; i++) { - if (random() % 2 == 0) { - bools.get()[i] = true; - } else { - bools.get()[i] = false; - } - } - - test_encode_decode_page_template, - segment_v2::RlePageDecoder >(bools.get(), size); -} - -TEST_F(RlePageTest, TestRleBoolBlockEncoderSize) { - size_t size = 100; - - std::unique_ptr bools(new bool[size]); - for (int i = 0; i < size; i++) { - bools.get()[i] = true; - } - PageBuilderOptions builder_options; - builder_options.data_page_size = 256 * 1024; - segment_v2::RlePageBuilder rle_page_builder(builder_options); - rle_page_builder.add(reinterpret_cast(bools.get()), &size); - Slice s = rle_page_builder.finish(); - // 4 bytes header - // 2 bytes indicate_value(): 0x64 << 1 | 1 = 201 - // 1 bytes values - ASSERT_EQ(7, s.size); -} - -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/rle_page.h" +#include "util/arena.h" +#include "util/logging.h" + +using doris::segment_v2::PageBuilderOptions; +using doris::segment_v2::PageDecoderOptions; + +namespace doris { + +class RlePageTest : public testing::Test { +public: + virtual ~RlePageTest() { } + + template + void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { + Arena arena; + uint8_t null_bitmap = 0; + ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, &arena); + ColumnBlockView column_block_view(&block); + + size_t n = 1; + decoder->next_batch(&n, &column_block_view); + ASSERT_EQ(1, n); + } + + template + void test_encode_decode_page_template(typename TypeTraits::CppType* src, + size_t size) { + typedef typename TypeTraits::CppType CppType; + PageBuilderOptions builder_options; + builder_options.data_page_size = 256 * 1024; + PageBuilderType rle_page_builder(builder_options); + rle_page_builder.add(reinterpret_cast(src), &size); + Slice s = rle_page_builder.finish(); + ASSERT_EQ(size, rle_page_builder.count()); + LOG(INFO) << "RLE Encoded size for 10k values: " << s.size + << ", original size:" << size * sizeof(CppType); + + PageDecoderOptions decodeder_options; + PageDecoderType rle_page_decoder(s, decodeder_options); + Status status = rle_page_decoder.init(); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(0, rle_page_decoder.current_index()); + ASSERT_EQ(size, rle_page_decoder.count()); + + Arena arena; + + CppType* values = reinterpret_cast(arena.Allocate(size * sizeof(CppType))); + uint8_t* null_bitmap = reinterpret_cast(arena.Allocate(BitmapSize(size))); + ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, &arena); + ColumnBlockView column_block_view(&block); + size_t size_to_fetch = size; + status = 
rle_page_decoder.next_batch(&size_to_fetch, &column_block_view); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(size, size_to_fetch); + + for (uint i = 0; i < size; i++) { + if (src[i] != values[i]) { + FAIL() << "Fail at index " << i << + " inserted=" << src[i] << " got=" << values[i]; + } + } + + // Test Seek within block by ordinal + for (int i = 0; i < 100; i++) { + int seek_off = random() % size; + rle_page_decoder.seek_to_position_in_page(seek_off); + EXPECT_EQ((int32_t )(seek_off), rle_page_decoder.current_index()); + CppType ret; + copy_one(&rle_page_decoder, &ret); + EXPECT_EQ(values[seek_off], ret); + } + } +}; + +// Test for rle block, for INT32, BOOL +TEST_F(RlePageTest, TestRleInt32BlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = random(); + } + + test_encode_decode_page_template, + segment_v2::RlePageDecoder >(ints.get(), size); +} + +TEST_F(RlePageTest, TestRleInt32BlockEncoderEqual) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 12345; + } + + test_encode_decode_page_template, + segment_v2::RlePageDecoder >(ints.get(), size); +} + +TEST_F(RlePageTest, TestRleInt32BlockEncoderSequence) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 12345 + i; + } + + test_encode_decode_page_template, + segment_v2::RlePageDecoder >(ints.get(), size); +} + +TEST_F(RlePageTest, TestRleInt32BlockEncoderSize) { + size_t size = 100; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 0; + } + PageBuilderOptions builder_options; + builder_options.data_page_size = 256 * 1024; + segment_v2::RlePageBuilder rle_page_builder(builder_options); + rle_page_builder.add(reinterpret_cast(ints.get()), &size); + Slice s = rle_page_builder.finish(); + // 4 bytes header + // 2 bytes 
indicate_value(): 0x64 << 1 | 1 = 201 + // 4 bytes values + ASSERT_EQ(10, s.size); +} + +TEST_F(RlePageTest, TestRleBoolBlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr bools(new bool[size]); + for (int i = 0; i < size; i++) { + if (random() % 2 == 0) { + bools.get()[i] = true; + } else { + bools.get()[i] = false; + } + } + + test_encode_decode_page_template, + segment_v2::RlePageDecoder >(bools.get(), size); +} + +TEST_F(RlePageTest, TestRleBoolBlockEncoderSize) { + size_t size = 100; + + std::unique_ptr bools(new bool[size]); + for (int i = 0; i < size; i++) { + bools.get()[i] = true; + } + PageBuilderOptions builder_options; + builder_options.data_page_size = 256 * 1024; + segment_v2::RlePageBuilder rle_page_builder(builder_options); + rle_page_builder.add(reinterpret_cast(bools.get()), &size); + Slice s = rle_page_builder.finish(); + // 4 bytes header + // 2 bytes indicate_value(): 0x64 << 1 | 1 = 201 + // 1 bytes values + ASSERT_EQ(7, s.size); +} + +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/faststring_test.cpp b/be/test/util/faststring_test.cpp index 2a6120f3fd8da9..68231c0dc97992 100644 --- a/be/test/util/faststring_test.cpp +++ b/be/test/util/faststring_test.cpp @@ -1,83 +1,83 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include - -#include -#include - -#include "util/faststring.h" -#include "util/random.h" - -namespace doris { -class FaststringTest : public ::testing::Test {}; - -void RandomString(void* dest, size_t n, doris::Random* rng) { - size_t i = 0; - uint32_t random = rng->Next(); - char* cdest = static_cast(dest); - static const size_t sz = sizeof(random); - if (n >= sz) { - for (i = 0; i <= n - sz; i += sz) { - memcpy(&cdest[i], &random, sizeof(random)); - random = rng->Next(); - } - } - memcpy(cdest + i, &random, n - i); -} - -TEST_F(FaststringTest, TestShrinkToFit_Empty) { - faststring s; - s.shrink_to_fit(); - ASSERT_EQ(faststring::kInitialCapacity, s.capacity()); -} - -// Test that, if the string contents is shorter than the initial capacity -// of the faststring, shrink_to_fit() leaves the string in the built-in -// array. 
-TEST_F(FaststringTest, TestShrinkToFit_SmallerThanInitialCapacity) { - faststring s; - s.append("hello"); - s.shrink_to_fit(); - ASSERT_EQ(faststring::kInitialCapacity, s.capacity()); -} - -TEST_F(FaststringTest, TestShrinkToFit_Random) { - doris::Random r(time(nullptr)); - int kMaxSize = faststring::kInitialCapacity * 2; - std::unique_ptr random_bytes(new char[kMaxSize]); - RandomString(random_bytes.get(), kMaxSize, &r); - - faststring s; - for (int i = 0; i < 100; i++) { - int new_size = r.Uniform(kMaxSize); - s.resize(new_size); - memcpy(s.data(), random_bytes.get(), new_size); - s.shrink_to_fit(); - ASSERT_EQ(0, memcmp(s.data(), random_bytes.get(), new_size)); - ASSERT_EQ(std::max(faststring::kInitialCapacity, new_size), s.capacity()); - } -} - -} // namespace doris - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include +#include + +#include "util/faststring.h" +#include "util/random.h" + +namespace doris { +class FaststringTest : public ::testing::Test {}; + +void RandomString(void* dest, size_t n, doris::Random* rng) { + size_t i = 0; + uint32_t random = rng->Next(); + char* cdest = static_cast(dest); + static const size_t sz = sizeof(random); + if (n >= sz) { + for (i = 0; i <= n - sz; i += sz) { + memcpy(&cdest[i], &random, sizeof(random)); + random = rng->Next(); + } + } + memcpy(cdest + i, &random, n - i); +} + +TEST_F(FaststringTest, TestShrinkToFit_Empty) { + faststring s; + s.shrink_to_fit(); + ASSERT_EQ(faststring::kInitialCapacity, s.capacity()); +} + +// Test that, if the string contents is shorter than the initial capacity +// of the faststring, shrink_to_fit() leaves the string in the built-in +// array. +TEST_F(FaststringTest, TestShrinkToFit_SmallerThanInitialCapacity) { + faststring s; + s.append("hello"); + s.shrink_to_fit(); + ASSERT_EQ(faststring::kInitialCapacity, s.capacity()); +} + +TEST_F(FaststringTest, TestShrinkToFit_Random) { + doris::Random r(time(nullptr)); + int kMaxSize = faststring::kInitialCapacity * 2; + std::unique_ptr random_bytes(new char[kMaxSize]); + RandomString(random_bytes.get(), kMaxSize, &r); + + faststring s; + for (int i = 0; i < 100; i++) { + int new_size = r.Uniform(kMaxSize); + s.resize(new_size); + memcpy(s.data(), random_bytes.get(), new_size); + s.shrink_to_fit(); + ASSERT_EQ(0, memcmp(s.data(), random_bytes.get(), new_size)); + ASSERT_EQ(std::max(faststring::kInitialCapacity, new_size), s.capacity()); + } +} + +} // namespace doris + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/rle_encoding_test.cpp b/be/test/util/rle_encoding_test.cpp index 50c92c707da2fd..8c8491ca44d412 100644 --- a/be/test/util/rle_encoding_test.cpp +++ b/be/test/util/rle_encoding_test.cpp @@ -1,426 +1,426 @@ -// Licensed to the 
Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include -#include -#include -#include -#include - -// Must come before gtest.h. -#include -#include -#include - -#include "util/bit_stream_utils.h" -#include "util/bit_stream_utils.inline.h" -#include "util/bit_util.h" -#include "util/faststring.h" -#include "util/rle_encoding.h" -#include "util/debug_util.h" - -using std::string; -using std::vector; - -namespace doris { - -const int kMaxWidth = 64; - -class TestRle : public testing::Test {}; -// Validates encoding of values by encoding and decoding them. If -// expected_encoding != NULL, also validates that the encoded buffer is -// exactly 'expected_encoding'. -// if expected_len is not -1, it will validate the encoded size is correct. 
-template -void ValidateRle(const vector& values, int bit_width, - uint8_t* expected_encoding, int expected_len) { - faststring buffer; - RleEncoder encoder(&buffer, bit_width); - - for (const auto& value : values) { - encoder.Put(value); - } - int encoded_len = encoder.Flush(); - - if (expected_len != -1) { - EXPECT_EQ(encoded_len, expected_len); - } - if (expected_encoding != nullptr) { - EXPECT_EQ(memcmp(buffer.data(), expected_encoding, expected_len), 0) - << "\n" - << "Expected: " << hexdump((const char*)expected_encoding, expected_len) << "\n" - << "Got: " << hexdump((const char*)buffer.data(), buffer.size()); - } - - // Verify read - RleDecoder decoder(buffer.data(), encoded_len, bit_width); - for (const auto& value : values) { - T val = 0; - bool result = decoder.Get(&val); - EXPECT_TRUE(result); - EXPECT_EQ(value, val); - } -} - -TEST(Rle, SpecificSequences) { - const int kTestLen = 1024; - uint8_t expected_buffer[kTestLen]; - vector values; - - // Test 50 0' followed by 50 1's - values.resize(100); - for (int i = 0; i < 50; ++i) { - values[i] = 0; - } - for (int i = 50; i < 100; ++i) { - values[i] = 1; - } - - // expected_buffer valid for bit width <= 1 byte - expected_buffer[0] = (50 << 1); - expected_buffer[1] = 0; - expected_buffer[2] = (50 << 1); - expected_buffer[3] = 1; - for (int width = 1; width <= 8; ++width) { - ValidateRle(values, width, expected_buffer, 4); - } - - for (int width = 9; width <= kMaxWidth; ++width) { - ValidateRle(values, width, nullptr, 2 * (1 + BitUtil::Ceil(width, 8))); - } - - // Test 100 0's and 1's alternating - for (int i = 0; i < 100; ++i) { - values[i] = i % 2; - } - int num_groups = BitUtil::Ceil(100, 8); - expected_buffer[0] = (num_groups << 1) | 1; - for (int i = 0; i < 100/8; ++i) { - expected_buffer[i + 1] = BOOST_BINARY(1 0 1 0 1 0 1 0); // 0xaa - } - // Values for the last 4 0 and 1's - expected_buffer[1 + 100/8] = BOOST_BINARY(0 0 0 0 1 0 1 0); // 0x0a - - // num_groups and expected_buffer only valid for bit 
width = 1 - ValidateRle(values, 1, expected_buffer, 1 + num_groups); - for (int width = 2; width <= kMaxWidth; ++width) { - ValidateRle(values, width, nullptr, 1 + BitUtil::Ceil(width * 100, 8)); - } -} - -// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value -// is used, otherwise alternating values are used. -void TestRleValues(int bit_width, int num_vals, int value = -1) { - const uint64_t mod = bit_width == 64 ? 1ULL : 1ULL << bit_width; - vector values; - for (uint64_t v = 0; v < num_vals; ++v) { - values.push_back((value != -1) ? value : (bit_width == 64 ? v : (v % mod))); - } - ValidateRle(values, bit_width, nullptr, -1); -} - -TEST(Rle, TestValues) { - for (int width = 1; width <= kMaxWidth; ++width) { - TestRleValues(width, 1); - TestRleValues(width, 1024); - TestRleValues(width, 1024, 0); - TestRleValues(width, 1024, 1); - } -} - -class BitRle : public testing::Test { -public: - BitRle() { - } - - virtual ~BitRle() { - } -}; - -// Tests all true/false values -TEST_F(BitRle, AllSame) { - const int kTestLen = 1024; - vector values; - - for (int v = 0; v < 2; ++v) { - values.clear(); - for (int i = 0; i < kTestLen; ++i) { - values.push_back(v ? true : false); - } - - ValidateRle(values, 1, nullptr, 3); - } -} - -// Test that writes out a repeated group and then a literal -// group but flush before finishing. -TEST_F(BitRle, Flush) { - vector values; - for (int i = 0; i < 16; ++i) values.push_back(1); - values.push_back(false); - ValidateRle(values, 1, nullptr, -1); - values.push_back(true); - ValidateRle(values, 1, nullptr, -1); - values.push_back(true); - ValidateRle(values, 1, nullptr, -1); - values.push_back(true); - ValidateRle(values, 1, nullptr, -1); -} - -// Test some random bool sequences. 
-TEST_F(BitRle, RandomBools) { - int iters = 0; - const int n_iters = 20; - while (iters < n_iters) { - srand(iters++); - if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; - vector values; - bool parity = 0; - for (int i = 0; i < 1000; ++i) { - int group_size = rand() % 20 + 1; // NOLINT(*) - if (group_size > 16) { - group_size = 1; - } - for (int i = 0; i < group_size; ++i) { - values.push_back(parity); - } - parity = !parity; - } - ValidateRle(values, (iters % kMaxWidth) + 1, nullptr, -1); - } -} - -// Test some random 64-bit sequences. -TEST_F(BitRle, Random64Bit) { - int iters = 0; - const int n_iters = 20; - while (iters < n_iters) { - srand(iters++); - if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; - vector values; - for (int i = 0; i < 1000; ++i) { - int group_size = rand() % 20 + 1; // NOLINT(*) - uint64_t cur_value = (static_cast(rand()) << 32) + static_cast(rand()); - if (group_size > 16) { - group_size = 1; - } - for (int i = 0; i < group_size; ++i) { - values.push_back(cur_value); - } - - } - ValidateRle(values, 64, nullptr, -1); - } -} - -// Test a sequence of 1 0's, 2 1's, 3 0's. etc -// e.g. 
011000111100000 -TEST_F(BitRle, RepeatedPattern) { - vector values; - const int min_run = 1; - const int max_run = 32; - - for (int i = min_run; i <= max_run; ++i) { - int v = i % 2; - for (int j = 0; j < i; ++j) { - values.push_back(v); - } - } - - // And go back down again - for (int i = max_run; i >= min_run; --i) { - int v = i % 2; - for (int j = 0; j < i; ++j) { - values.push_back(v); - } - } - - ValidateRle(values, 1, nullptr, -1); -} - -TEST_F(TestRle, TestBulkPut) { - size_t run_length; - bool val = false; - - faststring buffer(1); - RleEncoder encoder(&buffer, 1); - encoder.Put(true, 10); - encoder.Put(false, 7); - encoder.Put(true, 5); - encoder.Put(true, 15); - encoder.Flush(); - - RleDecoder decoder(buffer.data(), encoder.len(), 1); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_TRUE(val); - ASSERT_EQ(10, run_length); - - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_FALSE(val); - ASSERT_EQ(7, run_length); - - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_TRUE(val); - ASSERT_EQ(20, run_length); - - ASSERT_EQ(0, decoder.GetNextRun(&val, std::numeric_limits::max())); -} - -TEST_F(TestRle, TestGetNextRun) { - // Repeat the test with different number of items - for (int num_items = 7; num_items < 200; num_items += 13) { - // Test different block patterns - // 1: 01010101 01010101 - // 2: 00110011 00110011 - // 3: 00011100 01110001 - // ... 
- for (int block = 1; block <= 20; ++block) { - faststring buffer(1); - RleEncoder encoder(&buffer, 1); - for (int j = 0; j < num_items; ++j) { - encoder.Put(!!(j & 1), block); - } - encoder.Flush(); - - RleDecoder decoder(buffer.data(), encoder.len(), 1); - size_t count = num_items * block; - for (int j = 0; j < num_items; ++j) { - size_t run_length; - bool val = false; - DCHECK_GT(count, 0); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - run_length = std::min(run_length, count); - - ASSERT_EQ(!!(j & 1), val); - ASSERT_EQ(block, run_length); - count -= run_length; - } - DCHECK_EQ(count, 0); - } - } -} - -// Generate a random bit string which consists of 'num_runs' runs, -// each with a random length between 1 and 100. Returns the number -// of values encoded (i.e the sum run length). -static size_t GenerateRandomBitString(int num_runs, faststring* enc_buf, string* string_rep) { - RleEncoder enc(enc_buf, 1); - int num_bits = 0; - for (int i = 0; i < num_runs; i++) { - int run_length = random() % 100; - bool value = static_cast(i & 1); - enc.Put(value, run_length); - string_rep->append(run_length, value ? '1' : '0'); - num_bits += run_length; - } - enc.Flush(); - return num_bits; -} - -TEST_F(TestRle, TestRoundTripRandomSequencesWithRuns) { - srand(time(nullptr)); - - // Test the limiting function of GetNextRun. - const int kMaxToReadAtOnce = (random() % 20) + 1; - - // Generate a bunch of random bit sequences, and "round-trip" them - // through the encode/decode sequence. - for (int rep = 0; rep < 100; rep++) { - faststring buf; - string string_rep; - int num_bits = GenerateRandomBitString(10, &buf, &string_rep); - RleDecoder decoder(buf.data(), buf.size(), 1); - string roundtrip_str; - int rem_to_read = num_bits; - size_t run_len; - bool val; - while (rem_to_read > 0 && - (run_len = decoder.GetNextRun(&val, std::min(kMaxToReadAtOnce, rem_to_read))) != 0) { - ASSERT_LE(run_len, kMaxToReadAtOnce); - roundtrip_str.append(run_len, val ? 
'1' : '0'); - rem_to_read -= run_len; - } - - ASSERT_EQ(string_rep, roundtrip_str); - } -} -TEST_F(TestRle, TestSkip) { - faststring buffer(1); - RleEncoder encoder(&buffer, 1); - - // 0101010[1] 01010101 01 - // "A" - for (int j = 0; j < 18; ++j) { - encoder.Put(!!(j & 1)); - } - - // 0011[00] 11001100 11001100 11001100 11001100 - // "B" - for (int j = 0; j < 19; ++j) { - encoder.Put(!!(j & 1), 2); - } - - // 000000000000 11[1111111111] 000000000000 111111111111 - // "C" - // 000000000000 111111111111 0[00000000000] 111111111111 - // "D" - // 000000000000 111111111111 000000000000 111111111111 - for (int j = 0; j < 12; ++j) { - encoder.Put(!!(j & 1), 12); - } - encoder.Flush(); - - bool val = false; - size_t run_length; - RleDecoder decoder(buffer.data(), encoder.len(), 1); - - // position before "A" - ASSERT_EQ(3, decoder.Skip(7)); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_TRUE(val); - ASSERT_EQ(1, run_length); - - // position before "B" - ASSERT_EQ(7, decoder.Skip(14)); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_FALSE(val); - ASSERT_EQ(2, run_length); - - // position before "C" - ASSERT_EQ(18, decoder.Skip(46)); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_TRUE(val); - ASSERT_EQ(10, run_length); - - // position before "D" - ASSERT_EQ(24, decoder.Skip(49)); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_FALSE(val); - ASSERT_EQ(11, run_length); - - encoder.Flush(); -} - -} // namespace doris - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +// Must come before gtest.h. +#include +#include +#include + +#include "util/bit_stream_utils.h" +#include "util/bit_stream_utils.inline.h" +#include "util/bit_util.h" +#include "util/faststring.h" +#include "util/rle_encoding.h" +#include "util/debug_util.h" + +using std::string; +using std::vector; + +namespace doris { + +const int kMaxWidth = 64; + +class TestRle : public testing::Test {}; +// Validates encoding of values by encoding and decoding them. If +// expected_encoding != NULL, also validates that the encoded buffer is +// exactly 'expected_encoding'. +// if expected_len is not -1, it will validate the encoded size is correct. 
+template +void ValidateRle(const vector& values, int bit_width, + uint8_t* expected_encoding, int expected_len) { + faststring buffer; + RleEncoder encoder(&buffer, bit_width); + + for (const auto& value : values) { + encoder.Put(value); + } + int encoded_len = encoder.Flush(); + + if (expected_len != -1) { + EXPECT_EQ(encoded_len, expected_len); + } + if (expected_encoding != nullptr) { + EXPECT_EQ(memcmp(buffer.data(), expected_encoding, expected_len), 0) + << "\n" + << "Expected: " << hexdump((const char*)expected_encoding, expected_len) << "\n" + << "Got: " << hexdump((const char*)buffer.data(), buffer.size()); + } + + // Verify read + RleDecoder decoder(buffer.data(), encoded_len, bit_width); + for (const auto& value : values) { + T val = 0; + bool result = decoder.Get(&val); + EXPECT_TRUE(result); + EXPECT_EQ(value, val); + } +} + +TEST(Rle, SpecificSequences) { + const int kTestLen = 1024; + uint8_t expected_buffer[kTestLen]; + vector values; + + // Test 50 0' followed by 50 1's + values.resize(100); + for (int i = 0; i < 50; ++i) { + values[i] = 0; + } + for (int i = 50; i < 100; ++i) { + values[i] = 1; + } + + // expected_buffer valid for bit width <= 1 byte + expected_buffer[0] = (50 << 1); + expected_buffer[1] = 0; + expected_buffer[2] = (50 << 1); + expected_buffer[3] = 1; + for (int width = 1; width <= 8; ++width) { + ValidateRle(values, width, expected_buffer, 4); + } + + for (int width = 9; width <= kMaxWidth; ++width) { + ValidateRle(values, width, nullptr, 2 * (1 + BitUtil::Ceil(width, 8))); + } + + // Test 100 0's and 1's alternating + for (int i = 0; i < 100; ++i) { + values[i] = i % 2; + } + int num_groups = BitUtil::Ceil(100, 8); + expected_buffer[0] = (num_groups << 1) | 1; + for (int i = 0; i < 100/8; ++i) { + expected_buffer[i + 1] = BOOST_BINARY(1 0 1 0 1 0 1 0); // 0xaa + } + // Values for the last 4 0 and 1's + expected_buffer[1 + 100/8] = BOOST_BINARY(0 0 0 0 1 0 1 0); // 0x0a + + // num_groups and expected_buffer only valid for bit 
width = 1 + ValidateRle(values, 1, expected_buffer, 1 + num_groups); + for (int width = 2; width <= kMaxWidth; ++width) { + ValidateRle(values, width, nullptr, 1 + BitUtil::Ceil(width * 100, 8)); + } +} + +// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value +// is used, otherwise alternating values are used. +void TestRleValues(int bit_width, int num_vals, int value = -1) { + const uint64_t mod = bit_width == 64 ? 1ULL : 1ULL << bit_width; + vector values; + for (uint64_t v = 0; v < num_vals; ++v) { + values.push_back((value != -1) ? value : (bit_width == 64 ? v : (v % mod))); + } + ValidateRle(values, bit_width, nullptr, -1); +} + +TEST(Rle, TestValues) { + for (int width = 1; width <= kMaxWidth; ++width) { + TestRleValues(width, 1); + TestRleValues(width, 1024); + TestRleValues(width, 1024, 0); + TestRleValues(width, 1024, 1); + } +} + +class BitRle : public testing::Test { +public: + BitRle() { + } + + virtual ~BitRle() { + } +}; + +// Tests all true/false values +TEST_F(BitRle, AllSame) { + const int kTestLen = 1024; + vector values; + + for (int v = 0; v < 2; ++v) { + values.clear(); + for (int i = 0; i < kTestLen; ++i) { + values.push_back(v ? true : false); + } + + ValidateRle(values, 1, nullptr, 3); + } +} + +// Test that writes out a repeated group and then a literal +// group but flush before finishing. +TEST_F(BitRle, Flush) { + vector values; + for (int i = 0; i < 16; ++i) values.push_back(1); + values.push_back(false); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); +} + +// Test some random bool sequences. 
+TEST_F(BitRle, RandomBools) { + int iters = 0; + const int n_iters = 20; + while (iters < n_iters) { + srand(iters++); + if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; + vector values; + bool parity = 0; + for (int i = 0; i < 1000; ++i) { + int group_size = rand() % 20 + 1; // NOLINT(*) + if (group_size > 16) { + group_size = 1; + } + for (int i = 0; i < group_size; ++i) { + values.push_back(parity); + } + parity = !parity; + } + ValidateRle(values, (iters % kMaxWidth) + 1, nullptr, -1); + } +} + +// Test some random 64-bit sequences. +TEST_F(BitRle, Random64Bit) { + int iters = 0; + const int n_iters = 20; + while (iters < n_iters) { + srand(iters++); + if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; + vector values; + for (int i = 0; i < 1000; ++i) { + int group_size = rand() % 20 + 1; // NOLINT(*) + uint64_t cur_value = (static_cast(rand()) << 32) + static_cast(rand()); + if (group_size > 16) { + group_size = 1; + } + for (int i = 0; i < group_size; ++i) { + values.push_back(cur_value); + } + + } + ValidateRle(values, 64, nullptr, -1); + } +} + +// Test a sequence of 1 0's, 2 1's, 3 0's. etc +// e.g. 
011000111100000 +TEST_F(BitRle, RepeatedPattern) { + vector values; + const int min_run = 1; + const int max_run = 32; + + for (int i = min_run; i <= max_run; ++i) { + int v = i % 2; + for (int j = 0; j < i; ++j) { + values.push_back(v); + } + } + + // And go back down again + for (int i = max_run; i >= min_run; --i) { + int v = i % 2; + for (int j = 0; j < i; ++j) { + values.push_back(v); + } + } + + ValidateRle(values, 1, nullptr, -1); +} + +TEST_F(TestRle, TestBulkPut) { + size_t run_length; + bool val = false; + + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + encoder.Put(true, 10); + encoder.Put(false, 7); + encoder.Put(true, 5); + encoder.Put(true, 15); + encoder.Flush(); + + RleDecoder decoder(buffer.data(), encoder.len(), 1); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_TRUE(val); + ASSERT_EQ(10, run_length); + + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_FALSE(val); + ASSERT_EQ(7, run_length); + + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_TRUE(val); + ASSERT_EQ(20, run_length); + + ASSERT_EQ(0, decoder.GetNextRun(&val, std::numeric_limits::max())); +} + +TEST_F(TestRle, TestGetNextRun) { + // Repeat the test with different number of items + for (int num_items = 7; num_items < 200; num_items += 13) { + // Test different block patterns + // 1: 01010101 01010101 + // 2: 00110011 00110011 + // 3: 00011100 01110001 + // ... 
+ for (int block = 1; block <= 20; ++block) { + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + for (int j = 0; j < num_items; ++j) { + encoder.Put(!!(j & 1), block); + } + encoder.Flush(); + + RleDecoder decoder(buffer.data(), encoder.len(), 1); + size_t count = num_items * block; + for (int j = 0; j < num_items; ++j) { + size_t run_length; + bool val = false; + DCHECK_GT(count, 0); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + run_length = std::min(run_length, count); + + ASSERT_EQ(!!(j & 1), val); + ASSERT_EQ(block, run_length); + count -= run_length; + } + DCHECK_EQ(count, 0); + } + } +} + +// Generate a random bit string which consists of 'num_runs' runs, +// each with a random length between 1 and 100. Returns the number +// of values encoded (i.e the sum run length). +static size_t GenerateRandomBitString(int num_runs, faststring* enc_buf, string* string_rep) { + RleEncoder enc(enc_buf, 1); + int num_bits = 0; + for (int i = 0; i < num_runs; i++) { + int run_length = random() % 100; + bool value = static_cast(i & 1); + enc.Put(value, run_length); + string_rep->append(run_length, value ? '1' : '0'); + num_bits += run_length; + } + enc.Flush(); + return num_bits; +} + +TEST_F(TestRle, TestRoundTripRandomSequencesWithRuns) { + srand(time(nullptr)); + + // Test the limiting function of GetNextRun. + const int kMaxToReadAtOnce = (random() % 20) + 1; + + // Generate a bunch of random bit sequences, and "round-trip" them + // through the encode/decode sequence. + for (int rep = 0; rep < 100; rep++) { + faststring buf; + string string_rep; + int num_bits = GenerateRandomBitString(10, &buf, &string_rep); + RleDecoder decoder(buf.data(), buf.size(), 1); + string roundtrip_str; + int rem_to_read = num_bits; + size_t run_len; + bool val; + while (rem_to_read > 0 && + (run_len = decoder.GetNextRun(&val, std::min(kMaxToReadAtOnce, rem_to_read))) != 0) { + ASSERT_LE(run_len, kMaxToReadAtOnce); + roundtrip_str.append(run_len, val ? 
'1' : '0'); + rem_to_read -= run_len; + } + + ASSERT_EQ(string_rep, roundtrip_str); + } +} +TEST_F(TestRle, TestSkip) { + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + + // 0101010[1] 01010101 01 + // "A" + for (int j = 0; j < 18; ++j) { + encoder.Put(!!(j & 1)); + } + + // 0011[00] 11001100 11001100 11001100 11001100 + // "B" + for (int j = 0; j < 19; ++j) { + encoder.Put(!!(j & 1), 2); + } + + // 000000000000 11[1111111111] 000000000000 111111111111 + // "C" + // 000000000000 111111111111 0[00000000000] 111111111111 + // "D" + // 000000000000 111111111111 000000000000 111111111111 + for (int j = 0; j < 12; ++j) { + encoder.Put(!!(j & 1), 12); + } + encoder.Flush(); + + bool val = false; + size_t run_length; + RleDecoder decoder(buffer.data(), encoder.len(), 1); + + // position before "A" + ASSERT_EQ(3, decoder.Skip(7)); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_TRUE(val); + ASSERT_EQ(1, run_length); + + // position before "B" + ASSERT_EQ(7, decoder.Skip(14)); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_FALSE(val); + ASSERT_EQ(2, run_length); + + // position before "C" + ASSERT_EQ(18, decoder.Skip(46)); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_TRUE(val); + ASSERT_EQ(10, run_length); + + // position before "D" + ASSERT_EQ(24, decoder.Skip(49)); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_FALSE(val); + ASSERT_EQ(11, run_length); + + encoder.Flush(); +} + +} // namespace doris + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/docs/documentation/cn/community/subscribe-mail-list.md b/docs/documentation/cn/community/subscribe-mail-list.md index dbd23c074f0859..9f56cd0611f83d 100644 --- a/docs/documentation/cn/community/subscribe-mail-list.md +++ b/docs/documentation/cn/community/subscribe-mail-list.md @@ -1,43 +1,43 @@ -# 订阅邮件列表 - +# 订阅邮件列表 + 
邮件列表(Mail List)是 Apache 社区最被认可的交流方式。一般来说,开源社区的提问与解答、技术讨论、事务决策等都通过邮件列表来承载。邮件列表异步、广播的特性,也非常适合开源社区的沟通交流。那么,如何订阅 Apache Doris (incubating) 的邮件列表呢?主要包括以下五个步骤。 - -## 1. 发送订阅邮件 - -打开自己的邮箱,新建邮件,向`dev-subscribe@doris.apache.org`发送一封邮件(邮件主题和内容任意) - -![step1](../../../resources/images/subscribe-mail-list-step1.png) - -## 2. 接收来自 dev-help@doris.apache.org 的确认邮件 - -执行完第一步之后,您将收到一封来自`dev-help@doris.apache.org`的确认邮件,邮件内容如下图所示。(**如果长时间未能收到,请确认该邮件是否已被拦截,或已经被自动归入“订阅邮件”、“垃圾邮件”、“推广邮件”等文件夹**) + +## 1. 发送订阅邮件 + +打开自己的邮箱,新建邮件,向`dev-subscribe@doris.apache.org`发送一封邮件(邮件主题和内容任意) + +![step1](../../../resources/images/subscribe-mail-list-step1.png) + +## 2. 接收来自 dev-help@doris.apache.org 的确认邮件 + +执行完第一步之后,您将收到一封来自`dev-help@doris.apache.org`的确认邮件,邮件内容如下图所示。(**如果长时间未能收到,请确认该邮件是否已被拦截,或已经被自动归入“订阅邮件”、“垃圾邮件”、“推广邮件”等文件夹**) ![step2](../../../resources/images/subscribe-mail-list-step2.png) - -## 3. 回复确认邮件 - -​针对上一步接收到的邮件, - -​**a.直接回复该邮件** - -​***或*** - -**b. 新建一封`收件人`为上一步中的`回复地址`的邮件** - -​均可,内容主题不限 - -![step3](../../../resources/images/subscribe-mail-list-step3.png) - - -## 4. 接收欢迎邮件 - -​完成第三步之后,将会受到一封标题为**WELCOME to dev@doris.apache.org**的欢迎邮件。至此,订阅邮件列表的工作已经完成了,社区的动态都会通过邮件的方式通知您。 - -![step4](../../../resources/images/subscribe-mail-list-step4.png) - - -## 5. 发起邮件讨论(可选) - + +## 3. 回复确认邮件 + +​针对上一步接收到的邮件, + +​**a.直接回复该邮件** + +​***或*** + +**b. 新建一封`收件人`为上一步中的`回复地址`的邮件** + +​均可,内容主题不限 + +![step3](../../../resources/images/subscribe-mail-list-step3.png) + + +## 4. 接收欢迎邮件 + +​完成第三步之后,将会受到一封标题为**WELCOME to dev@doris.apache.org**的欢迎邮件。至此,订阅邮件列表的工作已经完成了,社区的动态都会通过邮件的方式通知您。 + +![step4](../../../resources/images/subscribe-mail-list-step4.png) + + +## 5. 
发起邮件讨论(可选) + ​成功订阅邮件列表后,若想发起讨论,直接往`dev@doris.apache.org`发送邮件即可。所有订阅了邮件列表的人都会收到邮件。 ​ ​ \ No newline at end of file diff --git a/docs/documentation/cn/internal/doris_storage_optimization.md b/docs/documentation/cn/internal/doris_storage_optimization.md index 51661cefd4969b..ab34b206aa97d5 100644 --- a/docs/documentation/cn/internal/doris_storage_optimization.md +++ b/docs/documentation/cn/internal/doris_storage_optimization.md @@ -1,206 +1,206 @@ -# Doris存储文件格式优化 # - -## 文件格式 ## - -![](../../../resources/images/segment_v2.png) -
图1. doris segment文件格式
- -文件包括: -- 文件开始是8个字节的magic code,用于识别文件格式和版本 -- Data Region:用于存储各个列的数据信息,这里的数据是按需分page加载的 -- Index Region: doris中将各个列的index数据统一存储在Index Region,这里的数据会按照列粒度进行加载,所以跟列的数据信息分开存储 -- Footer信息 - - FileFooterPB:定义文件的元数据信息 - - 4个字节的footer pb内容的checksum - - 4个字节的FileFooterPB消息长度,用于读取FileFooterPB - - 8个字节的MAGIC CODE,之所以在末位存储,是方便不同的场景进行文件类型的识别 - -文件中的数据按照page的方式进行组织,page是编码和压缩的基本单位。现在的page类型包括以下几种: - -### DataPage ### - -DataPage分为两种:nullable和non-nullable的data page。 - -nullable的data page内容包括: -``` - - +----------------+ - | value count | - |----------------| - | first row id | - |----------------| - | bitmap length | - |----------------| - | null bitmap | - |----------------| - | data | - |----------------| - | checksum | - +----------------+ -``` - -non-nullable data page结构如下: - -``` - |----------------| - | value count | - |----------------| - | first row id | - |----------------| - | data | - |----------------| - | checksum | - +----------------+ -``` - -其中各个字段含义如下: - -- value count - - 表示page中的行数 -- first row id - - page中第一行的行号 -- bitmap length - - 表示接下来bitmap的字节数 -- null bitmap - - 表示null信息的bitmap -- data - - 存储经过encoding和compress之后的数据 - - 需要在数据的头部信息中写入:is_compressed - - 各种不同编码的data需要在头部信息写入一些字段信息,以实现数据的解析 - - TODO:添加各种encoding的header信息 -- checksum - - 存储page粒度的校验和,包括page的header和之后的实际数据 - - -### Bloom Filter Pages ### - -针对每个bloom filter列,会在page的粒度相应的生成一个bloom filter的page,保存在bloom filter pages区域 - -### Ordinal Index Page ### - -针对每个列,都会按照page粒度,建立行号的稀疏索引。内容为这个page的起始行的行号到这个block的指针(包括offset和length) - -### Short Key Index page ### - -我们会每隔N行(可配置)生成一个short key的稀疏索引,索引的内容为:short key->行号(ordinal) - -### Column的其他索引 ### - -该格式设计支持后续扩展其他的索引信息,比如bitmap索引,spatial索引等等,只需要将需要的数据写到现有的列数据后面,并且添加对应的元数据字段到FileFooterPB中 - -### 元数据定义 ### -FileFooterPB的定义为: - -``` -message ColumnPB { - optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 - optional string type = 2; // 列类型 - optional string aggregation = 3; // 是否聚合 - optional uint32 length = 4; // 长度 - optional bool 
is_key = 5; // 是否是主键列 - optional string default_value = 6; // 默认值 - optional uint32 precision = 9 [default = 27]; // 精度 - optional uint32 frac = 10 [default = 9]; - optional bool is_nullable = 11 [default=false]; // 是否有null - optional bool is_bf_column = 15 [default=false]; // 是否有bf词典 - optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引 -} - -// page偏移 -message PagePointerPB { - required uint64 offset; // page在文件中的偏移 - required uint32 length; // page的大小 -} - -message MetadataPairPB { - optional string key = 1; - optional bytes value = 2; -} - -message ColumnMetaPB { - optional ColumnMessage encoding; // 编码方式 - - optional PagePointerPB dict_page // 词典page - repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息 - optional PagePointerPB ordinal_index_page; // 行号索引数据 - optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据 - - optional PagePointerPB bitmap_index_page; // bitmap索引数据 - - optional uint64 data_footprint; // 列中索引的大小 - optional uint64 index_footprint; // 列中数据的大小 - optional uint64 raw_data_footprint; // 原始列数据大小 - - optional CompressKind compress_kind; // 列的压缩方式 - - optional ZoneMapPB column_zone_map; //文件级别的过滤条件 - repeated MetadataPairPB column_meta_datas; -} - -message FileFooterPB { - optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用 - repeated ColumnPB schema = 5; // 列Schema - optional uint64 num_values = 4; // 文件中保存的行数 - optional uint64 index_footprint = 7; // 索引大小 - optional uint64 data_footprint = 8; // 数据大小 - optional uint64 raw_data_footprint = 8; // 原始数据大小 - - optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式 - repeated ColumnMetaPB column_metas = 10; // 列元数据 - optional PagePointerPB key_index_page; // short key索引page -} - -``` - -## 读写逻辑 ## - -### 写入 ### - -大体的写入流程如下: -1. 写入magic -2. 根据schema信息,生成对应的ColumnWriter,每个ColumnWriter按照不同的类型,获取对应的encoding信息(可配置),根据encoding,生成对应的encoder -3. 
调用encoder->add(value)进行数据写入,每个K行,生成一个short key index entry,并且,如果当前的page满足一定条件(大小超过1M或者行数为K),就生成一个新的page,缓存在内存中。 -4. 不断的循环步骤3,直到数据写入完成。将各个列的数据依序刷入文件中 -5. 生成FileFooterPB信息,写入文件中。 - -相关的问题: - -- short key的索引如何生成? - - 现在还是按照每隔多少行生成一个short key的稀疏索引,保持每隔1024行生成一个short的稀疏索引,具体的内容是:short key -> ordinal - -- ordinal索引里面应该存什么? - - 存储page的第一个ordinal到page pointer的映射信息 -- 不同encoding类型的page里存什么? - - 词典压缩 - - plain - - rle - - bshuf - -### 读取 ### - -1. 读取文件的magic,判断文件类型和版本 -2. 读取FileFooterPB,进行checksum校验 -3. 按照需要的列,读取short key索引和对应列的数据ordinal索引信息 -4. 使用start key和end key,通过short key索引定位到要读取的行号,然后通过ordinal索引确定需要读取的row ranges, 同时需要通过统计信息、bitmap索引等过滤需要读取的row ranges -5. 然后按照row ranges通过ordinal索引读取行的数据 - -相关的问题: -1. 如何实现在page内部快速的定位到某一行? - - page内部是的数据是经过encoding的,无法快速进行行级数据的定位。不同的encoding方式,在内部进行快速的行号定位的方案不一样,需要具体分析: - - 如果是rle编码的,需要通过解析rle的header进行skip,直到到达包含该行的那个rle块之后,再进行反解。 - - binary plain encoding:会在page的中存储offset信息,并且会在page header中指定offset信息的offset,读取的时候会先解析offset信息到数组中,这样子就可以通过各个行的offset数据信息快速的定位block某一行的数据 -2. 如何实现块的高效读取?可以考虑将相邻的块在读取的时候进行merge,一次性读取? - 这个需要在读取的时候,判断block是否连续,如果连续,就一次性的读取 - -## 编码 ## - -现有的doris存储中,针对string类型的编码,采用plain encoding的方式,效率比较低。经过对比,发现在百度统计的场景下,数据会因为string类型的编码膨胀超过一倍。所以,计划引入基于词典的编码压缩。 - -## 压缩 ## - -实现可扩展的压缩框架,支持多种压缩算法,方便后续添加新的压缩算法,计划引入zstd压缩。 - -## TODO ## -1. 如何实现嵌套类型?如何在嵌套类型中进行行号定位? -2. 如何优化现在的ScanRange拆分导致的下游bitmap、column statistic统计等进行多次? +# Doris存储文件格式优化 # + +## 文件格式 ## + +![](../../../resources/images/segment_v2.png) +
图1. doris segment文件格式
+ +文件包括: +- 文件开始是8个字节的magic code,用于识别文件格式和版本 +- Data Region:用于存储各个列的数据信息,这里的数据是按需分page加载的 +- Index Region: doris中将各个列的index数据统一存储在Index Region,这里的数据会按照列粒度进行加载,所以跟列的数据信息分开存储 +- Footer信息 + - FileFooterPB:定义文件的元数据信息 + - 4个字节的footer pb内容的checksum + - 4个字节的FileFooterPB消息长度,用于读取FileFooterPB + - 8个字节的MAGIC CODE,之所以在末位存储,是方便不同的场景进行文件类型的识别 + +文件中的数据按照page的方式进行组织,page是编码和压缩的基本单位。现在的page类型包括以下几种: + +### DataPage ### + +DataPage分为两种:nullable和non-nullable的data page。 + +nullable的data page内容包括: +``` + + +----------------+ + | value count | + |----------------| + | first row id | + |----------------| + | bitmap length | + |----------------| + | null bitmap | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ +``` + +non-nullable data page结构如下: + +``` + |----------------| + | value count | + |----------------| + | first row id | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ +``` + +其中各个字段含义如下: + +- value count + - 表示page中的行数 +- first row id + - page中第一行的行号 +- bitmap length + - 表示接下来bitmap的字节数 +- null bitmap + - 表示null信息的bitmap +- data + - 存储经过encoding和compress之后的数据 + - 需要在数据的头部信息中写入:is_compressed + - 各种不同编码的data需要在头部信息写入一些字段信息,以实现数据的解析 + - TODO:添加各种encoding的header信息 +- checksum + - 存储page粒度的校验和,包括page的header和之后的实际数据 + + +### Bloom Filter Pages ### + +针对每个bloom filter列,会在page的粒度相应的生成一个bloom filter的page,保存在bloom filter pages区域 + +### Ordinal Index Page ### + +针对每个列,都会按照page粒度,建立行号的稀疏索引。内容为这个page的起始行的行号到这个block的指针(包括offset和length) + +### Short Key Index page ### + +我们会每隔N行(可配置)生成一个short key的稀疏索引,索引的内容为:short key->行号(ordinal) + +### Column的其他索引 ### + +该格式设计支持后续扩展其他的索引信息,比如bitmap索引,spatial索引等等,只需要将需要的数据写到现有的列数据后面,并且添加对应的元数据字段到FileFooterPB中 + +### 元数据定义 ### +FileFooterPB的定义为: + +``` +message ColumnPB { + optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 + optional string type = 2; // 列类型 + optional string aggregation = 3; // 是否聚合 + optional uint32 length = 4; // 长度 + optional bool 
is_key = 5; // 是否是主键列 + optional string default_value = 6; // 默认值 + optional uint32 precision = 9 [default = 27]; // 精度 + optional uint32 frac = 10 [default = 9]; + optional bool is_nullable = 11 [default=false]; // 是否有null + optional bool is_bf_column = 15 [default=false]; // 是否有bf词典 + optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引 +} + +// page偏移 +message PagePointerPB { + required uint64 offset; // page在文件中的偏移 + required uint32 length; // page的大小 +} + +message MetadataPairPB { + optional string key = 1; + optional bytes value = 2; +} + +message ColumnMetaPB { + optional ColumnMessage encoding; // 编码方式 + + optional PagePointerPB dict_page // 词典page + repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息 + optional PagePointerPB ordinal_index_page; // 行号索引数据 + optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据 + + optional PagePointerPB bitmap_index_page; // bitmap索引数据 + + optional uint64 data_footprint; // 列中索引的大小 + optional uint64 index_footprint; // 列中数据的大小 + optional uint64 raw_data_footprint; // 原始列数据大小 + + optional CompressKind compress_kind; // 列的压缩方式 + + optional ZoneMapPB column_zone_map; //文件级别的过滤条件 + repeated MetadataPairPB column_meta_datas; +} + +message FileFooterPB { + optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用 + repeated ColumnPB schema = 5; // 列Schema + optional uint64 num_values = 4; // 文件中保存的行数 + optional uint64 index_footprint = 7; // 索引大小 + optional uint64 data_footprint = 8; // 数据大小 + optional uint64 raw_data_footprint = 8; // 原始数据大小 + + optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式 + repeated ColumnMetaPB column_metas = 10; // 列元数据 + optional PagePointerPB key_index_page; // short key索引page +} + +``` + +## 读写逻辑 ## + +### 写入 ### + +大体的写入流程如下: +1. 写入magic +2. 根据schema信息,生成对应的ColumnWriter,每个ColumnWriter按照不同的类型,获取对应的encoding信息(可配置),根据encoding,生成对应的encoder +3. 
调用encoder->add(value)进行数据写入,每个K行,生成一个short key index entry,并且,如果当前的page满足一定条件(大小超过1M或者行数为K),就生成一个新的page,缓存在内存中。 +4. 不断的循环步骤3,直到数据写入完成。将各个列的数据依序刷入文件中 +5. 生成FileFooterPB信息,写入文件中。 + +相关的问题: + +- short key的索引如何生成? + - 现在还是按照每隔多少行生成一个short key的稀疏索引,保持每隔1024行生成一个short的稀疏索引,具体的内容是:short key -> ordinal + +- ordinal索引里面应该存什么? + - 存储page的第一个ordinal到page pointer的映射信息 +- 不同encoding类型的page里存什么? + - 词典压缩 + - plain + - rle + - bshuf + +### 读取 ### + +1. 读取文件的magic,判断文件类型和版本 +2. 读取FileFooterPB,进行checksum校验 +3. 按照需要的列,读取short key索引和对应列的数据ordinal索引信息 +4. 使用start key和end key,通过short key索引定位到要读取的行号,然后通过ordinal索引确定需要读取的row ranges, 同时需要通过统计信息、bitmap索引等过滤需要读取的row ranges +5. 然后按照row ranges通过ordinal索引读取行的数据 + +相关的问题: +1. 如何实现在page内部快速的定位到某一行? + + page内部是的数据是经过encoding的,无法快速进行行级数据的定位。不同的encoding方式,在内部进行快速的行号定位的方案不一样,需要具体分析: + - 如果是rle编码的,需要通过解析rle的header进行skip,直到到达包含该行的那个rle块之后,再进行反解。 + - binary plain encoding:会在page的中存储offset信息,并且会在page header中指定offset信息的offset,读取的时候会先解析offset信息到数组中,这样子就可以通过各个行的offset数据信息快速的定位block某一行的数据 +2. 如何实现块的高效读取?可以考虑将相邻的块在读取的时候进行merge,一次性读取? + 这个需要在读取的时候,判断block是否连续,如果连续,就一次性的读取 + +## 编码 ## + +现有的doris存储中,针对string类型的编码,采用plain encoding的方式,效率比较低。经过对比,发现在百度统计的场景下,数据会因为string类型的编码膨胀超过一倍。所以,计划引入基于词典的编码压缩。 + +## 压缩 ## + +实现可扩展的压缩框架,支持多种压缩算法,方便后续添加新的压缩算法,计划引入zstd压缩。 + +## TODO ## +1. 如何实现嵌套类型?如何在嵌套类型中进行行号定位? +2. 如何优化现在的ScanRange拆分导致的下游bitmap、column statistic统计等进行多次? diff --git a/docs/documentation/en/internal/doris_storage_optimization_EN.md b/docs/documentation/en/internal/doris_storage_optimization_EN.md index ef7721e8dcfe63..0376aa0631e127 100644 --- a/docs/documentation/en/internal/doris_storage_optimization_EN.md +++ b/docs/documentation/en/internal/doris_storage_optimization_EN.md @@ -22,35 +22,35 @@ The data in the file is organized in the form of page, which is the basic unit o Data Page is divided into two types: nullable and non-nullable data pages. 
Nullable's data page includes: -``` - - +----------------+ - | value count | - |----------------| - | first row id | - |----------------| - | bitmap length | - |----------------| - | null bitmap | - |----------------| - | data | - |----------------| - | checksum | - +----------------+ +``` + + +----------------+ + | value count | + |----------------| + | first row id | + |----------------| + | bitmap length | + |----------------| + | null bitmap | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ ``` non -zero data page32467;- 26500;- 229140;- -``` - |----------------| - | value count | - |----------------| - | first row id | - |----------------| - | data | - |----------------| - | checksum | - +----------------+ +``` + |----------------| + | value count | + |----------------| + | first row id | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ ``` The meanings of each field are as follows: @@ -91,65 +91,65 @@ The format design supports the subsequent expansion of other index information, ### Metadata Definition### FileFooterPB is defined as: -``` -message ColumnPB { - optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 - optional string type = 2; // 列类型 - optional string aggregation = 3; // 是否聚合 - optional uint32 length = 4; // 长度 - optional bool is_key = 5; // 是否是主键列 - optional string default_value = 6; // 默认值 - optional uint32 precision = 9 [default = 27]; // 精度 - optional uint32 frac = 10 [default = 9]; - optional bool is_nullable = 11 [default=false]; // 是否有null - optional bool is_bf_column = 15 [default=false]; // 是否有bf词典 - optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引 -} - -// page偏移 -message PagePointerPB { - required uint64 offset; // page在文件中的偏移 - required uint32 length; // page的大小 -} - -message MetadataPairPB { - optional string key = 1; - optional bytes value = 2; -} - -message ColumnMetaPB { - optional ColumnMessage encoding; // 
编码方式 - - optional PagePointerPB dict_page // 词典page - repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息 - optional PagePointerPB ordinal_index_page; // 行号索引数据 - optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据 - - optional PagePointerPB bitmap_index_page; // bitmap索引数据 - - optional uint64 data_footprint; // 列中索引的大小 - optional uint64 index_footprint; // 列中数据的大小 - optional uint64 raw_data_footprint; // 原始列数据大小 - - optional CompressKind compress_kind; // 列的压缩方式 - - optional ZoneMapPB column_zone_map; //文件级别的过滤条件 - repeated MetadataPairPB column_meta_datas; -} - -message FileFooterPB { - optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用 - repeated ColumnPB schema = 5; // 列Schema - optional uint64 num_values = 4; // 文件中保存的行数 - optional uint64 index_footprint = 7; // 索引大小 - optional uint64 data_footprint = 8; // 数据大小 - optional uint64 raw_data_footprint = 8; // 原始数据大小 - - optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式 - repeated ColumnMetaPB column_metas = 10; // 列元数据 - optional PagePointerPB key_index_page; // short key索引page -} - +``` +message ColumnPB { + optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 + optional string type = 2; // 列类型 + optional string aggregation = 3; // 是否聚合 + optional uint32 length = 4; // 长度 + optional bool is_key = 5; // 是否是主键列 + optional string default_value = 6; // 默认值 + optional uint32 precision = 9 [default = 27]; // 精度 + optional uint32 frac = 10 [default = 9]; + optional bool is_nullable = 11 [default=false]; // 是否有null + optional bool is_bf_column = 15 [default=false]; // 是否有bf词典 + optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引 +} + +// page偏移 +message PagePointerPB { + required uint64 offset; // page在文件中的偏移 + required uint32 length; // page的大小 +} + +message MetadataPairPB { + optional string key = 1; + optional bytes value = 2; +} + +message ColumnMetaPB { + optional ColumnMessage encoding; // 编码方式 + + optional PagePointerPB 
dict_page // 词典page + repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息 + optional PagePointerPB ordinal_index_page; // 行号索引数据 + optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据 + + optional PagePointerPB bitmap_index_page; // bitmap索引数据 + + optional uint64 data_footprint; // 列中索引的大小 + optional uint64 index_footprint; // 列中数据的大小 + optional uint64 raw_data_footprint; // 原始列数据大小 + + optional CompressKind compress_kind; // 列的压缩方式 + + optional ZoneMapPB column_zone_map; //文件级别的过滤条件 + repeated MetadataPairPB column_meta_datas; +} + +message FileFooterPB { + optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用 + repeated ColumnPB schema = 5; // 列Schema + optional uint64 num_values = 4; // 文件中保存的行数 + optional uint64 index_footprint = 7; // 索引大小 + optional uint64 data_footprint = 8; // 数据大小 + optional uint64 raw_data_footprint = 8; // 原始数据大小 + + optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式 + repeated ColumnMetaPB column_metas = 10; // 列元数据 + optional PagePointerPB key_index_page; // short key索引page +} + ``` ## Read-write logic##