Skip to content

Commit

Permalink
[fix](invert index) supports utf8 and non-utf8 strings (#22570)
Browse files Browse the repository at this point in the history
supports utf8 and non-utf8 strings: [fix] compatible with utf8 and invalid utf8 doris-thirdparty#110
  • Loading branch information
zzzxl1993 authored Aug 5, 2023
1 parent 6fe0aa4 commit fe6bae2
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 5 deletions.
5 changes: 3 additions & 2 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include <CLucene/util/CLStreams.h>
#include <CLucene/util/FutureArrays.h>
#include <CLucene/util/bkd/bkd_docid_iterator.h>
#include <CLucene/util/stringUtil.h>
#include <math.h>
#include <string.h>

Expand Down Expand Up @@ -141,7 +142,7 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
if (token.termLength<char>() != 0) {
std::string_view term(token.termBuffer<char>(), token.termLength<char>());
std::wstring ws_term = lucene_utf8stows(term);
std::wstring ws_term = StringUtil::string_to_wstring(term);
analyse_result.emplace_back(ws_term);
}
} else {
Expand Down Expand Up @@ -443,7 +444,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
VLOG_DEBUG << "begin to query the inverted index from clucene"
<< ", column_name: " << column_name << ", search_str: " << search_str;
std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end());
std::wstring search_str_ws = lucene_utf8stows(search_str);
std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
// unique_ptr with custom deleter
std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
_CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()),
Expand Down
4 changes: 2 additions & 2 deletions docs/zh-CN/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ Doris倒排索引的功能简要介绍如下:
- chinese是中文分词,适合被索引列主要是中文的情况,性能比english分词低
- unicode是多语言混合类型分词,适用于中英文混合、多语言混合的情况。它能够对邮箱前缀和后缀、IP地址以及字符数字混合进行分词,并且可以对中文按字符分词。
- parser_mode用于指定分词的模式,目前parser = chinese时支持如下几种模式:
- fine_grained:细粒度模式,倾向于分出比较短的词,比如 '武汉长江大桥' 会分成 '武汉', '武汉市', '市长', '长江', '长江大桥', '大桥' 6个词
- coarse_grained:粗粒度模式,倾向于分出比较长的词,,比如 '武汉长江大桥' 会分成 '武汉市' '长江大桥' 2个词
- fine_grained:细粒度模式,倾向于分出比较短的词,比如 '武汉市长江大桥' 会分成 '武汉', '武汉市', '市长', '长江', '长江大桥', '大桥' 6个词
- coarse_grained:粗粒度模式,倾向于分出比较长的词,比如 '武汉市长江大桥' 会分成 '武汉市', '长江大桥' 2个词
- 默认coarse_grained
- support_phrase用于指定索引是否支持MATCH_PHRASE短语查询加速
- true为支持,但是索引需要更多的存储空间
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
1 330204195805121025

-- !sql --
2 36

-- !sql --
2 330225197806187713

-- !sql --
2 330227195911020791

-- !sql --
2 330224196312012744

-- !sql --
2 330205196003131214

-- !sql --
2 330224197301242119

-- !sql --
2 3302哈哈1645676

-- !sql --
2 330225196202011579

-- !sql --
2 33022719660610183x

-- !sql --
2 330225197801043198

-- !sql --
3 中国

-- !sql --
3 美国

-- !sql --
3 英国

-- !sql --
3 体育

-- !sql --
3 体育场

-- !sql --
3 中国人

-- !sql --
3 北京市

-- !sql --
3 我在北京市

-- !sql --
3 我在西安市

Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.


// Regression suite: MATCH queries against a text column that carries an
// inverted index declared without any parser property.
//
// The table is loaded with numeric-looking strings, one string mixing ASCII
// digits with Chinese characters, and pure Chinese strings. Each inserted value
// is then searched with `match`, and the result of every query is recorded
// under the `sql` tag for comparison with the suite's expected-output file.
suite("test_inverted_index_keyword"){
    def indexTblName = "test_inverted_index_keyword"

    // prepare test table
    sql "DROP TABLE IF EXISTS ${indexTblName}"
    // create 1 replica table
    sql """
        CREATE TABLE IF NOT EXISTS ${indexTblName}(
        `id`int(11)NULL,
        `c` text NULL,
        INDEX c_idx(`c`) USING INVERTED COMMENT ''
        ) ENGINE=OLAP
        DUPLICATE KEY(`id`)
        COMMENT 'OLAP'
        DISTRIBUTED BY HASH(`id`) BUCKETS 1
        PROPERTIES(
        "replication_allocation" = "tag.location.default: 1"
        );
    """

    // Dump the session variables into the log to help diagnose failures.
    def var_result = sql "show variables"
    logger.info("show variables result: " + var_result )

    // Test data: ASCII-only values (id 1 and 2), one mixed ASCII/Chinese value,
    // and Chinese-only values (id 3), some of which are prefixes or substrings
    // of others (e.g. '体育' / '体育场', '北京市' / '我在北京市').
    sql """INSERT INTO ${indexTblName} VALUES
        (1, '330204195805121025'),
        (2, '36'),
        (2, '330225197806187713'),
        (2, '330227195911020791'),
        (2, '330224196312012744'),
        (2, '330205196003131214'),
        (2, '330224197301242119'),
        (2, '3302哈哈1645676'),
        (2, '330225196202011579'),
        (2, '33022719660610183x'),
        (2, '330225197801043198'),
        (3, '中国'),
        (3, '美国'),
        (3, '英国'),
        (3, '体育'),
        (3, '体育场'),
        (3, '中国人'),
        (3, '北京市'),
        (3, '我在北京市'),
        (3, '我在西安市')
    """

    // One MATCH query per inserted value, in insertion order. The order matters:
    // results are recorded sequentially under the same `sql` tag.
    qt_sql "SELECT * FROM ${indexTblName} where c match '330204195805121025'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '36'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '330225197806187713'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '330227195911020791'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '330224196312012744'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '330205196003131214'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '330224197301242119'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '3302哈哈1645676'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '330225196202011579'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '33022719660610183x'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '330225197801043198'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '中国'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '美国'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '英国'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '体育'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '体育场'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '中国人'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '北京市'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '我在北京市'"
    qt_sql "SELECT * FROM ${indexTblName} where c match '我在西安市'"
}

0 comments on commit fe6bae2

Please sign in to comment.