Skip to content

Commit

Permalink
enhance: [2.4] Use MARISA_LABEL_ORDER when building trie index (#36060
Browse files Browse the repository at this point in the history
)

Cherry pick from master
pr: #36034

Related to #35941
Previous PR: #35943

This PR builds the `Trie` index using `MARISA_LABEL_ORDER`, which makes
predictive search iterate in lexicographic order.

When the trie index is built in label order, the lexicographic ordering can be
utilized to accelerate `Range` operations.

However, according to the official documentation, using `MARISA_LABEL_ORDER`
will make "exact match lookup, common prefix search, and predictive
search" slower.

---------

Signed-off-by: Congqi Xia <[email protected]>
  • Loading branch information
congqixia authored Sep 6, 2024
1 parent 64e109d commit aeb576e
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 22 deletions.
122 changes: 100 additions & 22 deletions internal/core/src/index/StringIndexMarisa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "index/StringIndexMarisa.h"
#include "index/Utils.h"
#include "index/Index.h"
#include "marisa/base.h"
#include "storage/Util.h"
#include "storage/space.h"

Expand Down Expand Up @@ -151,7 +152,7 @@ StringIndexMarisa::BuildWithFieldData(
}
total_num_rows += slice_num;
}
trie_.build(keyset);
trie_.build(keyset, MARISA_LABEL_ORDER);

// fill str_ids_
str_ids_.resize(total_num_rows);
Expand Down Expand Up @@ -186,7 +187,7 @@ StringIndexMarisa::Build(size_t n, const std::string* values) {
}
}

trie_.build(keyset);
trie_.build(keyset, MARISA_LABEL_ORDER);
fill_str_ids(n, values);
fill_offsets();

Expand Down Expand Up @@ -393,44 +394,103 @@ StringIndexMarisa::Range(std::string value, OpType op) {
TargetBitmap bitset(count);
std::vector<size_t> ids;
marisa::Agent agent;
bool in_lexico_order = in_lexicographic_order();
switch (op) {
case OpType::GreaterThan: {
while (trie_.predictive_search(agent)) {
auto key = std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
if (in_lexico_order) {
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
ids.push_back(agent.key().id());
break;
}
};
// since in lexicographic order, all following nodes is greater than value
while (trie_.predictive_search(agent)) {
ids.push_back(agent.key().id());
}
};
} else {
// lexicographic order is not guaranteed, check all values
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
ids.push_back(agent.key().id());
}
};
}
break;
}
case OpType::GreaterEqual: {
while (trie_.predictive_search(agent)) {
auto key = std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
if (in_lexico_order) {
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
ids.push_back(agent.key().id());
break;
}
};
// since in lexicographic order, all following nodes is greater than or equal value
while (trie_.predictive_search(agent)) {
ids.push_back(agent.key().id());
}
} else {
// lexicographic order is not guaranteed, check all values
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
ids.push_back(agent.key().id());
}
};
}
break;
}
case OpType::LessThan: {
while (trie_.predictive_search(agent)) {
auto key = std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
break;
if (in_lexico_order) {
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
break;
}
ids.push_back(agent.key().id());
}
ids.push_back(agent.key().id());
break;
} else {
// lexicographic order is not guaranteed, check all values
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key < value) {
ids.push_back(agent.key().id());
}
};
}
break;
}
case OpType::LessEqual: {
while (trie_.predictive_search(agent)) {
auto key = std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
break;
if (in_lexico_order) {
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
break;
}
ids.push_back(agent.key().id());
}
ids.push_back(agent.key().id());
break;
} else {
// lexicographic order is not guaranteed, check all values
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key <= value) {
ids.push_back(agent.key().id());
}
};
}
break;
}
default:
PanicInfo(
Expand Down Expand Up @@ -460,6 +520,8 @@ StringIndexMarisa::Range(std::string lower_bound_value,
return bitset;
}

bool in_lexico_oder = in_lexicographic_order();

auto common_prefix = GetCommonPrefix(lower_bound_value, upper_bound_value);
marisa::Agent agent;
agent.set_query(common_prefix.c_str());
Expand All @@ -469,7 +531,12 @@ StringIndexMarisa::Range(std::string lower_bound_value,
std::string_view(agent.key().ptr(), agent.key().length());
if (val > upper_bound_value ||
(!ub_inclusive && val == upper_bound_value)) {
break;
// we could only break when trie in lexicographic order.
if (in_lexico_oder) {
break;
} else {
continue;
}
}

if (val < lower_bound_value ||
Expand Down Expand Up @@ -561,4 +628,15 @@ StringIndexMarisa::Reverse_Lookup(size_t offset) const {
return std::string(agent.key().ptr(), agent.key().length());
}

bool
StringIndexMarisa::in_lexicographic_order() {
    // marisa builds tries in `MARISA_WEIGHT_ORDER` by default, in which case
    // `predictive_search` does not iterate keys in lexicographic order.
    // New indexes are built with `MARISA_LABEL_ORDER`, but an old index may
    // still be in weight order, so report the order this trie actually uses.
    return trie_.node_order() == MARISA_LABEL_ORDER;
}
} // namespace milvus::index
3 changes: 3 additions & 0 deletions internal/core/src/index/StringIndexMarisa.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ class StringIndexMarisa : public StringIndex {
std::vector<size_t>
prefix_match(const std::string_view prefix);

// Returns true when the underlying trie's node order is
// `MARISA_LABEL_ORDER`, i.e. `predictive_search` iterates keys in
// lexicographic order; returns false otherwise (e.g. an old index built
// in weight order).
bool
in_lexicographic_order();

void
LoadWithoutAssemble(const BinarySet& binary_set,
const Config& config) override;
Expand Down

0 comments on commit aeb576e

Please sign in to comment.