Skip to content

Commit

Permalink
enhance: Use MARISA_LABEL_ORDER when building trie index
Browse files Browse the repository at this point in the history
Related to milvus-io#35941
Previous PR: milvus-io#35943

This PR makes the `Trie` index use `MARISA_LABEL_ORDER`, which makes
predictive search iterate in lexicographic order.

When the trie index is built in label order, the lexicographic ordering
can be used to accelerate `Range` operations.

However, according to the official documentation, using `MARISA_LABEL_ORDER`
will make "exact match lookup, common prefix search, and predictive search"
slower.

Signed-off-by: Congqi Xia <[email protected]>
  • Loading branch information
congqixia committed Sep 6, 2024
1 parent 55b33cd commit d317c5d
Showing 1 changed file with 87 additions and 21 deletions.
108 changes: 87 additions & 21 deletions internal/core/src/index/StringIndexMarisa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "index/StringIndexMarisa.h"
#include "index/Utils.h"
#include "index/Index.h"
#include "marisa/base.h"
#include "storage/Util.h"
#include "storage/space.h"

Expand Down Expand Up @@ -151,7 +152,7 @@ StringIndexMarisa::BuildWithFieldData(
}
total_num_rows += slice_num;
}
trie_.build(keyset);
trie_.build(keyset, MARISA_LABEL_ORDER);

// fill str_ids_
str_ids_.resize(total_num_rows);
Expand Down Expand Up @@ -186,7 +187,7 @@ StringIndexMarisa::Build(size_t n, const std::string* values) {
}
}

trie_.build(keyset);
trie_.build(keyset, MARISA_LABEL_ORDER);
fill_str_ids(n, values);
fill_offsets();

Expand Down Expand Up @@ -393,44 +394,109 @@ StringIndexMarisa::Range(std::string value, OpType op) {
TargetBitmap bitset(count);
std::vector<size_t> ids;
marisa::Agent agent;
bool inLexicoOrder = false;
// by default, marisa trie uses `MARISA_WEIGHT_ORDER` to build trie
// so `predictive_search` will not iterate in lexicographic order
// now we build trie using `MARISA_LABEL_ORDER` and also handle old index in weight order.
if (trie_.node_order() == MARISA_LABEL_ORDER) {
inLexicoOrder = true;
}
switch (op) {
case OpType::GreaterThan: {
while (trie_.predictive_search(agent)) {
auto key = std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
if (inLexicoOrder) {
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
ids.push_back(agent.key().id());
break;
}
};
// since in lexicographic order, all following nodes is greater than value
while (trie_.predictive_search(agent)) {
ids.push_back(agent.key().id());
}
};
} else {
// lexicographic order is not guaranteed, check all values
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
ids.push_back(agent.key().id());
}
};
}
break;
}
case OpType::GreaterEqual: {
while (trie_.predictive_search(agent)) {
auto key = std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
if (inLexicoOrder) {
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
ids.push_back(agent.key().id());
break;
}
};
// since in lexicographic order, all following nodes is greater than or equal value
while (trie_.predictive_search(agent)) {
ids.push_back(agent.key().id());
}
} else {
// lexicographic order is not guaranteed, check all values
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
ids.push_back(agent.key().id());
}
};
}
break;
}
case OpType::LessThan: {
while (trie_.predictive_search(agent)) {
auto key = std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
break;
if (inLexicoOrder) {
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key >= value) {
break;
}
ids.push_back(agent.key().id());
}
ids.push_back(agent.key().id());
break;
} else {
// lexicographic order is not guaranteed, check all values
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key < value) {
ids.push_back(agent.key().id());
}
};
}
break;
}
case OpType::LessEqual: {
while (trie_.predictive_search(agent)) {
auto key = std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
break;
if (inLexicoOrder) {
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key > value) {
break;
}
ids.push_back(agent.key().id());
}
ids.push_back(agent.key().id());
break;
} else {
// lexicographic order is not guaranteed, check all values
while (trie_.predictive_search(agent)) {
auto key =
std::string(agent.key().ptr(), agent.key().length());
if (key <= value) {
ids.push_back(agent.key().id());
}
};
}
break;
}
default:
PanicInfo(
Expand Down

0 comments on commit d317c5d

Please sign in to comment.