Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update diskann config for config check #896

Merged
merged 1 commit into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/knowhere/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,8 @@ class BaseConfig : public Config {
CFG_BOOL retrieve_friendly;
CFG_STRING data_path;
CFG_STRING index_prefix;
// the size of the raw vector data
CFG_FLOAT vec_field_size_gb;
// for distance metrics, we search for vectors with distance in [range_filter, radius).
// for similarity metrics, we search for vectors with similarity in (radius, range_filter].
CFG_FLOAT radius;
Expand Down Expand Up @@ -559,6 +561,10 @@ class BaseConfig : public Config {
.allow_empty_without_default()
.for_train()
.for_deserialize();
KNOWHERE_CONFIG_DECLARE_FIELD(vec_field_size_gb)
.description("the size (in GB) of the raw vector data.")
.set_default(0)
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(k)
.set_default(10)
.description("search for top k similar vector.")
Expand Down
30 changes: 29 additions & 1 deletion src/index/diskann/diskann_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ class DiskANNConfig : public BaseConfig {
// complexity. Please set this value larger than the max_degree unless you need to build indices really quickly and
// can somewhat compromise on quality.
CFG_INT search_list_size;

// The ratio of the size reserved for the pq code to the size of the raw data (defined with vec_field_size_gb)
// This parameter will replace pq_code_budget_gb to avoid calculating the actual size on the Milvus side.
// The index can indirectly obtain pq_code_budget_gb by vec_field_size_gb * pq_code_budget_gb_ratio
CFG_FLOAT pq_code_budget_gb_ratio;
// Limit the size of the PQ code after the raw vector has been PQ-encoded. PQ code is a ((pq_code_budget_gb * 1024 *
// 1024 * 1024) / row_num)-dimensional uint8 vector. If pq_code_budget_gb is too large, it will be adjusted to the
// size of dim*row_num.
Expand All @@ -50,6 +55,12 @@ class DiskANNConfig : public BaseConfig {
// This is the flag to enable fast build, in which we will not build vamana graph by full 2 round. This can
// accelerate index build ~30% with an ~1% recall regression.
CFG_BOOL accelerate_build;

// The ratio of the size reserved for the search cache to the size of the raw data (defined with vec_field_size_gb)
// This parameter will replace search_cache_budget_gb to avoid calculating the actual size on the Milvus side.
// The index can indirectly obtain search_cache_budget_gb by vec_field_size_gb * search_cache_budget_gb_ratio
CFG_FLOAT search_cache_budget_gb_ratio;
foxspy marked this conversation as resolved.
Show resolved Hide resolved

// While serving the index, the entire graph is stored on SSD. For faster search performance, you can cache a few
// frequently accessed nodes in memory.
CFG_FLOAT search_cache_budget_gb;
Expand Down Expand Up @@ -86,12 +97,19 @@ class DiskANNConfig : public BaseConfig {
.for_search()
.for_range_search()
.for_iterator();
KNOWHERE_CONFIG_DECLARE_FIELD(pq_code_budget_gb_ratio)
.description("the size of PQ compared with vector field data")
.set_default(0)
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max())
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(pq_code_budget_gb)
.description("the size of PQ compressed representation in GB.")
.description("the ratio of the size reserved for the pq code to the size of the raw data.")
.set_default(0)
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max())
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(build_dram_budget_gb)
.description("limit on the memory allowed for building the index in GB.")
.set_default(0)
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max())
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(disk_pq_dims)
Expand All @@ -102,6 +120,12 @@ class DiskANNConfig : public BaseConfig {
.description("a flag to enbale fast build.")
.set_default(false)
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(search_cache_budget_gb_ratio)
.description("the ratio of the size reserved for the search cache to the size of the raw data.")
.set_default(0)
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max())
.for_train()
.for_deserialize();
KNOWHERE_CONFIG_DECLARE_FIELD(search_cache_budget_gb)
.description("the size of cached nodes in GB.")
.set_default(0)
Expand Down Expand Up @@ -148,6 +172,10 @@ class DiskANNConfig : public BaseConfig {
if (!search_list_size.has_value()) {
search_list_size = kDefaultSearchListSizeForBuild;
}
pq_code_budget_gb =
std::max(pq_code_budget_gb.value(), pq_code_budget_gb_ratio.value() * vec_field_size_gb.value());
search_cache_budget_gb = std::max(search_cache_budget_gb.value(),
search_cache_budget_gb_ratio.value() * vec_field_size_gb.value());
foxspy marked this conversation as resolved.
Show resolved Hide resolved
break;
}
case PARAM_TYPE::SEARCH: {
Expand Down
46 changes: 46 additions & 0 deletions tests/ut/test_diskann.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,52 @@ WriteRawDataToDisk(const std::string data_path, const DataType* raw_data, const
}

} // namespace
// Checks that loading DiskANN build (TRAIN) parameters yields, for both the PQ-code budget and the
// search-cache budget, the larger of the explicitly supplied size and `ratio * vec_field_size_gb`.
TEST_CASE("Valid diskann build params test", "[diskann]") {
    const int rows_num = 1000000;
    auto version = GenTestVersionList();

    // The test body is executed once for every generated ratio value.
    auto ratio = GENERATE(as<float>{}, 0.01, 0.1, 0.125);

    // Size in GB of `fraction` of a raw dataset holding rows_num float vectors of kDim dimensions.
    const auto fraction_of_raw_data_gb = [&](double fraction) {
        return static_cast<float>(sizeof(float) * kDim * rows_num * fraction / (1024 * 1024 * 1024));
    };
    const float explicit_pq_budget_gb = fraction_of_raw_data_gb(0.125);
    const float explicit_cache_budget_gb = fraction_of_raw_data_gb(0.05);

    SECTION("Dynamic param check") {
        // Build parameters carrying both the explicit budgets and the ratio-based ones.
        knowhere::Json build_params;
        build_params["dim"] = kDim;
        build_params["metric_type"] = "L2";
        build_params["k"] = 100;
        build_params["index_prefix"] = kL2IndexPrefix;
        build_params["data_path"] = kRawDataPath;
        build_params["max_degree"] = 24;
        build_params["search_list_size"] = 64;
        build_params["vec_field_size_gb"] = 1.0;
        build_params["pq_code_budget_gb_ratio"] = ratio;
        build_params["pq_code_budget_gb"] = explicit_pq_budget_gb;
        build_params["build_dram_budget_gb"] = 32.0;
        build_params["search_cache_budget_gb_ratio"] = ratio;
        build_params["search_cache_budget_gb"] = explicit_cache_budget_gb;
        build_params["beamwidth"] = 8;
        build_params["min_k"] = 10;
        build_params["max_k"] = 8000;

        auto cfg = knowhere::IndexStaticFaced<float>::CreateConfig(knowhere::IndexEnum::INDEX_DISKANN, version);
        std::string msg;

        // The parameters must pass validation and then load successfully as TRAIN parameters.
        auto status = knowhere::Config::FormatAndCheck(*cfg, build_params, &msg);
        REQUIRE(status == knowhere::Status::success);
        status = knowhere::Config::Load(*cfg, build_params, knowhere::PARAM_TYPE::TRAIN, &msg);
        REQUIRE(status == knowhere::Status::success);

        // vec_field_size_gb is 1.0 above, so the ratio-derived budget is 1.0f * ratio.
        const float expected_pq_budget_gb = std::max(explicit_pq_budget_gb, 1.0f * ratio);
        const float expected_cache_budget_gb = std::max(explicit_cache_budget_gb, 1.0f * ratio);

        const auto disk_cfg = static_cast<const knowhere::DiskANNConfig&>(*cfg);
        REQUIRE(disk_cfg.pq_code_budget_gb == expected_pq_budget_gb);
        REQUIRE(disk_cfg.search_cache_budget_gb == expected_cache_budget_gb);
    }
}

TEST_CASE("Invalid diskann params test", "[diskann]") {
fs::remove_all(kDir);
Expand Down
Loading