Skip to content

Commit

Permalink
update diskann config for config check (#896)
Browse files Browse the repository at this point in the history
Signed-off-by: xianliang.li <[email protected]>
  • Loading branch information
foxspy authored Oct 16, 2024
1 parent db498ab commit 538e416
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 1 deletion.
6 changes: 6 additions & 0 deletions include/knowhere/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,8 @@ class BaseConfig : public Config {
CFG_BOOL retrieve_friendly;
CFG_STRING data_path;
CFG_STRING index_prefix;
// the size of the raw vector data
CFG_FLOAT vec_field_size_gb;
// for distance metrics, we search for vectors with distance in [range_filter, radius).
// for similarity metrics, we search for vectors with similarity in (radius, range_filter].
CFG_FLOAT radius;
Expand Down Expand Up @@ -559,6 +561,10 @@ class BaseConfig : public Config {
.allow_empty_without_default()
.for_train()
.for_deserialize();
KNOWHERE_CONFIG_DECLARE_FIELD(vec_field_size_gb)
.description("the size (in GB) of the raw vector data.")
.set_default(0)
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(k)
.set_default(10)
.description("search for top k similar vector.")
Expand Down
30 changes: 29 additions & 1 deletion src/index/diskann/diskann_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ class DiskANNConfig : public BaseConfig {
// complexity. Please set this value larger than max_degree unless you need to build indices really quickly and can
// somewhat compromise on quality.
CFG_INT search_list_size;

// The ratio of the size reserved for the pq code to the size of the raw data (defined with vec_field_size_gb)
// This parameter will replace pq_code_budget_gb to avoid calculating the actual size on the Milvus side.
// The index can indirectly obtain pq_code_budget_gb by vec_field_size_gb * pq_code_budget_gb_ratio
CFG_FLOAT pq_code_budget_gb_ratio;
// Limit the size of the PQ code after the raw vector has been PQ-encoded. PQ code is a ((pq_code_budget_gb * 1024 *
// 1024 * 1024) / row_num)-dimensional uint8 vector. If pq_code_budget_gb is too large, it will be adjusted to the
// size of dim*row_num.
Expand All @@ -50,6 +55,12 @@ class DiskANNConfig : public BaseConfig {
// This is the flag to enable fast build, in which we will not build vamana graph by full 2 round. This can
// accelerate index build ~30% with an ~1% recall regression.
CFG_BOOL accelerate_build;

// The ratio of the size reserved for the search cache to the size of the raw data (defined with vec_field_size_gb)
// This parameter will replace search_cache_budget_gb to avoid calculating the actual size on the Milvus side.
// The index can indirectly obtain search_cache_budget_gb by vec_field_size_gb * search_cache_budget_gb_ratio
CFG_FLOAT search_cache_budget_gb_ratio;

// While serving the index, the entire graph is stored on SSD. For faster search performance, you can cache a few
// frequently accessed nodes in memory.
CFG_FLOAT search_cache_budget_gb;
Expand Down Expand Up @@ -86,12 +97,19 @@ class DiskANNConfig : public BaseConfig {
.for_search()
.for_range_search()
.for_iterator();
KNOWHERE_CONFIG_DECLARE_FIELD(pq_code_budget_gb_ratio)
.description("the size of PQ compared with vector field data")
.set_default(0)
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max())
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(pq_code_budget_gb)
.description("the size of PQ compressed representation in GB.")
.description("the ratio of the size reserved for the pq code to the size of the raw data.")
.set_default(0)
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max())
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(build_dram_budget_gb)
.description("limit on the memory allowed for building the index in GB.")
.set_default(0)
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max())
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(disk_pq_dims)
Expand All @@ -102,6 +120,12 @@ class DiskANNConfig : public BaseConfig {
.description("a flag to enbale fast build.")
.set_default(false)
.for_train();
KNOWHERE_CONFIG_DECLARE_FIELD(search_cache_budget_gb_ratio)
.description("the ratio of the size reserved for the search cache to the size of the raw data.")
.set_default(0)
.set_range(0, std::numeric_limits<CFG_FLOAT::value_type>::max())
.for_train()
.for_deserialize();
KNOWHERE_CONFIG_DECLARE_FIELD(search_cache_budget_gb)
.description("the size of cached nodes in GB.")
.set_default(0)
Expand Down Expand Up @@ -148,6 +172,10 @@ class DiskANNConfig : public BaseConfig {
if (!search_list_size.has_value()) {
search_list_size = kDefaultSearchListSizeForBuild;
}
pq_code_budget_gb =
std::max(pq_code_budget_gb.value(), pq_code_budget_gb_ratio.value() * vec_field_size_gb.value());
search_cache_budget_gb = std::max(search_cache_budget_gb.value(),
search_cache_budget_gb_ratio.value() * vec_field_size_gb.value());
break;
}
case PARAM_TYPE::SEARCH: {
Expand Down
46 changes: 46 additions & 0 deletions tests/ut/test_diskann.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,52 @@ WriteRawDataToDisk(const std::string data_path, const DataType* raw_data, const
}

} // namespace
// Checks that loading DiskANN build (TRAIN) parameters resolves each budget to
// max(explicitly configured budget, ratio * vec_field_size_gb).
TEST_CASE("Valid diskann build params test", "[diskann]") {
    const int row_count = 1000000;
    // Raw vector field size (in GB) passed to the config; the ratio-derived budgets are multiples of it.
    const float vec_field_size_gb = 1.0f;
    auto version = GenTestVersionList();

    // The same generated ratio is used for both the PQ-code ratio and the search-cache ratio.
    auto ratio = GENERATE(as<float>{}, 0.01, 0.1, 0.125);

    // Explicit budgets in GB, computed as fixed fractions (0.125 and 0.05) of the float32 raw data size.
    float explicit_pq_budget_gb = sizeof(float) * kDim * row_count * 0.125 / (1024 * 1024 * 1024);
    float explicit_cache_budget_gb = sizeof(float) * kDim * row_count * 0.05 / (1024 * 1024 * 1024);

    // Builds a build-parameter JSON carrying both the explicit budgets and their ratio counterparts.
    auto make_build_params = [&]() {
        knowhere::Json params;
        params["dim"] = kDim;
        params["metric_type"] = "L2";
        params["k"] = 100;
        params["index_prefix"] = kL2IndexPrefix;
        params["data_path"] = kRawDataPath;
        params["max_degree"] = 24;
        params["search_list_size"] = 64;
        params["vec_field_size_gb"] = vec_field_size_gb;
        params["pq_code_budget_gb_ratio"] = ratio;
        params["pq_code_budget_gb"] = explicit_pq_budget_gb;
        params["build_dram_budget_gb"] = 32.0;
        params["search_cache_budget_gb_ratio"] = ratio;
        params["search_cache_budget_gb"] = explicit_cache_budget_gb;
        params["beamwidth"] = 8;
        params["min_k"] = 10;
        params["max_k"] = 8000;
        return params;
    };

    SECTION("Dynamic param check") {
        knowhere::Json build_json = make_build_params();
        auto cfg = knowhere::IndexStaticFaced<float>::CreateConfig(knowhere::IndexEnum::INDEX_DISKANN, version);
        std::string err_msg;

        // Validate the JSON first, then load it as TRAIN parameters.
        const auto check_status = knowhere::Config::FormatAndCheck(*cfg, build_json, &err_msg);
        REQUIRE(check_status == knowhere::Status::success);
        const auto load_status = knowhere::Config::Load(*cfg, build_json, knowhere::PARAM_TYPE::TRAIN, &err_msg);
        REQUIRE(load_status == knowhere::Status::success);

        // After loading, each budget must equal the larger of the explicit and the ratio-derived value.
        const auto& disk_cfg = static_cast<const knowhere::DiskANNConfig&>(*cfg);
        const float expected_pq_budget_gb = std::max(explicit_pq_budget_gb, vec_field_size_gb * ratio);
        const float expected_cache_budget_gb = std::max(explicit_cache_budget_gb, vec_field_size_gb * ratio);
        REQUIRE(disk_cfg.pq_code_budget_gb == expected_pq_budget_gb);
        REQUIRE(disk_cfg.search_cache_budget_gb == expected_cache_budget_gb);
    }
}

TEST_CASE("Invalid diskann params test", "[diskann]") {
fs::remove_all(kDir);
Expand Down

0 comments on commit 538e416

Please sign in to comment.