Skip to content

Commit

Permalink
Support of shift_ids in merge_from_multiple function of OnDiskInverted…
Browse files Browse the repository at this point in the history
…Lists (facebookresearch#3327)

Summary:

**Context**
1. [Issue 2621](facebookresearch#2621) discusses the inconsistency between OnDiskInvertedLists and InvertedLists. OnDiskInvertedLists is supposed to handle multiple disk-based index shards. Thus, its merge function should be named differently when merging invlists from index shards.
2. [Issue 2876](facebookresearch#2876) provides usecase of shifting ids when merging invls from different shards.

**In this diff**,
1. To address point 1 above, I renamed the merge_from function to merge_from_multiple without touching the merge_from function of the base class.
Why? So that merging the invlists of one index into the OnDiskInvertedLists of another index continues to be allowed.

2. To address point 2 above, I have added support for shift_ids in merge_from_multiple to shift the ids coming from different shards. This can be used when each shard has the same set of ids but different data. It is not recommended if the ids are already unique across shards.

Differential Revision: D55482518
  • Loading branch information
kuarora authored and facebook-github-bot committed Mar 28, 2024
1 parent 03db694 commit b741509
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 8 deletions.
2 changes: 1 addition & 1 deletion faiss/invlists/InvertedLists.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ struct InvertedLists {
* high level functions */

/// move all entries from oivf (empty on output)
void merge_from(InvertedLists* oivf, size_t add_id);
virtual void merge_from(InvertedLists* oivf, size_t add_id);

// how to copy a subset of elements from the inverted lists
// This depends on two integers, a1 and a2.
Expand Down
24 changes: 20 additions & 4 deletions faiss/invlists/OnDiskInvertedLists.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -565,22 +565,28 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) {
/*****************************************
* Compact form
*****************************************/

size_t OnDiskInvertedLists::merge_from(
size_t OnDiskInvertedLists::merge_from_multiple(
const InvertedLists** ils,
int n_il,
bool shift_ids,
bool verbose) {
FAISS_THROW_IF_NOT_MSG(
totsize == 0, "works only on an empty InvertedLists");

std::vector<size_t> sizes(nlist);
std::vector<size_t> shift_id_offsets(n_il);
for (int i = 0; i < n_il; i++) {
const InvertedLists* il = ils[i];
FAISS_THROW_IF_NOT(il->nlist == nlist && il->code_size == code_size);

size_t il_totsize = 0;
for (size_t j = 0; j < nlist; j++) {
sizes[j] += il->list_size(j);
il_totsize += il->list_size(j);
}

shift_id_offsets[i] =
(shift_ids && i > 0) ? shift_id_offsets[i - 1] + il_totsize : 0;
}

size_t cums = 0;
Expand All @@ -605,11 +611,21 @@ size_t OnDiskInvertedLists::merge_from(
const InvertedLists* il = ils[i];
size_t n_entry = il->list_size(j);
l.size += n_entry;
ScopedIds scope_ids(il, j);
const idx_t* scope_ids_data = scope_ids.get();
std::vector<idx_t> new_ids;
if (shift_ids) {
new_ids.resize(n_entry);
for (size_t k = 0; k < n_entry; k++) {
new_ids[k] = scope_ids[k] + shift_id_offsets[i];
}
scope_ids_data = new_ids.data();
}
update_entries(
j,
l.size - n_entry,
n_entry,
ScopedIds(il, j).get(),
scope_ids_data,
ScopedCodes(il, j).get());
}
assert(l.size == l.capacity);
Expand Down Expand Up @@ -638,7 +654,7 @@ size_t OnDiskInvertedLists::merge_from(
/// Single-list convenience wrapper around merge_from_multiple().
///
/// @param ils      the inverted lists to copy into *this
/// @param verbose  forwarded as the `verbose` argument of merge_from_multiple
/// @return         the value returned by merge_from_multiple
size_t OnDiskInvertedLists::merge_from_1(
        const InvertedLists* ils,
        bool verbose) {
    // The third parameter of merge_from_multiple is shift_ids, so verbose has
    // to be passed in fourth position; passing it third would silently drop
    // the verbose request. With a single input list the id offset is always
    // 0, so shift_ids is explicitly disabled.
    return merge_from_multiple(&ils, 1, /*shift_ids=*/false, verbose);
}

void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1) {
Expand Down
3 changes: 2 additions & 1 deletion faiss/invlists/OnDiskInvertedLists.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,10 @@ struct OnDiskInvertedLists : InvertedLists {

// copy all inverted lists into *this, in compact form (without
// allocating slots)
size_t merge_from(
size_t merge_from_multiple(
const InvertedLists** ils,
int n_il,
bool shift_ids = false,
bool verbose = false);

/// same as merge_from_multiple for a single invlist
Expand Down
28 changes: 26 additions & 2 deletions tests/test_merge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ size_t nq = 100;
int nindex = 4;
int k = 10;
int nlist = 40;
int shard_size = nb / nindex;

struct CommonData {
std::vector<float> database;
Expand Down Expand Up @@ -100,7 +101,7 @@ int compare_merged(
auto il = new faiss::OnDiskInvertedLists(
index0->nlist, index0->code_size, filename.c_str());

il->merge_from(lists.data(), lists.size());
il->merge_from_multiple(lists.data(), lists.size(), shift_ids);

index0->replace_invlists(il, true);
index0->ntotal = ntotal;
Expand All @@ -110,11 +111,14 @@ int compare_merged(
nq, cd.queries.data(), k, newD.data(), newI.data());

size_t ndiff = 0;
bool adjust_ids = shift_ids && !standard_merge;
for (size_t i = 0; i < k * nq; i++) {
if (refI[i] != newI[i]) {
idx_t new_id = adjust_ids ? refI[i] % shard_size : refI[i];
if (refI[i] != new_id) {
ndiff++;
}
}

return ndiff;
}

Expand Down Expand Up @@ -220,3 +224,23 @@ TEST(MERGE, merge_flat_ondisk_2) {
int ndiff = compare_merged(&index_shards, false, false);
EXPECT_GE(0, ndiff);
}

// now use ondisk specific merge and use shift ids
TEST(MERGE, merge_flat_ondisk_3) {
    // Ids deliberately repeat: row r gets id (r % shard_size), so the id
    // values alone are not unique over the nb database rows.
    std::vector<idx_t> repeated_ids;
    for (int row = 0; row < nb; ++row) {
        repeated_ids.push_back(row % shard_size);
    }

    faiss::IndexShards shards(d, false, false);
    // The shard container takes ownership of the IVF indexes allocated below.
    shards.own_indices = true;
    for (int shard_no = 0; shard_no != nindex; ++shard_no) {
        auto* shard = new faiss::IndexIVFFlat(&cd.quantizer, d, nlist);
        shards.add_shard(shard);
    }
    EXPECT_TRUE(shards.is_trained);

    shards.add_with_ids(nb, cd.database.data(), repeated_ids.data());

    // NOTE(review): the two flags are presumably (shift_ids = true,
    // standard_merge = false) -- confirm against compare_merged's signature.
    const int ndiff = compare_merged(&shards, true, false);
    EXPECT_GE(0, ndiff);
}

0 comments on commit b741509

Please sign in to comment.