Skip to content

Commit

Permalink
Introduce a setting to control whether to build graph
Browse files Browse the repository at this point in the history
Added a new updatable index setting, "build_vector_data_structure_threshold", which is
considered when deciding whether or not to build the graph for native engines.
This is a no-op for Lucene. It depends on the "use lucene format" feature as a prerequisite.
We don't need to add a flag, since the setting only takes effect if the Lucene format is
already enabled.

Signed-off-by: Vijayan Balasubramanian <[email protected]>
  • Loading branch information
VijayanB committed Sep 4, 2024
1 parent d4af93e commit 25df982
Show file tree
Hide file tree
Showing 7 changed files with 231 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
* Add support for byte vector with Faiss Engine IVF algorithm [#2002](https://github.com/opensearch-project/k-NN/pull/2002)
### Enhancements
* Adds iterative graph build capability into a faiss index to improve the memory footprint during indexing and Integrates KNNVectorsFormat for native engines[#1950](https://github.com/opensearch-project/k-NN/pull/1950)
* Introduce new setting to configure whether to build vector data structure or not during segment creation [#2007](https://github.com/opensearch-project/k-NN/pull/2007)
### Bug Fixes
* Corrected search logic for scenario with non-existent fields in filter [#1874](https://github.com/opensearch-project/k-NN/pull/1874)
* Add script_fields context to KNNAllowlist [#1917] (https://github.com/opensearch-project/k-NN/pull/1917)
Expand Down
20 changes: 20 additions & 0 deletions src/main/java/org/opensearch/knn/index/KNNSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public class KNNSettings {
* Settings name
*/
public static final String KNN_SPACE_TYPE = "index.knn.space_type";
public static final String INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD = "index.knn.build_vector_data_structure_threshold";
public static final String KNN_ALGO_PARAM_M = "index.knn.algo_param.m";
public static final String KNN_ALGO_PARAM_EF_CONSTRUCTION = "index.knn.algo_param.ef_construction";
public static final String KNN_ALGO_PARAM_EF_SEARCH = "index.knn.algo_param.ef_search";
Expand All @@ -91,6 +92,9 @@ public class KNNSettings {
*/
public static final boolean KNN_DEFAULT_FAISS_AVX2_DISABLED_VALUE = false;
public static final String INDEX_KNN_DEFAULT_SPACE_TYPE = "l2";
public static final Integer INDEX_KNN_DEFAULT_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD = 0;
public static final Integer INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_MIN = -1;
public static final Integer INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_MAX = Integer.MAX_VALUE - 2;
public static final String INDEX_KNN_DEFAULT_SPACE_TYPE_FOR_BINARY = "hamming";
public static final Integer INDEX_KNN_DEFAULT_ALGO_PARAM_M = 16;
public static final Integer INDEX_KNN_DEFAULT_ALGO_PARAM_EF_SEARCH = 100;
Expand Down Expand Up @@ -130,6 +134,21 @@ public class KNNSettings {
Setting.Property.Deprecated
);

/**
 * build_vector_data_structure_threshold - This parameter determines when to build the vector data structure for knn
 * fields during indexing and merging. Setting it to -1 (the minimum) skips building the graph. For any other value,
 * the graph is built when the number of live docs in the segment is greater than or equal to this threshold, so the
 * default of 0 always builds the graph. Since the max number of documents in a segment can be Integer.MAX_VALUE - 1,
 * this setting allows the threshold to be at most 1 less than the max number of documents in a segment.
 * The setting is index-scoped and dynamic.
 */
public static final Setting<Integer> INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_SETTING = Setting.intSetting(
INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD,
INDEX_KNN_DEFAULT_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD,
INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_MIN,
INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_MAX,
IndexScope,
Dynamic
);

/**
* M - the number of bi-directional links created for every new element during construction.
* Reasonable range for M is 2-100. Higher M work better on datasets with high intrinsic
Expand Down Expand Up @@ -446,6 +465,7 @@ private Setting<?> getSetting(String key) {
public List<Setting<?>> getSettings() {
List<Setting<?>> settings = Arrays.asList(
INDEX_KNN_SPACE_TYPE,
INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_SETTING,
INDEX_KNN_ALGO_PARAM_M_SETTING,
INDEX_KNN_ALGO_PARAM_EF_CONSTRUCTION_SETTING,
INDEX_KNN_ALGO_PARAM_EF_SEARCH_SETTING,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.mapper.MapperService;
import org.opensearch.knn.index.KNNSettings;
import org.opensearch.knn.index.codec.KNN990Codec.NativeEngines990KnnVectorsFormat;
import org.opensearch.knn.index.codec.params.KNNScalarQuantizedVectorsFormatParams;
import org.opensearch.knn.index.codec.params.KNNVectorsFormatParams;
Expand Down Expand Up @@ -129,7 +131,21 @@ public KnnVectorsFormat getKnnVectorsFormatForField(final String field) {
}

/**
 * Creates the vectors format used for native-engine fields, passing along the index's
 * build_vector_data_structure_threshold value.
 *
 * @return a {@link NativeEngines990KnnVectorsFormat} wrapping a Lucene99 flat vectors format
 */
private NativeEngines990KnnVectorsFormat nativeEngineVectorsFormat() {
    // NOTE(review): mapperService.get() is called without a presence check here; confirm the caller guarantees a value.
    final int buildVectorDatastructureThreshold = getBuildVectorDatastructureThresholdSetting(mapperService.get());
    return new NativeEngines990KnnVectorsFormat(
        new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()),
        buildVectorDatastructureThreshold
    );
}

/**
 * Looks up the build_vector_data_structure_threshold index setting through the given mapper service.
 *
 * @param knnMapperService mapper service whose index settings are consulted
 * @return the value held by the index settings, or the default threshold when the lookup yields {@code null}
 */
private int getBuildVectorDatastructureThresholdSetting(final MapperService knnMapperService) {
    final IndexSettings settingsOfIndex = knnMapperService.getIndexSettings();
    final Integer configuredThreshold = settingsOfIndex.getValue(KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_SETTING);
    if (configuredThreshold == null) {
        return KNNSettings.INDEX_KNN_DEFAULT_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD;
    }
    return configuredThreshold;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.opensearch.knn.index.KNNSettings;

import java.io.IOException;

Expand All @@ -30,15 +31,20 @@ public class NativeEngines990KnnVectorsFormat extends KnnVectorsFormat {
/**
 * The format for storing, reading, merging vectors on disk. Held per instance: a static field
 * written from the constructor would be overwritten by whichever format was constructed last.
 */
private final FlatVectorsFormat flatVectorsFormat;
private static final String FORMAT_NAME = "NativeEngines990KnnVectorsFormat";
/** Threshold handed to every writer created by this format instance; per instance for the same reason. */
private final int buildVectorDatastructureThreshold;

/**
 * Creates the format with a Lucene99 flat vectors format backed by the default flat vector scorer
 * and the default build threshold.
 */
public NativeEngines990KnnVectorsFormat() {
    this(new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer()));
}

/**
 * Creates the format with the given flat vectors format and the default build threshold.
 *
 * @param flatVectorsFormat format used to store, read and merge the flat vectors
 */
public NativeEngines990KnnVectorsFormat(final FlatVectorsFormat flatVectorsFormat) {
    this(flatVectorsFormat, KNNSettings.INDEX_KNN_DEFAULT_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD);
}

/**
 * Creates the format with the given flat vectors format and build threshold.
 *
 * @param flatVectorsFormat format used to store, read and merge the flat vectors
 * @param buildVectorDatastructureThreshold threshold passed on to the writers created by this format
 */
public NativeEngines990KnnVectorsFormat(final FlatVectorsFormat flatVectorsFormat, int buildVectorDatastructureThreshold) {
    super(FORMAT_NAME);
    this.flatVectorsFormat = flatVectorsFormat;
    this.buildVectorDatastructureThreshold = buildVectorDatastructureThreshold;
}

/**
Expand All @@ -48,7 +54,7 @@ public NativeEngines990KnnVectorsFormat(final FlatVectorsFormat lucene99FlatVect
*/
@Override
public KnnVectorsWriter fieldsWriter(final SegmentWriteState state) throws IOException {
    // The threshold is handed to the writer together with the flat vectors writer for this segment.
    return new NativeEngines990KnnVectorsWriter(state, flatVectorsFormat.fieldsWriter(state), buildVectorDatastructureThreshold);
}

/**
Expand All @@ -63,6 +69,12 @@ public KnnVectorsReader fieldsReader(final SegmentReadState state) throws IOExce

/**
 * Describes this format: simple class name, the wrapped flat vectors format, and the build threshold.
 *
 * @return a human-readable description of this format
 */
@Override
public String toString() {
    return "NativeEngines99KnnVectorsFormat(name="
        + this.getClass().getSimpleName()
        + ", flatVectorsFormat="
        + flatVectorsFormat
        // "=" added so the threshold renders as key=value like the other entries
        + ", buildVectorDatastructureThreshold="
        + buildVectorDatastructureThreshold
        + ")";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

package org.opensearch.knn.index.codec.KNN990Codec;

import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.extern.log4j.Log4j2;
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
Expand Down Expand Up @@ -54,6 +55,8 @@ public class NativeEngines990KnnVectorsWriter extends KnnVectorsWriter {
private final SegmentWriteState segmentWriteState;
private final FlatVectorsWriter flatVectorsWriter;
private final List<NativeEngineFieldVectorsWriter<?>> fields = new ArrayList<>();
@NonNull
private final Integer buildVectorDataStructureThreshold;
private boolean finished;
private final QuantizationService quantizationService = QuantizationService.getInstance();

Expand Down Expand Up @@ -245,12 +248,22 @@ private <T, C> void trainAndIndex(
: NativeIndexWriter.getWriter(fieldInfo, segmentWriteState);

knnVectorValues = vectorValuesRetriever.apply(vectorDataType, fieldInfo, VectorProcessingContext);

if (shouldSkipBuildingVectorDataStructure(knnVectorValues.totalLiveDocs())) {
log.info("Skip building vector data structure for field: ", fieldInfo.name);
return;
}
StopWatch stopWatch = new StopWatch();
stopWatch.start();
indexOperation.buildAndWrite(writer, knnVectorValues);
long time_in_millis = stopWatch.totalTime().millis();
graphBuildTime.incrementBy(time_in_millis);
log.warn("Graph build took " + time_in_millis + " ms for " + operationName);
}

/**
 * Decides whether building the vector data structure should be skipped.
 *
 * @param docCount number of docs to compare against the threshold
 * @return {@code true} when the threshold is negative or {@code docCount} is below the threshold
 */
private boolean shouldSkipBuildingVectorDataStructure(final long docCount) {
    // A negative threshold short-circuits: nothing is built regardless of the doc count.
    return buildVectorDataStructureThreshold < 0 || docCount < buildVectorDataStructureThreshold;
}
}
150 changes: 150 additions & 0 deletions src/test/java/org/opensearch/knn/index/OpenSearchIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.google.common.primitives.Floats;
import java.util.Locale;
import lombok.SneakyThrows;
import org.apache.hc.core5.http.ParseException;
import org.junit.BeforeClass;
import org.opensearch.knn.KNNRestTestCase;
import org.opensearch.knn.KNNResult;
Expand All @@ -41,6 +42,8 @@
import java.util.TreeMap;

import static org.hamcrest.Matchers.containsString;
import static org.opensearch.knn.index.KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_MAX;
import static org.opensearch.knn.index.KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_MIN;

public class OpenSearchIT extends KNNRestTestCase {

Expand Down Expand Up @@ -483,4 +486,151 @@ public void testIndexingVectorValidation_updateVectorWithNull() throws Exception
assertArrayEquals(vectorForDocumentOne, vectorRestoreInitialValue);
}

/**
 * Creates an index with a random in-range build_vector_data_structure_threshold and checks that
 * reading the setting back by name returns the same value.
 */
public void testKNNIndex_whenBuildGraphThresholdIsPresent_thenGetThresholdValue() throws Exception {
    final String indexName = "test-index-with-build-graph-settings";
    final Integer expectedThreshold = randomIntBetween(
        INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_MIN,
        INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD_MAX
    );
    final Settings indexSettings = Settings.builder().put(buildKNNIndexSettings(expectedThreshold)).build();
    final String mapping = createKnnIndexMapping(FIELD_NAME, KNNEngine.getMaxDimensionByEngine(KNNEngine.DEFAULT));
    createKnnIndex(indexName, indexSettings, mapping);

    final String actualThreshold = getIndexSettingByName(indexName, KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD);
    assertNotNull("build_vector_data_structure_threshold index setting is not found", actualThreshold);
    assertEquals("incorrect setting for build_vector_data_structure_threshold", expectedThreshold, Integer.valueOf(actualThreshold));

    deleteKNNIndex(indexName);
}

/**
 * Creates an index without the threshold setting and checks that reading the setting by name
 * returns nothing.
 */
public void testKNNIndex_whenBuildThresholdIsNotProvided_thenShouldNotReturnSetting() throws Exception {
    final String indexName = "test-index-with-build-graph-settings";
    final String mapping = createKnnIndexMapping(FIELD_NAME, KNNEngine.getMaxDimensionByEngine(KNNEngine.DEFAULT));
    createKnnIndex(indexName, mapping);

    final String actualThreshold = getIndexSettingByName(indexName, KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD);
    assertNull("build_vector_data_structure_threshold index setting should not be added in index setting", actualThreshold);

    deleteKNNIndex(indexName);
}

/**
 * Creates an index without the threshold setting and checks that the lookup variant called with
 * {@code true} as its third argument returns the default threshold.
 */
public void testKNNIndex_whenGetIndexSettingWithDefaultIsCalled_thenReturnDefaultBuildGraphThresholdValue() throws Exception {
    final String indexName = "test-index-with-build-vector-graph-settings";
    final String mapping = createKnnIndexMapping(FIELD_NAME, KNNEngine.getMaxDimensionByEngine(KNNEngine.DEFAULT));
    createKnnIndex(indexName, mapping);

    final String actualThreshold = getIndexSettingByName(indexName, KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD, true);
    assertNotNull("build_vector_data_structure index setting is not found", actualThreshold);
    assertEquals(
        "incorrect default setting for build_vector_data_structure_threshold",
        KNNSettings.INDEX_KNN_DEFAULT_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD,
        Integer.valueOf(actualThreshold)
    );

    deleteKNNIndex(indexName);
}

/*
 End-to-end scenario:
 1. Create an index with build_vector_data_structure_threshold = -1 holding one nmslib and one faiss field.
 2. Index the test documents and run a knn search on both fields; expect zero hits.
 3. Update the threshold to 0, force merge down to one segment, search again and expect k hits per query.
 */
public void testKNNIndex_whenBuildVectorGraphThresholdIsProvidedEndToEnd_thenBuildGraphBasedOnSetting() throws Exception {
    final String indexName = "test-index-1";
    final String nmslibField = "test-field-1";
    final String faissField = "test-field-2";

    final Integer dimension = testData.indexData.vectors[0].length;
    final Settings indexSettings = buildKNNIndexSettings(-1);

    // Mapping with two knn_vector fields: the first on nmslib, the second on faiss
    final XContentBuilder mappingBuilder = XContentFactory.jsonBuilder()
        .startObject()
        .startObject("properties")
        .startObject(nmslibField)
        .field("type", "knn_vector")
        .field("dimension", dimension)
        .startObject(KNNConstants.KNN_METHOD)
        .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW)
        .field(KNNConstants.KNN_ENGINE, KNNEngine.NMSLIB.getName())
        .startObject(KNNConstants.PARAMETERS)
        .endObject()
        .endObject()
        .endObject()
        .startObject(faissField)
        .field("type", "knn_vector")
        .field("dimension", dimension)
        .startObject(KNNConstants.KNN_METHOD)
        .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW)
        .field(KNNConstants.KNN_ENGINE, KNNEngine.FAISS.getName())
        .startObject(KNNConstants.PARAMETERS)
        .endObject()
        .endObject()
        .endObject()
        .endObject()
        .endObject();

    createKnnIndex(indexName, indexSettings, mappingBuilder.toString());

    // Every document carries the same vector in both fields
    final int totalDocs = testData.indexData.docs.length;
    for (int docIdx = 0; docIdx < totalDocs; docIdx++) {
        final float[] vector = testData.indexData.vectors[docIdx];
        addKnnDoc(
            indexName,
            Integer.toString(testData.indexData.docs[docIdx]),
            ImmutableList.of(nmslibField, faissField),
            ImmutableList.of(Floats.asList(vector).toArray(), Floats.asList(vector).toArray())
        );
    }

    refreshAllIndices();
    // The index must hold exactly the documents that were just added
    assertEquals(testData.indexData.docs.length, getDocCount(indexName));

    // With the threshold at -1 both fields are expected to return no neighbors
    final List<KNNResult> nmslibHitsBeforeUpdate = getResults(indexName, nmslibField, testData.queries[0], 1);
    assertEquals("unexpected neighbors are returned", 0, nmslibHitsBeforeUpdate.size());

    final List<KNNResult> faissHitsBeforeUpdate = getResults(indexName, faissField, testData.queries[0], 1);
    assertEquals("unexpected neighbors are returned", 0, faissHitsBeforeUpdate.size());

    // Switch the threshold to 0 and force merge to a single segment
    updateIndexSettings(indexName, Settings.builder().put(KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD, 0));
    forceMergeKnnIndex(indexName, 1);

    final int k = 10;
    for (final float[] query : testData.queries) {
        // nmslib field
        final List<KNNResult> nmslibHits = getResults(indexName, nmslibField, query, k);
        assertEquals(k, nmslibHits.size());
        // faiss field
        final List<KNNResult> faissHits = getResults(indexName, faissField, query, k);
        assertEquals(k, faissHits.size());
    }

    // Clean up
    deleteKNNIndex(indexName);
}

/**
 * Runs a knn query against one field and parses the response into results.
 *
 * @param indexName index to search
 * @param fieldName knn field to query, also used when parsing the response
 * @param vector query vector
 * @param k value passed both to the query builder and to the search call
 * @return the results parsed from the search response body
 */
private List<KNNResult> getResults(final String indexName, final String fieldName, final float[] vector, final int k)
    throws IOException, ParseException {
    final KNNQueryBuilder knnQuery = new KNNQueryBuilder(fieldName, vector, k);
    final Response response = searchKNNIndex(indexName, knnQuery, k);
    return parseSearchResponse(EntityUtils.toString(response.getEntity()), fieldName);
}

}
11 changes: 11 additions & 0 deletions src/testFixtures/java/org/opensearch/knn/KNNRestTestCase.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@
import static org.opensearch.knn.TestUtils.computeGroundTruthValues;

import static org.opensearch.knn.common.KNNConstants.VECTOR_DATA_TYPE_FIELD;
import static org.opensearch.knn.index.KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD;
import static org.opensearch.knn.index.KNNSettings.KNN_INDEX;
import static org.opensearch.knn.index.SpaceType.L2;
import static org.opensearch.knn.index.memory.NativeMemoryCacheManager.GRAPH_COUNT;
import static org.opensearch.knn.index.engine.KNNEngine.FAISS;
Expand Down Expand Up @@ -762,6 +764,15 @@ protected Settings getKNNSegmentReplicatedIndexSettings() {
.build();
}

/**
 * Builds index settings with one shard, no replicas, knn enabled and the given
 * build_vector_data_structure_threshold.
 *
 * @param buildVectorDatastructureThreshold value stored under the threshold setting key
 * @return the assembled settings
 */
protected Settings buildKNNIndexSettings(int buildVectorDatastructureThreshold) {
    final Settings.Builder settingsBuilder = Settings.builder();
    settingsBuilder.put("number_of_shards", 1);
    settingsBuilder.put("number_of_replicas", 0);
    settingsBuilder.put(KNN_INDEX, true);
    settingsBuilder.put(INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD, buildVectorDatastructureThreshold);
    return settingsBuilder.build();
}

@SneakyThrows
protected int getDataNodeCount() {
Request request = new Request("GET", "_nodes/stats?filter_path=nodes.*.roles");
Expand Down

0 comments on commit 25df982

Please sign in to comment.