-
Notifications
You must be signed in to change notification settings - Fork 128
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Support for Lucene Byte Sized Vector (#971)
* Add Indexing Support for Lucene Byte Sized Vector (#937) * Add Indexing Support for Lucene Byte Sized Vector Signed-off-by: Naveen Tatikonda <[email protected]> * Add tests for Indexing Signed-off-by: Naveen Tatikonda <[email protected]> * Add CHANGELOG Signed-off-by: Naveen Tatikonda <[email protected]> * Address Review Comments Signed-off-by: Naveen Tatikonda <[email protected]> --------- Signed-off-by: Naveen Tatikonda <[email protected]> * Add Querying Support to Lucene Byte Sized Vector (#956) * Add Querying Support to Lucene Byte Sized Vector Signed-off-by: Naveen Tatikonda <[email protected]> * Add CHANGELOG Signed-off-by: Naveen Tatikonda <[email protected]> * Address Review Comments Signed-off-by: Naveen Tatikonda <[email protected]> --------- Signed-off-by: Naveen Tatikonda <[email protected]> * Add DocValues Support for Lucene Byte Sized Vector (#953) Signed-off-by: Naveen Tatikonda <[email protected]> * Update Release Notes Signed-off-by: Naveen Tatikonda <[email protected]> --------- Signed-off-by: Naveen Tatikonda <[email protected]>
- Loading branch information
1 parent
3b318ce
commit bf04854
Showing
27 changed files
with
1,598 additions
and
189 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
120 changes: 120 additions & 0 deletions
120
src/main/java/org/opensearch/knn/index/VectorDataType.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.knn.index; | ||
|
||
import lombok.AllArgsConstructor; | ||
import lombok.Getter; | ||
import org.apache.lucene.document.FieldType; | ||
import org.apache.lucene.document.KnnByteVectorField; | ||
import org.apache.lucene.document.KnnVectorField; | ||
import org.apache.lucene.index.VectorSimilarityFunction; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.opensearch.knn.index.codec.util.KNNVectorSerializer; | ||
import org.opensearch.knn.index.codec.util.KNNVectorSerializerFactory; | ||
|
||
import java.io.ByteArrayInputStream; | ||
import java.util.Arrays; | ||
import java.util.Locale; | ||
import java.util.Objects; | ||
import java.util.stream.Collectors; | ||
|
||
import static org.opensearch.knn.common.KNNConstants.VECTOR_DATA_TYPE_FIELD; | ||
|
||
/** | ||
* Enum contains data_type of vectors and right now only supported for lucene engine in k-NN plugin. | ||
* We have two vector data_types, one is float (default) and the other one is byte. | ||
*/ | ||
@AllArgsConstructor | ||
public enum VectorDataType { | ||
BYTE("byte") { | ||
|
||
@Override | ||
public FieldType createKnnVectorFieldType(int dimension, VectorSimilarityFunction vectorSimilarityFunction) { | ||
return KnnByteVectorField.createFieldType(dimension, vectorSimilarityFunction); | ||
} | ||
|
||
@Override | ||
public float[] getVectorFromDocValues(BytesRef binaryValue) { | ||
float[] vector = new float[binaryValue.length]; | ||
int i = 0; | ||
int j = binaryValue.offset; | ||
|
||
while (i < binaryValue.length) { | ||
vector[i++] = binaryValue.bytes[j++]; | ||
} | ||
return vector; | ||
} | ||
}, | ||
FLOAT("float") { | ||
|
||
@Override | ||
public FieldType createKnnVectorFieldType(int dimension, VectorSimilarityFunction vectorSimilarityFunction) { | ||
return KnnVectorField.createFieldType(dimension, vectorSimilarityFunction); | ||
} | ||
|
||
@Override | ||
public float[] getVectorFromDocValues(BytesRef binaryValue) { | ||
ByteArrayInputStream byteStream = new ByteArrayInputStream(binaryValue.bytes, binaryValue.offset, binaryValue.length); | ||
final KNNVectorSerializer vectorSerializer = KNNVectorSerializerFactory.getSerializerByStreamContent(byteStream); | ||
return vectorSerializer.byteToFloatArray(byteStream); | ||
} | ||
|
||
}; | ||
|
||
public static final String SUPPORTED_VECTOR_DATA_TYPES = Arrays.stream(VectorDataType.values()) | ||
.map(VectorDataType::getValue) | ||
.collect(Collectors.joining(",")); | ||
@Getter | ||
private final String value; | ||
|
||
/** | ||
* Creates a KnnVectorFieldType based on the VectorDataType using the provided dimension and | ||
* VectorSimilarityFunction. | ||
* | ||
* @param dimension Dimension of the vector | ||
* @param vectorSimilarityFunction VectorSimilarityFunction for a given spaceType | ||
* @return FieldType | ||
*/ | ||
public abstract FieldType createKnnVectorFieldType(int dimension, VectorSimilarityFunction vectorSimilarityFunction); | ||
|
||
/** | ||
* Deserializes float vector from doc values binary value. | ||
* | ||
* @param binaryValue Binary Value of DocValues | ||
* @return float vector deserialized from binary value | ||
*/ | ||
public abstract float[] getVectorFromDocValues(BytesRef binaryValue); | ||
|
||
/** | ||
* Validates if given VectorDataType is in the list of supported data types. | ||
* @param vectorDataType VectorDataType | ||
* @return the same VectorDataType if it is in the supported values | ||
* throws Exception if an invalid value is provided. | ||
*/ | ||
public static VectorDataType get(String vectorDataType) { | ||
Objects.requireNonNull( | ||
vectorDataType, | ||
String.format( | ||
Locale.ROOT, | ||
"[%s] should not be null. Supported types are [%s]", | ||
VECTOR_DATA_TYPE_FIELD, | ||
SUPPORTED_VECTOR_DATA_TYPES | ||
) | ||
); | ||
try { | ||
return VectorDataType.valueOf(vectorDataType.toUpperCase(Locale.ROOT)); | ||
} catch (Exception e) { | ||
throw new IllegalArgumentException( | ||
String.format( | ||
Locale.ROOT, | ||
"Invalid value provided for [%s] field. Supported values are [%s]", | ||
VECTOR_DATA_TYPE_FIELD, | ||
SUPPORTED_VECTOR_DATA_TYPES | ||
) | ||
); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.