From 288491b5a50fac8dc8bd1451db72e8fb65df3078 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 4 Dec 2023 10:16:55 +0000 Subject: [PATCH] Add SQfp16 support to ARM Architecture Signed-off-by: Naveen Tatikonda Signed-off-by: EC2 Default User --- .../faiss-hnsw-sqfp16/index.json | 4 +- .../faiss-hnsw-sqfp16/test.yml | 11 +- faiss-patches/CMakeLists.patch | 12 + faiss-patches/ScalarQuantizer.patch | 740 ++++++++++++++++++ faiss-patches/fp16-fp16.h | 33 + faiss-patches/fp16.patch | 13 + jni/src/faiss_wrapper.cpp | 2 +- jni/tests/faiss_wrapper_test.cpp | 84 ++ .../org/opensearch/knn/index/util/Faiss.java | 129 ++- 9 files changed, 955 insertions(+), 73 deletions(-) create mode 100644 faiss-patches/CMakeLists.patch create mode 100644 faiss-patches/ScalarQuantizer.patch create mode 100644 faiss-patches/fp16-fp16.h create mode 100644 faiss-patches/fp16.patch diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/index.json b/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/index.json index b7ecbddfb..8143b6f17 100644 --- a/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/index.json +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/index.json @@ -3,7 +3,7 @@ "index": { "knn": true, "number_of_shards": 24, - "number_of_replicas": 1 + "number_of_replicas": 0 } }, "mappings": { @@ -24,4 +24,4 @@ } } } -} \ No newline at end of file +} diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/test.yml b/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/test.yml index 50460f60b..62e92c54a 100644 --- a/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/test.yml +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/test.yml @@ -2,19 +2,20 @@ endpoint: "localhost" test_name: "Faiss HNSW SQfp16 Test" test_id: "Faiss HNSW SQfp16 Test" num_runs: 10 -show_runs: false +timeout: 600 +show_runs: true steps: - name: delete_index index_name: target_index - name: create_index index_name: target_index - index_spec: /home/ec2-user/[PATH]/index.json + index_spec: /home/ec2-user/k-NN/benchmarks/perf-tool/release-configs/faiss-hnsw-sqfp16/index.json - name: ingest index_name: target_index field_name: target_field bulk_size: 500 dataset_format: hdf5 - dataset_path: [DATASET_PATH]/sift-128-euclidean.hdf5 + dataset_path: /home/ec2-user/data/sift-128-euclidean.hdf5 - name: refresh_index index_name: target_index - name: force_merge @@ -29,6 +30,6 @@ steps: index_name: target_index field_name: target_field dataset_format: hdf5 - dataset_path: [DATASET_PATH]/sift-128-euclidean.hdf5 + dataset_path: /home/ec2-user/data/sift-128-euclidean.hdf5 neighbors_format: hdf5 - neighbors_path: [DATASET_PATH]/sift-128-euclidean.hdf5 + neighbors_path: /home/ec2-user/data/sift-128-euclidean.hdf5 diff --git a/faiss-patches/CMakeLists.patch b/faiss-patches/CMakeLists.patch new file mode 100644 index 000000000..354f7198e --- /dev/null +++ b/faiss-patches/CMakeLists.patch @@ -0,0 +1,12 @@ +diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt +index 5e635a53..46434056 100644 +--- a/faiss/CMakeLists.txt ++++ b/faiss/CMakeLists.txt +@@ -184,6 +184,7 @@ set(FAISS_HEADERS + utils/extra_distances-inl.h + utils/extra_distances.h + utils/fp16-fp16c.h ++ utils/fp16-fp16.h + utils/fp16-inl.h + utils/fp16.h + utils/hamming-inl.h diff --git a/faiss-patches/ScalarQuantizer.patch b/faiss-patches/ScalarQuantizer.patch new file mode 100644 index 000000000..d9b5438ed --- /dev/null +++ b/faiss-patches/ScalarQuantizer.patch @@ -0,0 +1,740 @@ +diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp +index a3cf4c74..28d63851 100644 +--- a/faiss/impl/ScalarQuantizer.cpp ++++ b/faiss/impl/ScalarQuantizer.cpp +@@ -26,6 +26,9 @@ + #include + #include + ++#include ++#include ++ + namespace faiss { + + /******************************************************************* +@@ -167,7 +170,6 @@ struct Codec6bit { + } + + #ifdef __AVX2__ +- + /* Load 6 bytes that represent 8 6-bit values, return them as a + * 8*32 bit vector register */ + static __m256i load6(const uint16_t* code16) { +@@ -249,7 +251,6 @@ struct QuantizerTemplate : ScalarQuantizer::SQuantizer { + }; + + #ifdef __AVX2__ +- + template + struct QuantizerTemplate : QuantizerTemplate { + QuantizerTemplate(size_t d, const std::vector& trained) +@@ -333,8 +334,11 @@ struct QuantizerFP16<1> : ScalarQuantizer::SQuantizer { + QuantizerFP16(size_t d, const std::vector& /* unused */) : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { +- for (size_t i = 0; i < d; i++) { ++ //std::cout << "Naveen: Inside <1> encode_vector" << std::flush; ++ //std::cout << "Naveen: Encoded vector values: " << std::flush; ++ for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_fp16(x[i]); ++ // std::cout << encode_fp16(x[i]) << ", " << std::flush; + } + } + +@@ -364,6 +368,76 @@ struct QuantizerFP16<8> : QuantizerFP16<1> { + + #endif + ++/* ++#ifdef __aarch64__ ++ ++template <> ++struct QuantizerFP16<8> : QuantizerFP16<1> { ++ QuantizerFP16(size_t d, const std::vector& trained) ++ : QuantizerFP16<1>(d, trained) {} ++ ++ float32x4_t reconstruct_8_components(const uint8_t* code, int i) const { ++ //std::cout << "Naveen: Inside <8> reconstruct_component" << std::flush; ++ //int32x4_t codei = vld1q_s32((const int32_t*)(code + 2 * i)); // TODO: (const int32_t*) or (const int32x4_t*) ++ //return vreinterpretq_f32_s32(codei); ++ //int16x4_t codei = vld1_s16((const int16_t*)(code + 2 * i)); ++ //return vreinterpretq_f32_s32(vmovl_s16(codei)); ++ uint16x4_t codei = vld1_u16((const uint16_t*)(code + 2 * i)); ++ uint16_t(*ArrPtr)[4] = (uint16_t(*)[4])&codei; ++ std::cout << " Naveen: reconstruct_8_component code values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr)+i) << ", " << std::flush; ++ ++ ++ //float16x4_t temp = vreinterpret_f16_u16(codei); ++ //float(*ArrPtr1)[4] = (float(*)[4])&temp; ++ //std::cout << " Naveen: reconstruct_8_component temp values: " << std::flush; ++ //for (int32_t i = 0; i < 4; i++) ++ //std::cout << *((*ArrPtr1)+i) << ", " << std::flush; ++ return vcvt_f32_f16(vreinterpret_f16_u16(codei)); ++ ++ //return vreinterpretq_f32_u32(vmovl_u16(codei)); ++ } ++}; ++#endif */ ++ ++#ifdef __aarch64__ ++ ++template <> ++struct QuantizerFP16<8> : QuantizerFP16<1> { ++ QuantizerFP16(size_t d, const std::vector& trained) ++ : QuantizerFP16<1>(d, trained) {} ++ ++ float32x4x2_t reconstruct_8_components(const uint8_t* code, int i) const { ++ uint16x4x2_t codei = vld2_u16((const uint16_t*)(code + 2 * i)); ++ //return vzipq_f32(vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))); ++ ++ /*float32x4_t d1 = vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])); ++ float32x4_t d2 = vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1])); ++ float(*ArrPtr1)[4] = (float(*)[4])&d1; ++ std::cout << " Naveen: reconstruct_8_component d1 values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr1)+i) << ", " << std::flush; ++ ++ float(*ArrPtr2)[4] = (float(*)[4])&d2; ++ std::cout << " Naveen: reconstruct_8_component d2 values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr2)+i) << ", " << std::flush; ++ */ ++ ++ /*float32x4x2_t dummy = vzipq_f32(vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))); ++ float(*ArrPtr1)[8] = (float(*)[8])&dummy; ++ std::cout << " Naveen: reconstruct_8_component dummy values: " << std::flush; ++ for (int32_t i = 0; i < 8; i++) ++ std::cout << *((*ArrPtr1)+i) << ", " << std::flush; ++ */ ++ //return vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1])); ++ return vzipq_f32(vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))); ++ } ++}; ++#endif ++ ++ + /******************************************************************* + * 8bit_direct quantizer + *******************************************************************/ +@@ -411,31 +485,33 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> { + + #endif + +-template ++template + ScalarQuantizer::SQuantizer* select_quantizer_1( + QuantizerType qtype, + size_t d, + const std::vector& trained) { + switch (qtype) { + case ScalarQuantizer::QT_8bit: +- return new QuantizerTemplate( ++ return new QuantizerTemplate( + d, trained); + case ScalarQuantizer::QT_6bit: +- return new QuantizerTemplate( ++ return new QuantizerTemplate( + d, trained); + case ScalarQuantizer::QT_4bit: +- return new QuantizerTemplate( ++ return new QuantizerTemplate( + d, trained); + case ScalarQuantizer::QT_8bit_uniform: +- return new QuantizerTemplate( +- d, trained); ++ return new QuantizerTemplate( ++ d, trained); + case ScalarQuantizer::QT_4bit_uniform: +- return new QuantizerTemplate( ++ return new QuantizerTemplate( + d, trained); + case ScalarQuantizer::QT_fp16: +- return new QuantizerFP16(d, trained); ++ //std::cout << "Naveen: Inside QT_fp16 switch case" << std::flush; ++ return new QuantizerFP16(d, trained); + case ScalarQuantizer::QT_8bit_direct: +- return new Quantizer8bitDirect(d, trained); ++ return new Quantizer8bitDirect(d, trained); ++ + } + FAISS_THROW_MSG("unknown qtype"); + } +@@ -648,7 +724,11 @@ struct SimilarityL2<1> { + } + + float result() { +- return accu; ++ std::cout << "Naveen: Result is: " << accu << std::flush; ++ /* std::ofstream MyFile; ++ MyFile.open("/home/ec2-user/k-NN/compute_distance.txt", std::ios_base::app); ++ MyFile << accu << "\n";*/ ++ return accu; + } + }; + +@@ -691,6 +771,205 @@ struct SimilarityL2<8> { + + #endif + ++/* ++#ifdef __aarch64__ ++template <> ++struct SimilarityL2<8> { ++ static constexpr int simdwidth = 8; ++ static constexpr MetricType metric_type = METRIC_L2; ++ ++ const float *y, *yi; ++ ++ explicit SimilarityL2(const float* y) : y(y) {} ++ float32x4_t accu8; ++ void begin_8() { ++ accu8 = vdupq_n_f32(0.0f); ++ yi = y; ++ } ++ ++ void add_8_components(float32x4_t x) { ++ //std::cout << "Naveen: Inside L2<8> add_8_components" << std::flush; ++ float32x4_t yiv = vld1q_f32(yi); ++ yi += 4; ++ float32x4_t tmp = vsubq_f32(yiv, x); ++ ++ float(*ArrPtr1)[4] = (float(*)[4])&tmp; ++ std::cout << " Naveen: tmp values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr1)+i) << ", " << std::flush; ++ ++ float(*ArrPtr2)[4] = (float(*)[4])&x; ++ std::cout << " Naveen: x values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr2)+i) << ", " << std::flush; ++ ++ accu8 = vaddq_f32(accu8, vmulq_f32(tmp, tmp)); ++ ++ float(*ArrPtr)[4] = (float(*)[4])&accu8; ++ std::cout << " Naveen: Accumulator values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr)+i) << ", " << std::flush; ++ } ++ ++ void add_8_components_2(float32x4_t x, float32x4_t y) { ++ //std::cout << "Naveen: Inside L2<8> add_8_components_2" << std::flush; ++ float32x4_t tmp = vsubq_f32(y, x); ++ accu8 = vaddq_f32(accu8, vmulq_f32(tmp, tmp)); ++ } ++ ++ float result_8() { ++ //std::cout << "Naveen: Inside L2<8> result_8" << std::flush; ++ float32x4_t sum = vpaddq_f32(accu8, accu8); ++ float(*ArrPtr)[4] = (float(*)[4])∑ ++ std::cout << " Naveen: sum values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr)+i) << ", " << std::flush; ++ ++ ++ float32x4_t sum2 = vpaddq_f32(sum, sum); ++ float(*ArrPtr1)[4] = (float(*)[4])&sum2; ++ std::cout << " Naveen: sum2 values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr1)+i) << ", " << std::flush; ++ ++ //return vgetq_lane_f32(sum2, 3) + vgetq_lane_f32(sum2, 1); ++ float f = vgetq_lane_f32(sum2, 1); ++ std::cout << " Naveen: result_8 value: " << f << std::flush; ++ //std::ofstream MyFile; ++ //MyFile.open("/home/ec2-user/k-NN/compute_distance.txt", std::ios_base::app); ++ //MyFile << f << "\n"; ++ return f; ++ ++ } ++}; ++ ++#endif ++*/ ++ ++#ifdef __aarch64__ ++template <> ++struct SimilarityL2<8> { ++ static constexpr int simdwidth = 8; ++ static constexpr MetricType metric_type = METRIC_L2; ++ ++ const float *y, *yi; ++ explicit SimilarityL2(const float* y) : y(y) {} ++ float32x4x2_t accu8; ++ ++ void begin_8() { ++ accu8 = vzipq_f32(vdupq_n_f32(0.0f),vdupq_n_f32(0.0f)); ++ yi = y; ++ } ++ ++void add_8_components(float32x4x2_t x) { ++ //std::cout << "Naveen: Inside L2<8> add_8_components" << std::flush; ++ float32x4x2_t yiv = vld1q_f32_x2(yi); ++ ++ /*float(*ArrPtr)[8] = (float(*)[8])&yiv; ++ std::cout << " Naveen: yiv values: " << std::flush; ++ for (int32_t i = 0; i < 8; i++) ++ std::cout << *((*ArrPtr)+i) << ", " << std::flush; ++ ++ float(*ArrPtr2)[8] = (float(*)[8])&x; ++ std::cout << " Naveen: x values: " << std::flush; ++ for (int32_t i = 0; i < 8; i++) ++ std::cout << *((*ArrPtr2)+i) << ", " << std::flush; ++ */ ++ yi += 8; ++ ++ float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]); ++ float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]); ++ ++ float32x4_t accu8_0 = vaddq_f32(accu8.val[0], vmulq_f32(sub0,sub0)); ++ float32x4_t accu8_1 = vaddq_f32(accu8.val[1], vmulq_f32(sub1,sub1)); ++ float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); ++ accu8 = vuzpq_f32(accu8_temp.val[0],accu8_temp.val[1]); ++ ++ /*float(*ArrPtr3)[4] = (float(*)[4])&accu8_0; ++ std::cout << " Naveen: accu8_0 values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr3)+i) << ", " << std::flush; ++ */ ++ ++ //float32x4x2_t tmp = vzipq_f32(vsubq_f32(yiv.val[0], x.val[0]), vsubq_f32(yiv.val[1], x.val[1])); ++ //float32x4x2_t tmp = vtrnq_f32(sub0,sub1); ++ /*float(*ArrPtr1)[8] = (float(*)[8])&accu8; ++ std::cout << " Naveen: accu8 values: " << std::flush; ++ for (int32_t i = 0; i < 8; i++) ++ std::cout << *((*ArrPtr1)+i) << ", " << std::flush; ++ */ ++ //accu8 = tmp; ++} ++ ++void add_8_components_2(float32x4x2_t x, float32x4x2_t y) { ++ //std::cout << "Naveen: Inside L2<8> add_8_components_2" << std::flush; ++ //float32x4x2_t tmp = vzipq_f32(vsubq_f32(y.val[0], x.val[0]), vsubq_f32(y.val[1], x.val[1])); ++ float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]); ++ float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]); ++ ++ float32x4_t accu8_0 = vaddq_f32(accu8.val[0], vmulq_f32(sub0,sub0)); ++ float32x4_t accu8_1 = vaddq_f32(accu8.val[1], vmulq_f32(sub1,sub1)); ++ float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); ++ accu8 = vuzpq_f32(accu8_temp.val[0],accu8_temp.val[1]); ++ ++} ++ ++float result_8() { ++ //std::cout << "Naveen: Inside L2<8> result_8" << std::flush; ++ /* ++ float32x4x2_t sum_tmp = vzipq_f32(vpaddq_f32(accu8.val[0], accu8.val[0]), vpaddq_f32(accu8.val[1], accu8.val[1])); ++ float32x4x2_t sum = vuzpq_f32(sum_tmp.val[0], sum_tmp.val[1]); ++ float32x4x2_t sum2_tmp = vzipq_f32(vpaddq_f32(sum.val[0], sum.val[0]), vpaddq_f32(sum.val[1], sum.val[1])); ++ float32x4x2_t sum2 = vuzpq_f32(sum2_tmp.val[0], sum2_tmp.val[1]); ++ return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0); ++ */ ++ ++ float32x4_t sum_0 = vpaddq_f32(accu8.val[0], accu8.val[0]); ++ float32x4_t sum_1 = vpaddq_f32(accu8.val[1], accu8.val[1]); ++ float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0); ++ float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1); ++ return vgetq_lane_f32(sum2_0, 0) + vgetq_lane_f32(sum2_1, 0); ++ //std::cout << " Naveen: Result_8 value: " << f << std::flush; ++ //return f; ++ ++ /* ++ float32x4_t sum_tmp_0 = vpaddq_f32(accu8.val[0], accu8.val[0]); ++ float32x4_t sum_tmp_1 = vpaddq_f32(accu8.val[1], accu8.val[1]); ++ ++ float(*ArrPtr3)[4] = (float(*)[4])&sum_tmp_0; ++ std::cout << " Naveen: sum_tmp_0 values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr3)+i) << ", " << std::flush; ++ ++ float(*ArrPtr4)[4] = (float(*)[4])&sum_tmp_1; ++ std::cout << " Naveen: sum_tmp_1 values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr4)+i) << ", " << std::flush; ++ ++ float(*ArrPtr)[8] = (float(*)[8])&accu8; ++ std::cout << " Naveen: accu8 values: " << std::flush; ++ for (int32_t i = 0; i < 8; i++) ++ std::cout << *((*ArrPtr)+i) << ", " << std::flush; ++ ++ float(*ArrPtr1)[8] = (float(*)[8])∑ ++ std::cout << " Naveen: sum values: " << std::flush; ++ for (int32_t i = 0; i < 8; i++) ++ std::cout << *((*ArrPtr1)+i) << ", " << std::flush; ++ ++ float(*ArrPtr2)[8] = (float(*)[8])&sum2; ++ std::cout << " Naveen: sum2 values: " << std::flush; ++ for (int32_t i = 0; i < 8; i++) ++ std::cout << *((*ArrPtr2)+i) << ", " << std::flush; ++ ++ return 0.0f; ++ */ ++} ++}; ++#endif ++ ++ ++ ++ + template + struct SimilarityIP {}; + +@@ -762,6 +1041,60 @@ struct SimilarityIP<8> { + }; + #endif + ++#ifdef __aarch64__ ++ ++template <> ++struct SimilarityIP<8> { ++ static constexpr int simdwidth = 8; ++ static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; ++ ++ const float *y, *yi; ++ ++ explicit SimilarityIP(const float* y) : y(y) {} ++ float32x4x2_t accu8; ++ ++ void begin_8() { ++ accu8 = vzipq_f32(vdupq_n_f32(0.0f),vdupq_n_f32(0.0f)); ++ yi = y; ++ } ++ ++ void add_8_components(float32x4x2_t x) { ++ //std::cout << "Naveen: Inside IP<8> add_8_components" << std::flush; ++ float32x4x2_t yiv = vld1q_f32_x2(yi); ++ yi += 8; ++ ++ float32x4_t accu8_0 = vaddq_f32(accu8.val[0], vmulq_f32(yiv.val[0], x.val[0])); ++ float32x4_t accu8_1 = vaddq_f32(accu8.val[1], vmulq_f32(yiv.val[1], x.val[1])); ++ float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); ++ accu8 = vuzpq_f32(accu8_temp.val[0],accu8_temp.val[1]); ++ ++ /*float(*ArrPtr1)[8] = (float(*)[8])&accu8; ++ std::cout << " Naveen: accu8 values: " << std::flush; ++ for (int32_t i = 0; i < 8; i++) ++ std::cout << *((*ArrPtr1)+i) << ", " << std::flush; ++ */ ++ } ++ ++ void add_8_components_2(float32x4x2_t x1, float32x4x2_t x2) { ++ //std::cout << "Naveen: Inside IP<8> add_8_components_2" << std::flush; ++ float32x4_t accu8_0 = vaddq_f32(accu8.val[0], vmulq_f32(x1.val[0], x2.val[0])); ++ float32x4_t accu8_1 = vaddq_f32(accu8.val[1], vmulq_f32(x1.val[1], x2.val[1])); ++ float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); ++ accu8 = vuzpq_f32(accu8_temp.val[0],accu8_temp.val[1]); ++ } ++ ++ float result_8() { ++ //std::cout << "Naveen: Inside IP<8> result_8" << std::flush; ++ float32x4x2_t sum_tmp = vzipq_f32(vpaddq_f32(accu8.val[0], accu8.val[0]), vpaddq_f32(accu8.val[1], accu8.val[1])); ++ float32x4x2_t sum = vuzpq_f32(sum_tmp.val[0], sum_tmp.val[1]); ++ float32x4x2_t sum2_tmp = vzipq_f32(vpaddq_f32(sum.val[0], sum.val[0]), vpaddq_f32(sum.val[1], sum.val[1])); ++ float32x4x2_t sum2 = vuzpq_f32(sum2_tmp.val[0], sum2_tmp.val[1]); ++ return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0); ++ } ++}; ++#endif ++ ++ + /******************************************************************* + * DistanceComputer: combines a similarity and a quantizer to do + * code-to-vector or code-to-code comparisons +@@ -858,12 +1191,121 @@ struct DCTemplate : SQDistanceComputer { + } + + float query_to_code(const uint8_t* code) const final { +- return compute_distance(q, code); ++ return compute_distance(q, code); + } + }; + + #endif + ++/* ++#ifdef __aarch64__ ++ ++template ++struct DCTemplate : SQDistanceComputer { ++ using Sim = Similarity; ++ Quantizer quant; ++ ++ DCTemplate(size_t d, const std::vector& trained) ++ : quant(d, trained) {} ++ ++ float compute_distance(const float* x, const uint8_t* code) const { ++ //std::cout << "Naveen: Inside compute_distance" << std::flush; ++ Similarity sim(x); ++ sim.begin_8(); ++ for (size_t i = 0; i < quant.d; i += 4) { ++ float32x4_t xi = quant.reconstruct_8_components(code, i); ++ float(*ArrPtr)[4] = (float(*)[4])ξ ++ std::cout << " Naveen: Extracted reconstruct_8_component values: " << std::flush; ++ for (int32_t i = 0; i < 4; i++) ++ std::cout << *((*ArrPtr)+i) << ", " << std::flush; ++ sim.add_8_components(xi); ++ } ++ return sim.result_8(); ++ } ++ ++ float compute_code_distance(const uint8_t* code1, const uint8_t* code2) ++ const { ++ //std::cout << "Naveen: Inside compute_code_distance" << std::flush; ++ Similarity sim(nullptr); ++ sim.begin_8(); ++ for (size_t i = 0; i < quant.d; i += 8) { ++ float32x4_t x1 = quant.reconstruct_8_components(code1, i); ++ float32x4_t x2 = quant.reconstruct_8_components(code2, i); ++ sim.add_8_components_2(x1, x2); ++ } ++ return sim.result_8(); ++ } ++ ++ void set_query(const float* x) final { ++ q = x; ++ } ++ ++ float symmetric_dis(idx_t i, idx_t j) override { ++ //std::cout << "Naveen: Inside symmetric_dis" << std::flush; ++ return compute_code_distance( ++ codes + i * code_size, codes + j * code_size); ++ } ++ ++ float query_to_code(const uint8_t* code) const final { ++ //std::cout << "Naveen: Inside query_to_code" << std::flush; ++ return compute_distance(q, code); ++ } ++}; ++ ++#endif ++*/ ++ ++#ifdef __aarch64__ ++ ++template ++struct DCTemplate : SQDistanceComputer { ++ using Sim = Similarity; ++ ++ Quantizer quant; ++ ++ DCTemplate(size_t d, const std::vector& trained) ++ : quant(d, trained) {} ++ float compute_distance(const float* x, const uint8_t* code) const { ++ //std::cout << "Naveen: Inside compute_distance" << std::flush; ++ Similarity sim(x); ++ sim.begin_8(); ++ for (size_t i = 0; i < quant.d; i += 8) { ++ float32x4x2_t xi = quant.reconstruct_8_components(code, i); ++ sim.add_8_components(xi); ++ } ++ return sim.result_8(); ++ } ++ ++ float compute_code_distance(const uint8_t* code1, const uint8_t* code2) ++ const { ++ //std::cout << "Naveen: Inside compute_code_distance" << std::flush; ++ Similarity sim(nullptr); ++ sim.begin_8(); ++ for (size_t i = 0; i < quant.d; i += 8) { ++ float32x4x2_t x1 = quant.reconstruct_8_components(code1, i); ++ float32x4x2_t x2 = quant.reconstruct_8_components(code2, i); ++ sim.add_8_components_2(x1, x2); ++ } ++ return sim.result_8(); ++ } ++ void set_query(const float* x) final { ++ q = x; ++ } ++ ++ float symmetric_dis(idx_t i, idx_t j) override { ++ //std::cout << "Naveen: Inside symmetric_dis" << std::flush; ++ return compute_code_distance( ++ codes + i * code_size, codes + j * code_size); ++ } ++ float query_to_code(const uint8_t* code) const final { ++ //std::cout << "Naveen: Inside query_to_code" << std::flush; ++ return compute_distance(q, code); ++ } ++}; ++#endif ++ ++ ++ + /******************************************************************* + * DistanceComputerByte: computes distances in the integer domain + *******************************************************************/ +@@ -985,55 +1427,58 @@ struct DistanceComputerByte : SQDistanceComputer { + * specialization + *******************************************************************/ + +-template ++template + SQDistanceComputer* select_distance_computer( + QuantizerType qtype, + size_t d, + const std::vector& trained) { + constexpr int SIMDWIDTH = Sim::simdwidth; ++ constexpr int SIMDWIDTH_DEFAULT = Sim_default::simdwidth; + switch (qtype) { + case ScalarQuantizer::QT_8bit_uniform: + return new DCTemplate< +- QuantizerTemplate, +- Sim, +- SIMDWIDTH>(d, trained); ++ QuantizerTemplate, ++ Sim_default, ++ SIMDWIDTH_DEFAULT>(d, trained); + + case ScalarQuantizer::QT_4bit_uniform: + return new DCTemplate< +- QuantizerTemplate, +- Sim, +- SIMDWIDTH>(d, trained); ++ QuantizerTemplate, ++ Sim_default, ++ SIMDWIDTH_DEFAULT>(d, trained); + + case ScalarQuantizer::QT_8bit: + return new DCTemplate< +- QuantizerTemplate, +- Sim, +- SIMDWIDTH>(d, trained); ++ QuantizerTemplate, ++ Sim_default, ++ SIMDWIDTH_DEFAULT>(d, trained); + + case ScalarQuantizer::QT_6bit: + return new DCTemplate< +- QuantizerTemplate, +- Sim, +- SIMDWIDTH>(d, trained); +- ++ QuantizerTemplate, ++ Sim_default, ++ SIMDWIDTH_DEFAULT>(d, trained); ++ + case ScalarQuantizer::QT_4bit: + return new DCTemplate< +- QuantizerTemplate, +- Sim, +- SIMDWIDTH>(d, trained); ++ QuantizerTemplate, ++ Sim_default, ++ SIMDWIDTH_DEFAULT>(d, trained); + ++ + case ScalarQuantizer::QT_fp16: ++ //std::cout << "Naveen: Inside QT_fp16 switch case 2" << std::flush; + return new DCTemplate, Sim, SIMDWIDTH>( + d, trained); + + case ScalarQuantizer::QT_8bit_direct: + if (d % 16 == 0) { +- return new DistanceComputerByte(d, trained); ++ return new DistanceComputerByte(d, trained); + } else { + return new DCTemplate< +- Quantizer8bitDirect, +- Sim, +- SIMDWIDTH>(d, trained); ++ Quantizer8bitDirect, ++ Sim_default, ++ SIMDWIDTH_DEFAULT>(d, trained); + } + } + FAISS_THROW_MSG("unknown qtype"); +@@ -1142,18 +1587,34 @@ void ScalarQuantizer::train_residual( + } + + ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const { +-#ifdef USE_F16C ++if (d % 8 == 0) { ++ #if defined(USE_F16C) ++ return select_quantizer_1<8,8>(qtype, d, trained); ++ #elif defined(__aarch64__) ++ std::cout << "Naveen: Inside if of select_quantizer 3" << std::flush; ++ return select_quantizer_1<8,1>(qtype, d, trained); ++ #endif ++} else ++{ ++ return select_quantizer_1<1,1>(qtype, d, trained); ++} ++ /* ++#if defined(USE_F16C) || defined(__aarch64__) + if (d % 8 == 0) { ++ //std::cout << "Naveen: Inside if of select_quantizer 3" << std::flush; + return select_quantizer_1<8>(qtype, d, trained); + } else + #endif + { ++ //std::cout << "Naveen: Inside else of select_quantizer 3" << std::flush; + return select_quantizer_1<1>(qtype, d, trained); + } ++ */ + } + + void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) +- const { ++ const { ++ //std::cout << "Naveen: Inside compute_codes 3" << std::flush; + std::unique_ptr squant(select_quantizer()); + + memset(codes, 0, code_size * n); +@@ -1173,9 +1634,37 @@ void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const { + SQDistanceComputer* ScalarQuantizer::get_distance_computer( + MetricType metric) const { + FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); +-#ifdef USE_F16C + if (d % 8 == 0) { +- if (metric == METRIC_L2) { ++ if (metric == METRIC_L2) { ++ #if defined(USE_F16C) ++ return select_distance_computer,SimilarityL2<8>>(qtype, d, trained); ++ #elif defined(__aarch64__) ++ return select_distance_computer,SimilarityL2<1>>(qtype, d, trained); ++ #endif ++ } else { ++ #if defined(USE_F16C) ++ return select_distance_computer,SimilarityIP<8>>(qtype, d, trained); ++ #elif defined(__aarch64__) ++ return select_distance_computer,SimilarityIP<1>>(qtype, d, trained); ++ #endif ++ } ++ } else ++ { ++ if (metric == METRIC_L2) { ++ return select_distance_computer,SimilarityL2<1>>(qtype, d, trained); ++ } else { ++ return select_distance_computer,SimilarityIP<1>>(qtype, d, trained); ++ } ++ } ++} ++ ++ ++ ++/* ++#if defined(USE_F16C) || defined(__aarch64__) ++ if (d % 8 == 0) { ++ //std::cout << "Naveen: Inside if of get_distance_computer3" << std::flush; ++ if (metric == METRIC_L2) { + return select_distance_computer>(qtype, d, trained); + } else { + return select_distance_computer>(qtype, d, trained); +@@ -1183,13 +1672,15 @@ SQDistanceComputer* ScalarQuantizer::get_distance_computer( + } else + #endif + { +- if (metric == METRIC_L2) { ++ //std::cout << "Naveen: Inside else of get_distance_computer3" << std::flush; ++ if (metric == METRIC_L2) { + return select_distance_computer>(qtype, d, trained); + } else { + return select_distance_computer>(qtype, d, trained); + } + } + } ++*/ + + /******************************************************************* + * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object diff --git a/faiss-patches/fp16-fp16.h b/faiss-patches/fp16-fp16.h new file mode 100644 index 000000000..d2988b8a8 --- /dev/null +++ b/faiss-patches/fp16-fp16.h @@ -0,0 +1,33 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +//#include +#include + +namespace faiss { + +inline uint16_t encode_fp16(float x) { + //std::cout << "Naveen: Inside fp16-fp16 encoder" << std::flush; + float32x4_t fx4 = vdupq_n_f32(x); + float16x4_t f16x4 = vcvt_f16_f32(fx4); + uint16x4_t ui16x4 = vreinterpret_u16_f16(f16x4); + return vduph_lane_u16(ui16x4, 3); +} + +inline float decode_fp16(uint16_t x) { + //std::cout << "Naveen: Inside fp16-fp16 decoder" << std::flush; + uint16x4_t ui16x4 = vdup_n_u16(x); + float16x4_t f16x4 = vreinterpret_f16_u16(ui16x4); + float32x4_t fx4 = vcvt_f32_f16(f16x4); + return vdups_laneq_f32(fx4, 3); +} + +} // namespace faiss diff --git a/faiss-patches/fp16.patch b/faiss-patches/fp16.patch new file mode 100644 index 000000000..383ec032a --- /dev/null +++ b/faiss-patches/fp16.patch @@ -0,0 +1,13 @@ +diff --git a/faiss/utils/fp16.h b/faiss/utils/fp16.h +index 90691d8f..f9348580 100644 +--- a/faiss/utils/fp16.h ++++ b/faiss/utils/fp16.h +@@ -13,6 +13,8 @@ + + #if defined(__F16C__) + #include ++#elif defined(__aarch64__) ++#include + #else + #include + #endif diff --git a/jni/src/faiss_wrapper.cpp b/jni/src/faiss_wrapper.cpp index e09edc62c..80e8333ec 100644 --- a/jni/src/faiss_wrapper.cpp +++ b/jni/src/faiss_wrapper.cpp @@ -111,7 +111,7 @@ void knn_jni::faiss_wrapper::CreateIndex(knn_jni::JNIUtilInterface * jniUtil, JN jniUtil->DeleteLocalRef(env, parametersJ); // TODO: Add a condition to set it true for sq encoder based on index description -// indexWriter->is_trained = true; + indexWriter->is_trained = true; // Check that the index does not need to be trained if(!indexWriter->is_trained) { diff --git a/jni/tests/faiss_wrapper_test.cpp b/jni/tests/faiss_wrapper_test.cpp index abe4ecb20..13a63a8e1 100644 --- a/jni/tests/faiss_wrapper_test.cpp +++ b/jni/tests/faiss_wrapper_test.cpp @@ -247,6 +247,90 @@ TEST(FaissInitLibraryTest, BasicAssertions) { knn_jni::faiss_wrapper::InitLibrary(); } +/* +#include "faiss/impl/ScalarQuantizer.h" + +TEST(FaissSQComputeDistanceTest, BasicAssertions) { + + + faiss::idx_t numIds = 2; + std::vector> vectors; + int dim = 8; + for (int64_t i = 0; i < numIds; ++i) { + // ids.push_back(i); + std::cout << "\nVector " << i << ": " << std::flush; + std::vector vect; + vect.reserve(dim); + for (int j = 0; j < dim; ++j) { + float f = test_util::RandomFloat(-500.0, 500.0); + std::cout << f << ", " << std::flush; + vect.push_back(f); + } + vectors.push_back(vect); + } + uint8_t codes; + faiss::ScalarQuantizer * sq = new faiss::ScalarQuantizer(dim, faiss::ScalarQuantizer::QT_fp16); + sq->compute_codes(&vectors[0][0], &codes, numIds); +}*/ + +#include "faiss/impl/ScalarQuantizer.h" +#include +#include +#include +#include +#include +TEST(FaissSQComputeDistanceTest, BasicAssertions) { + faiss::idx_t numIds = 20; + std::vector> vectors; + int dim = 8; + std::ifstream MyReadFile("/home/ec2-user/k-NN/test_vectors.txt"); + + while(!MyReadFile.eof()){ + string text, T; + getline(MyReadFile, text); + stringstream X(text); + std::vector vect; + vect.reserve(dim); + int j = 0; + while (std::getline(X, T, ',')) { + if (j < dim) { + float f = std::stof(T); + vect.push_back(f); + j++; + } + } + vectors.push_back(vect); + } + + std::ifstream MyReadFile1("/home/ec2-user/k-NN/query_vectors.txt"); + std::vector> query_vectors; + while(!MyReadFile1.eof()) { + string text, T; + getline(MyReadFile1, text); + stringstream X(text); + std::vector vect; + vect.reserve(dim); + int j = 0; + while (std::getline(X, T, ',')) { + if (j < dim) { + float f = std::stof(T); + vect.push_back(f); + j++; + } + } + query_vectors.push_back(vect); + } + uint8_t codes; + faiss::ScalarQuantizer * sq = new faiss::ScalarQuantizer(dim, faiss::ScalarQuantizer::QT_fp16); + sq->compute_codes(&vectors[0][0], &codes, numIds); + faiss::ScalarQuantizer::SQDistanceComputer * sqdc = sq -> get_distance_computer(faiss::METRIC_L2); + for(int i=0; i<1; i++) { + sqdc->set_query(&query_vectors[i][0]); + sqdc->query_to_code(&(codes)); + } +} + + TEST(FaissTrainIndexTest, BasicAssertions) { // Define the index configuration int dim = 2; diff --git a/src/main/java/org/opensearch/knn/index/util/Faiss.java b/src/main/java/org/opensearch/knn/index/util/Faiss.java index f0d20d303..a8b46216d 100644 --- a/src/main/java/org/opensearch/knn/index/util/Faiss.java +++ b/src/main/java/org/opensearch/knn/index/util/Faiss.java @@ -30,7 +30,6 @@ import static org.opensearch.knn.common.KNNConstants.FAISS_HNSW_DESCRIPTION; import static org.opensearch.knn.common.KNNConstants.FAISS_IVF_DESCRIPTION; import static org.opensearch.knn.common.KNNConstants.FAISS_PQ_DESCRIPTION; -import static org.opensearch.knn.common.KNNConstants.FAISS_SQFP16_DESCRIPTION; import static org.opensearch.knn.common.KNNConstants.METHOD_ENCODER_PARAMETER; import static org.opensearch.knn.common.KNNConstants.METHOD_HNSW; import static org.opensearch.knn.common.KNNConstants.METHOD_IVF; @@ -78,71 +77,71 @@ class Faiss extends NativeLibrary { ).build()) ) .build(), - KNNConstants.ENCODER_SQFP16, + KNNConstants.ENCODER_SQFP16, MethodComponent.Builder.builder(KNNConstants.ENCODER_SQFP16) - .setRequiresTraining(false) - .setMapGenerator( - ((methodComponent, methodComponentContext) -> MethodAsMapBuilder.builder( - KNNConstants.FAISS_SQFP16_DESCRIPTION, - methodComponent, - methodComponentContext - ).build()) - ) - .build(), -// KNNConstants.ENCODER_SQFP16, -// MethodComponent.Builder.builder(KNNConstants.ENCODER_SQFP16) -// .addParameter( -// ENCODER_PARAMETER_PQ_M, -// new Parameter.IntegerParameter( -// ENCODER_PARAMETER_PQ_M, -// ENCODER_PARAMETER_PQ_CODE_COUNT_DEFAULT, -// v -> v > 0 && v < ENCODER_PARAMETER_PQ_CODE_COUNT_LIMIT -// ) -// ) -// .addParameter( -// ENCODER_PARAMETER_PQ_CODE_SIZE, -// new Parameter.IntegerParameter( -// ENCODER_PARAMETER_PQ_CODE_SIZE, -// ENCODER_PARAMETER_PQ_CODE_SIZE_DEFAULT, -// v -> v > 0 && v < ENCODER_PARAMETER_PQ_CODE_SIZE_LIMIT -// ) -// ) -// .setRequiresTraining(true) -// .setMapGenerator( -// ((methodComponent, methodComponentContext) -> MethodAsMapBuilder.builder( -// FAISS_SQFP16_DESCRIPTION, -// methodComponent, -// methodComponentContext -// ) -// //.addParameter(ENCODER_PARAMETER_PQ_M, "", "").addParameter(ENCODER_PARAMETER_PQ_CODE_SIZE, "", "") -// .build()) -// ) -// .setOverheadInKBEstimator((methodComponent, methodComponentContext, dimension) -> { -// // Size estimate formula: (4 * d * 2^code_size) / 1024 + 1 -// -// // Get value of code size passed in by user -// Object codeSizeObject = methodComponentContext.getParameters().get(ENCODER_PARAMETER_PQ_CODE_SIZE); -// -// // If not specified, get default value of code size -// if (codeSizeObject == null) { -// Parameter codeSizeParameter = methodComponent.getParameters().get(ENCODER_PARAMETER_PQ_CODE_SIZE); -// if (codeSizeParameter == null) { -// throw new IllegalStateException( -// String.format("%s is not a valid parameter. This is a bug.", ENCODER_PARAMETER_PQ_CODE_SIZE) -// ); -// } -// -// codeSizeObject = codeSizeParameter.getDefaultValue(); -// } -// -// if (!(codeSizeObject instanceof Integer)) { -// throw new IllegalStateException(String.format("%s must be an integer.", ENCODER_PARAMETER_PQ_CODE_SIZE)); -// } -// -// int codeSize = (Integer) codeSizeObject; -// return ((4L * (1L << codeSize) * dimension) / BYTES_PER_KILOBYTES) + 1; -// }) -// .build(), + .setRequiresTraining(false) + .setMapGenerator( + ((methodComponent, methodComponentContext) -> MethodAsMapBuilder.builder( + KNNConstants.FAISS_SQFP16_DESCRIPTION, + methodComponent, + methodComponentContext + ).build()) + ) + .build(), + // KNNConstants.ENCODER_SQFP16, + // MethodComponent.Builder.builder(KNNConstants.ENCODER_SQFP16) + // .addParameter( + // ENCODER_PARAMETER_PQ_M, + // new Parameter.IntegerParameter( + // ENCODER_PARAMETER_PQ_M, + // ENCODER_PARAMETER_PQ_CODE_COUNT_DEFAULT, + // v -> v > 0 && v < ENCODER_PARAMETER_PQ_CODE_COUNT_LIMIT + // ) + // ) + // .addParameter( + // ENCODER_PARAMETER_PQ_CODE_SIZE, + // new Parameter.IntegerParameter( + // ENCODER_PARAMETER_PQ_CODE_SIZE, + // ENCODER_PARAMETER_PQ_CODE_SIZE_DEFAULT, + // v -> v > 0 && v < ENCODER_PARAMETER_PQ_CODE_SIZE_LIMIT + // ) + // ) + // .setRequiresTraining(true) + // .setMapGenerator( + // ((methodComponent, methodComponentContext) -> MethodAsMapBuilder.builder( + // FAISS_SQFP16_DESCRIPTION, + // methodComponent, + // methodComponentContext + // ) + // //.addParameter(ENCODER_PARAMETER_PQ_M, "", "").addParameter(ENCODER_PARAMETER_PQ_CODE_SIZE, "", "") + // .build()) + // ) + // .setOverheadInKBEstimator((methodComponent, methodComponentContext, dimension) -> { + // // Size estimate formula: (4 * d * 2^code_size) / 1024 + 1 + // + // // Get value of code size passed in by user + // Object codeSizeObject = methodComponentContext.getParameters().get(ENCODER_PARAMETER_PQ_CODE_SIZE); + // + // // If not specified, get default value of code size + // if (codeSizeObject == null) { + // Parameter codeSizeParameter = methodComponent.getParameters().get(ENCODER_PARAMETER_PQ_CODE_SIZE); + // if (codeSizeParameter == null) { + // throw new IllegalStateException( + // String.format("%s is not a valid parameter. This is a bug.", ENCODER_PARAMETER_PQ_CODE_SIZE) + // ); + // } + // + // codeSizeObject = codeSizeParameter.getDefaultValue(); + // } + // + // if (!(codeSizeObject instanceof Integer)) { + // throw new IllegalStateException(String.format("%s must be an integer.", ENCODER_PARAMETER_PQ_CODE_SIZE)); + // } + // + // int codeSize = (Integer) codeSizeObject; + // return ((4L * (1L << codeSize) * dimension) / BYTES_PER_KILOBYTES) + 1; + // }) + // .build(), KNNConstants.ENCODER_PQ, MethodComponent.Builder.builder(KNNConstants.ENCODER_PQ) .addParameter(