Merge pull request erikbern#5 from DBGroup-SUSTech/jiexu

Add nsg
BenjaminXiang · Jul 9, 2024 · b3b1e3b · b3b1e3b
2 parents 8fef6cc + c0983ea
commit b3b1e3b
Show file tree

Hide file tree

Showing 34 changed files with 3,095 additions and 0 deletions.
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -30,6 +30,7 @@ jobs:
         dataset: [random-xs-20-angular]
         library:
           - annoy
+          - nsg
           - balltree
           - bruteforce
           - ckdtree

diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 .DS_Store
 *.pyc
 *.o
+*.graph
 protocol/c/fr-*
 
 install/*.txt

diff --git a/ann_benchmarks/algorithms/nsg/.gitignore b/ann_benchmarks/algorithms/nsg/.gitignore
@@ -0,0 +1,4 @@
+build/*
+.cache/*
+*.graph
+*.nsg
diff --git a/ann_benchmarks/algorithms/nsg/CMakeLists.txt b/ann_benchmarks/algorithms/nsg/CMakeLists.txt
@@ -0,0 +1,26 @@
+cmake_minimum_required(VERSION 2.8)
+
+project(efanna2e)
+include_directories(${PROJECT_SOURCE_DIR}/include)
+#OpenMP
+find_package(OpenMP)
+if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+else()
+    message(FATAL_ERROR "no OpenMP supprot")
+endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fPIC -fopenmp")
+add_definitions (-std=c++17 -O3 -lboost -march=native -Wall -DINFO)
+
+add_subdirectory(src)
+add_subdirectory(tests)
+
+set(LIBRARY_NAME hybrid_search)
+
+# 添加源文件
+set(SOURCES tests/baseline.cpp)
+
+# 编译生成共享库
+add_library(${LIBRARY_NAME} SHARED ${SOURCES})
+target_link_libraries(hybrid_search ${PROJECT_NAME})
diff --git a/ann_benchmarks/algorithms/nsg/Dockerfile b/ann_benchmarks/algorithms/nsg/Dockerfile
@@ -0,0 +1 @@
+FROM ann-benchmarks
diff --git a/ann_benchmarks/algorithms/nsg/config.yml b/ann_benchmarks/algorithms/nsg/config.yml
@@ -0,0 +1,20 @@
+float:
+  any:
+  - base_args: ['@metric']
+    constructor: Nsg
+    disabled: false
+    docker_tag: ann-benchmarks-nsg
+    module: ann_benchmarks.algorithms.nsg
+    name: nsg
+    run_groups:
+      nsg:
+        args: 
+          KNN_K: [200]
+          KNN_L: [200]
+          KNN_iter: [10]
+          KNN_S: [10]
+          KNN_R: [100]
+          NSG_L: [40]
+          NSG_R: [50]
+          NSG_C: [500]
+        query_args: [[10, 20, 30, 40, 60, 80, 120, 200, 400, 600, 800]]
diff --git a/ann_benchmarks/algorithms/nsg/include/efanna2e/distance.h b/ann_benchmarks/algorithms/nsg/include/efanna2e/distance.h
@@ -0,0 +1,328 @@
+//
+// Created by 付聪 on 2017/6/21.
+//
+
+#ifndef EFANNA2E_DISTANCE_H
+#define EFANNA2E_DISTANCE_H
+
+#include <x86intrin.h>
+#include <iostream>
+namespace efanna2e{
+  enum Metric{
+    L2 = 0,
+    INNER_PRODUCT = 1,
+    FAST_L2 = 2,
+    PQ = 3
+  };
+    class Distance {
+    public:
+        virtual float compare(const float* a, const float* b, unsigned length) const = 0;
+        virtual ~Distance() {}
+    };
+
+    class DistanceL2 : public Distance{
+    public:
+        float compare(const float* a, const float* b, unsigned size) const {
+            float result = 0;
+
+#ifdef __GNUC__
+#ifdef __AVX__
+
+  #define AVX_L2SQR(addr1, addr2, dest, tmp1, tmp2) \
+      tmp1 = _mm256_loadu_ps(addr1);\
+      tmp2 = _mm256_loadu_ps(addr2);\
+      tmp1 = _mm256_sub_ps(tmp1, tmp2); \
+      tmp1 = _mm256_mul_ps(tmp1, tmp1); \
+      dest = _mm256_add_ps(dest, tmp1);
+
+      __m256 sum;
+      __m256 l0, l1;
+      __m256 r0, r1;
+      unsigned D = (size + 7) & ~7U;
+      unsigned DR = D % 16;
+      unsigned DD = D - DR;
+      const float *l = a;
+      const float *r = b;
+      const float *e_l = l + DD;
+      const float *e_r = r + DD;
+      float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0};
+
+      sum = _mm256_loadu_ps(unpack);
+      if(DR){AVX_L2SQR(e_l, e_r, sum, l0, r0);}
+
+      for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
+      	AVX_L2SQR(l, r, sum, l0, r0);
+      	AVX_L2SQR(l + 8, r + 8, sum, l1, r1);
+      }
+      _mm256_storeu_ps(unpack, sum);
+      result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7];
+
+#else
+#ifdef __SSE2__
+  #define SSE_L2SQR(addr1, addr2, dest, tmp1, tmp2) \
+          tmp1 = _mm_load_ps(addr1);\
+          tmp2 = _mm_load_ps(addr2);\
+          tmp1 = _mm_sub_ps(tmp1, tmp2); \
+          tmp1 = _mm_mul_ps(tmp1, tmp1); \
+          dest = _mm_add_ps(dest, tmp1);
+
+  __m128 sum;
+  __m128 l0, l1, l2, l3;
+  __m128 r0, r1, r2, r3;
+  unsigned D = (size + 3) & ~3U;
+  unsigned DR = D % 16;
+  unsigned DD = D - DR;
+  const float *l = a;
+  const float *r = b;
+  const float *e_l = l + DD;
+  const float *e_r = r + DD;
+  float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0};
+
+  sum = _mm_load_ps(unpack);
+  switch (DR) {
+      case 12:
+      SSE_L2SQR(e_l+8, e_r+8, sum, l2, r2);
+      case 8:
+      SSE_L2SQR(e_l+4, e_r+4, sum, l1, r1);
+      case 4:
+      SSE_L2SQR(e_l, e_r, sum, l0, r0);
+    default:
+      break;
+  }
+  for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
+      SSE_L2SQR(l, r, sum, l0, r0);
+      SSE_L2SQR(l + 4, r + 4, sum, l1, r1);
+      SSE_L2SQR(l + 8, r + 8, sum, l2, r2);
+      SSE_L2SQR(l + 12, r + 12, sum, l3, r3);
+  }
+  _mm_storeu_ps(unpack, sum);
+  result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
+
+//nomal distance
+#else
+
+      float diff0, diff1, diff2, diff3;
+      const float* last = a + size;
+      const float* unroll_group = last - 3;
+
+      /* Process 4 items with each loop for efficiency. */
+      while (a < unroll_group) {
+          diff0 = a[0] - b[0];
+          diff1 = a[1] - b[1];
+          diff2 = a[2] - b[2];
+          diff3 = a[3] - b[3];
+          result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
+          a += 4;
+          b += 4;
+      }
+      /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
+      while (a < last) {
+          diff0 = *a++ - *b++;
+          result += diff0 * diff0;
+      }
+#endif
+#endif
+#endif
+
+            return result;
+        }
+    };
+
+  class DistanceInnerProduct : public Distance{
+  public:
+    float compare(const float* a, const float* b, unsigned size) const {
+      float result = 0;
+#ifdef __GNUC__
+#ifdef __AVX__
+      #define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
+          tmp1 = _mm256_loadu_ps(addr1);\
+          tmp2 = _mm256_loadu_ps(addr2);\
+          tmp1 = _mm256_mul_ps(tmp1, tmp2); \
+          dest = _mm256_add_ps(dest, tmp1);
+
+	  __m256 sum;
+   	  __m256 l0, l1;
+   	  __m256 r0, r1;
+      unsigned D = (size + 7) & ~7U;
+      unsigned DR = D % 16;
+      unsigned DD = D - DR;
+   	  const float *l = a;
+   	  const float *r = b;
+      const float *e_l = l + DD;
+   	  const float *e_r = r + DD;
+      float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0};
+
+      sum = _mm256_loadu_ps(unpack);
+      if(DR){AVX_DOT(e_l, e_r, sum, l0, r0);}
+
+      for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
+	    AVX_DOT(l, r, sum, l0, r0);
+	    AVX_DOT(l + 8, r + 8, sum, l1, r1);
+      }
+      _mm256_storeu_ps(unpack, sum);
+      result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7];
+
+#else
+#ifdef __SSE2__
+      #define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \
+          tmp1 = _mm128_loadu_ps(addr1);\
+          tmp2 = _mm128_loadu_ps(addr2);\
+          tmp1 = _mm128_mul_ps(tmp1, tmp2); \
+          dest = _mm128_add_ps(dest, tmp1);
+      __m128 sum;
+      __m128 l0, l1, l2, l3;
+      __m128 r0, r1, r2, r3;
+      unsigned D = (size + 3) & ~3U;
+      unsigned DR = D % 16;
+      unsigned DD = D - DR;
+      const float *l = a;
+      const float *r = b;
+      const float *e_l = l + DD;
+      const float *e_r = r + DD;
+      float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0};
+
+      sum = _mm_load_ps(unpack);
+      switch (DR) {
+          case 12:
+          SSE_DOT(e_l+8, e_r+8, sum, l2, r2);
+          case 8:
+          SSE_DOT(e_l+4, e_r+4, sum, l1, r1);
+          case 4:
+          SSE_DOT(e_l, e_r, sum, l0, r0);
+        default:
+          break;
+      }
+      for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
+          SSE_DOT(l, r, sum, l0, r0);
+          SSE_DOT(l + 4, r + 4, sum, l1, r1);
+          SSE_DOT(l + 8, r + 8, sum, l2, r2);
+          SSE_DOT(l + 12, r + 12, sum, l3, r3);
+      }
+      _mm_storeu_ps(unpack, sum);
+      result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
+#else
+
+      float dot0, dot1, dot2, dot3;
+      const float* last = a + size;
+      const float* unroll_group = last - 3;
+
+      /* Process 4 items with each loop for efficiency. */
+      while (a < unroll_group) {
+          dot0 = a[0] * b[0];
+          dot1 = a[1] * b[1];
+          dot2 = a[2] * b[2];
+          dot3 = a[3] * b[3];
+          result += dot0 + dot1 + dot2 + dot3;
+          a += 4;
+          b += 4;
+      }
+      /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
+      while (a < last) {
+          result += *a++ * *b++;
+      }
+#endif
+#endif
+#endif
+      return result;
+    }
+
+  };
+  class DistanceFastL2 : public DistanceInnerProduct{
+   public:
+    float norm(const float* a, unsigned size) const{
+      float result = 0;
+#ifdef __GNUC__
+#ifdef __AVX__
+#define AVX_L2NORM(addr, dest, tmp) \
+    tmp = _mm256_loadu_ps(addr); \
+    tmp = _mm256_mul_ps(tmp, tmp); \
+    dest = _mm256_add_ps(dest, tmp);
+
+    __m256 sum;
+   	__m256 l0, l1;
+    unsigned D = (size + 7) & ~7U;
+    unsigned DR = D % 16;
+    unsigned DD = D - DR;
+    const float *l = a;
+    const float *e_l = l + DD;
+    float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0};
+
+    sum = _mm256_loadu_ps(unpack);
+    if(DR){AVX_L2NORM(e_l, sum, l0);}
+	for (unsigned i = 0; i < DD; i += 16, l += 16) {
+      AVX_L2NORM(l, sum, l0);
+      AVX_L2NORM(l + 8, sum, l1);
+    }
+    _mm256_storeu_ps(unpack, sum);
+    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7];
+#else
+#ifdef __SSE2__
+#define SSE_L2NORM(addr, dest, tmp) \
+    tmp = _mm128_loadu_ps(addr); \
+    tmp = _mm128_mul_ps(tmp, tmp); \
+    dest = _mm128_add_ps(dest, tmp);
+
+    __m128 sum;
+    __m128 l0, l1, l2, l3;
+    unsigned D = (size + 3) & ~3U;
+    unsigned DR = D % 16;
+    unsigned DD = D - DR;
+    const float *l = a;
+    const float *e_l = l + DD;
+    float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0};
+
+    sum = _mm_load_ps(unpack);
+    switch (DR) {
+        case 12:
+        SSE_L2NORM(e_l+8, sum, l2);
+        case 8:
+        SSE_L2NORM(e_l+4, sum, l1);
+        case 4:
+        SSE_L2NORM(e_l, sum, l0);
+      default:
+        break;
+    }
+    for (unsigned i = 0; i < DD; i += 16, l += 16) {
+        SSE_L2NORM(l, sum, l0);
+        SSE_L2NORM(l + 4, sum, l1);
+        SSE_L2NORM(l + 8, sum, l2);
+        SSE_L2NORM(l + 12, sum, l3);
+    }
+    _mm_storeu_ps(unpack, sum);
+    result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
+#else
+    float dot0, dot1, dot2, dot3;
+    const float* last = a + size;
+    const float* unroll_group = last - 3;
+
+    /* Process 4 items with each loop for efficiency. */
+    while (a < unroll_group) {
+        dot0 = a[0] * a[0];
+        dot1 = a[1] * a[1];
+        dot2 = a[2] * a[2];
+        dot3 = a[3] * a[3];
+        result += dot0 + dot1 + dot2 + dot3;
+        a += 4;
+    }
+    /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
+    while (a < last) {
+        result += (*a) * (*a);
+        a++;
+    }
+#endif
+#endif
+#endif
+      return result;
+    }
+    using DistanceInnerProduct::compare;
+    float compare(const float* a, const float* b, float norm, unsigned size) const {//not implement
+      float result = -2 * DistanceInnerProduct::compare(a, b, size);
+      result += norm;
+      return result;
+    }
+  };
+}
+
+
+
+#endif //EFANNA2E_DISTANCE_H