Skip to content

Commit

Permalink
Merge pull request erikbern#5 from DBGroup-SUSTech/jiexu
Browse files Browse the repository at this point in the history
Add nsg
  • Loading branch information
BenjaminXiang authored Jul 9, 2024
2 parents 8fef6cc + c0983ea commit b3b1e3b
Show file tree
Hide file tree
Showing 34 changed files with 3,095 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
dataset: [random-xs-20-angular]
library:
- annoy
- nsg
- balltree
- bruteforce
- ckdtree
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.DS_Store
*.pyc
*.o
*.graph
protocol/c/fr-*

install/*.txt
Expand Down
4 changes: 4 additions & 0 deletions ann_benchmarks/algorithms/nsg/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Ignore everything inside the local build/ directory.
build/*
# Ignore everything inside the .cache/ directory.
.cache/*
# Ignore files ending in .graph and .nsg (presumably generated index files — confirm).
*.graph
*.nsg
26 changes: 26 additions & 0 deletions ann_benchmarks/algorithms/nsg/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Top-level build script for the efanna2e / NSG sources.
#
# Builds the `src` and `tests` subdirectories and a shared library named
# `hybrid_search` from tests/baseline.cpp.
#
# The version range keeps policy behaviour modern; CMake older than 3.12
# ignores the "...max" part. 3.8 is the first release that knows C++17.
cmake_minimum_required(VERSION 3.8...3.29)

project(efanna2e)

# -march=native makes the binaries non-portable across CPUs. It stays ON by
# default (the original behaviour) but can now be disabled with
# -DEFANNA2E_NATIVE_ARCH=OFF.
option(EFANNA2E_NATIVE_ARCH "Tune the build for the host CPU (-march=native)" ON)

# Kept at directory scope because the targets in src/ and tests/ are defined
# in subdirectory lists that rely on inheriting this include path.
include_directories(${PROJECT_SOURCE_DIR}/include)

# OpenMP is mandatory; REQUIRED makes the configure step fail if it is missing.
# Only the flags reported by FindOpenMP are used (no hard-coded -fopenmp), so
# compilers that spell the flag differently keep working.
find_package(OpenMP REQUIRED)
string(APPEND CMAKE_C_FLAGS " ${OpenMP_C_FLAGS}")
string(APPEND CMAKE_CXX_FLAGS " ${OpenMP_CXX_FLAGS}")

# Do not let the compiler treat the allocation functions as builtins
# (presumably so a replacement allocator can be linked in — confirm).
string(APPEND CMAKE_CXX_FLAGS
  " -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free")

# Replaces the hand-written -fPIC: objects must be position independent so
# they can be linked into the shared library below.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Replaces the hand-written -std=c++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Compiler options belong in add_compile_options(); add_definitions() is for
# preprocessor definitions only. The former "-lboost" entry was dropped: it is
# a linker flag, it was passed to the compile step where it has no effect, and
# there is no library called "boost" to link against.
add_compile_options(-O3 -Wall)
if(EFANNA2E_NATIVE_ARCH)
  add_compile_options(-march=native)
endif()
add_definitions(-DINFO)

add_subdirectory(src)
add_subdirectory(tests)

set(LIBRARY_NAME hybrid_search)

# Source files of the shared library.
set(SOURCES tests/baseline.cpp)

# Build the shared library and link it against the target named after the
# project (presumably the efanna2e library defined in src/ — confirm).
# PUBLIC matches the transitive behaviour of the former keyword-less call.
add_library(${LIBRARY_NAME} SHARED ${SOURCES})
target_link_libraries(${LIBRARY_NAME} PUBLIC ${PROJECT_NAME})
1 change: 1 addition & 0 deletions ann_benchmarks/algorithms/nsg/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Uses the `ann-benchmarks` image as the base; this file defines no further build steps.
FROM ann-benchmarks
20 changes: 20 additions & 0 deletions ann_benchmarks/algorithms/nsg/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Benchmark definition for the `nsg` algorithm.
# NOTE(review): nesting restored following the usual ann-benchmarks config.yml
# layout (point type -> metric -> list of algorithm definitions); verify
# against the checked-in file.
float:
  any:
  - base_args: ['@metric']
    constructor: Nsg
    disabled: false
    docker_tag: ann-benchmarks-nsg
    module: ann_benchmarks.algorithms.nsg
    name: nsg
    run_groups:
      nsg:
        # Each key lists the candidate values for one build argument.
        args:
          KNN_K: [200]
          KNN_L: [200]
          KNN_iter: [10]
          KNN_S: [10]
          KNN_R: [100]
          NSG_L: [40]
          NSG_R: [50]
          NSG_C: [500]
        # Candidate values for the single query-time argument.
        query_args: [[10, 20, 30, 40, 60, 80, 120, 200, 400, 600, 800]]
328 changes: 328 additions & 0 deletions ann_benchmarks/algorithms/nsg/include/efanna2e/distance.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
//
// Created by 付聪 on 2017/6/21.
//

#ifndef EFANNA2E_DISTANCE_H
#define EFANNA2E_DISTANCE_H

#include <x86intrin.h>
#include <iostream>
namespace efanna2e {

/// Identifiers for the distance metrics. The numeric values are fixed
/// explicitly and must not change.
enum Metric {
  L2 = 0,
  INNER_PRODUCT = 1,
  FAST_L2 = 2,
  PQ = 3
};

/// Abstract interface for comparing two float vectors.
class Distance {
 public:
  /// Compares the first `length` elements of `a` and `b` and returns a score
  /// whose meaning is defined by the concrete subclass.
  virtual float compare(const float* a, const float* b, unsigned length) const = 0;
  virtual ~Distance() {}
};

/// Squared Euclidean distance: sum over i of (a[i] - b[i])^2.
///
/// One of three code paths is selected at compile time: AVX, SSE2 or plain
/// scalar code. All paths use unaligned loads, read exactly `size` elements
/// from each input and place no alignment requirement on `a` or `b`.
class DistanceL2 : public Distance {
 public:
  /// @param a     first vector, at least `size` floats
  /// @param b     second vector, at least `size` floats
  /// @param size  number of elements to compare (0 yields 0)
  /// @return the squared L2 distance (no square root is taken)
  float compare(const float* a, const float* b, unsigned size) const override {
    float result = 0;

#if defined(__GNUC__) && defined(__AVX__)

#define AVX_L2SQR(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm256_loadu_ps(addr1);                  \
  tmp2 = _mm256_loadu_ps(addr2);                  \
  tmp1 = _mm256_sub_ps(tmp1, tmp2);               \
  tmp1 = _mm256_mul_ps(tmp1, tmp1);               \
  dest = _mm256_add_ps(dest, tmp1);

    __m256 sum = _mm256_setzero_ps();
    __m256 l0, l1;
    __m256 r0, r1;
    // Round DOWN to the vector width so no load crosses the end of the input;
    // the remaining (size % 8) elements are handled by the scalar loop below.
    const unsigned D = size & ~7U;
    const unsigned DR = D % 16;  // 0 or 8: one odd 8-wide block
    const unsigned DD = D - DR;  // part covered by the 16-wide loop
    const float* l = a;
    const float* r = b;
    const float* e_l = l + DD;
    const float* e_r = r + DD;
    float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};

    // The odd block is accumulated first, as before, so results for sizes
    // that are a multiple of 8 stay bit-identical.
    if (DR) {
      AVX_L2SQR(e_l, e_r, sum, l0, r0);
    }
    for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
      AVX_L2SQR(l, r, sum, l0, r0);
      AVX_L2SQR(l + 8, r + 8, sum, l1, r1);
    }
    _mm256_storeu_ps(unpack, sum);
    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] +
             unpack[4] + unpack[5] + unpack[6] + unpack[7];

    for (unsigned i = D; i < size; ++i) {
      const float diff = a[i] - b[i];
      result += diff * diff;
    }

#elif defined(__GNUC__) && defined(__SSE2__)

// Unaligned loads: the caller's pointers carry no alignment guarantee.
#define SSE_L2SQR(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm_loadu_ps(addr1);                     \
  tmp2 = _mm_loadu_ps(addr2);                     \
  tmp1 = _mm_sub_ps(tmp1, tmp2);                  \
  tmp1 = _mm_mul_ps(tmp1, tmp1);                  \
  dest = _mm_add_ps(dest, tmp1);

    __m128 sum = _mm_setzero_ps();
    __m128 l0, l1, l2, l3;
    __m128 r0, r1, r2, r3;
    // Round DOWN to the vector width; the tail is handled in scalar code.
    const unsigned D = size & ~3U;
    const unsigned DR = D % 16;  // 0, 4, 8 or 12 leftover elements
    const unsigned DD = D - DR;  // part covered by the 16-wide loop
    const float* l = a;
    const float* r = b;
    const float* e_l = l + DD;
    const float* e_r = r + DD;
    float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};

    // Leftover 4-wide blocks, highest offset first (intentional fall-through).
    switch (DR) {
      case 12:
        SSE_L2SQR(e_l + 8, e_r + 8, sum, l2, r2);
        // fall through
      case 8:
        SSE_L2SQR(e_l + 4, e_r + 4, sum, l1, r1);
        // fall through
      case 4:
        SSE_L2SQR(e_l, e_r, sum, l0, r0);
        // fall through
      default:
        break;
    }
    for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
      SSE_L2SQR(l, r, sum, l0, r0);
      SSE_L2SQR(l + 4, r + 4, sum, l1, r1);
      SSE_L2SQR(l + 8, r + 8, sum, l2, r2);
      SSE_L2SQR(l + 12, r + 12, sum, l3, r3);
    }
    _mm_storeu_ps(unpack, sum);
    result += unpack[0] + unpack[1] + unpack[2] + unpack[3];

    for (unsigned i = D; i < size; ++i) {
      const float diff = a[i] - b[i];
      result += diff * diff;
    }

#else
    // Normal (scalar) distance. This branch is also used by compilers that do
    // not define __GNUC__, which previously got a function that returned 0.
    const unsigned unrolled = size & ~3U;
    unsigned i = 0;

    /* Process 4 items with each loop for efficiency. */
    for (; i < unrolled; i += 4) {
      const float diff0 = a[i] - b[i];
      const float diff1 = a[i + 1] - b[i + 1];
      const float diff2 = a[i + 2] - b[i + 2];
      const float diff3 = a[i + 3] - b[i + 3];
      result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
    }
    /* Process the last 0-3 items. */
    for (; i < size; ++i) {
      const float diff0 = a[i] - b[i];
      result += diff0 * diff0;
    }
#endif

    return result;
  }
};

/// Inner (dot) product: sum over i of a[i] * b[i].
///
/// The raw dot product is returned; it is not negated or normalised. The
/// same three compile-time code paths as in DistanceL2 are used.
class DistanceInnerProduct : public Distance {
 public:
  /// @param a     first vector, at least `size` floats
  /// @param b     second vector, at least `size` floats
  /// @param size  number of elements to multiply (0 yields 0)
  /// @return the dot product of the two vectors
  float compare(const float* a, const float* b, unsigned size) const override {
    float result = 0;

#if defined(__GNUC__) && defined(__AVX__)

#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm256_loadu_ps(addr1);                \
  tmp2 = _mm256_loadu_ps(addr2);                \
  tmp1 = _mm256_mul_ps(tmp1, tmp2);             \
  dest = _mm256_add_ps(dest, tmp1);

    __m256 sum = _mm256_setzero_ps();
    __m256 l0, l1;
    __m256 r0, r1;
    // Round DOWN to the vector width; the tail is handled in scalar code.
    const unsigned D = size & ~7U;
    const unsigned DR = D % 16;
    const unsigned DD = D - DR;
    const float* l = a;
    const float* r = b;
    const float* e_l = l + DD;
    const float* e_r = r + DD;
    float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};

    if (DR) {
      AVX_DOT(e_l, e_r, sum, l0, r0);
    }
    for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
      AVX_DOT(l, r, sum, l0, r0);
      AVX_DOT(l + 8, r + 8, sum, l1, r1);
    }
    _mm256_storeu_ps(unpack, sum);
    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] +
             unpack[4] + unpack[5] + unpack[6] + unpack[7];

    for (unsigned i = D; i < size; ++i) {
      result += a[i] * b[i];
    }

#elif defined(__GNUC__) && defined(__SSE2__)

// The 128-bit intrinsics carry the plain `_mm_` prefix; the `_mm128_` names
// used previously do not exist and broke every SSE2-only build.
#define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm_loadu_ps(addr1);                   \
  tmp2 = _mm_loadu_ps(addr2);                   \
  tmp1 = _mm_mul_ps(tmp1, tmp2);                \
  dest = _mm_add_ps(dest, tmp1);

    __m128 sum = _mm_setzero_ps();
    __m128 l0, l1, l2, l3;
    __m128 r0, r1, r2, r3;
    // Round DOWN to the vector width; the tail is handled in scalar code.
    const unsigned D = size & ~3U;
    const unsigned DR = D % 16;
    const unsigned DD = D - DR;
    const float* l = a;
    const float* r = b;
    const float* e_l = l + DD;
    const float* e_r = r + DD;
    float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};

    // Leftover 4-wide blocks, highest offset first (intentional fall-through).
    switch (DR) {
      case 12:
        SSE_DOT(e_l + 8, e_r + 8, sum, l2, r2);
        // fall through
      case 8:
        SSE_DOT(e_l + 4, e_r + 4, sum, l1, r1);
        // fall through
      case 4:
        SSE_DOT(e_l, e_r, sum, l0, r0);
        // fall through
      default:
        break;
    }
    for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
      SSE_DOT(l, r, sum, l0, r0);
      SSE_DOT(l + 4, r + 4, sum, l1, r1);
      SSE_DOT(l + 8, r + 8, sum, l2, r2);
      SSE_DOT(l + 12, r + 12, sum, l3, r3);
    }
    _mm_storeu_ps(unpack, sum);
    result += unpack[0] + unpack[1] + unpack[2] + unpack[3];

    for (unsigned i = D; i < size; ++i) {
      result += a[i] * b[i];
    }

#else
    // Scalar fallback, also used by compilers that do not define __GNUC__.
    const unsigned unrolled = size & ~3U;
    unsigned i = 0;

    /* Process 4 items with each loop for efficiency. */
    for (; i < unrolled; i += 4) {
      const float dot0 = a[i] * b[i];
      const float dot1 = a[i + 1] * b[i + 1];
      const float dot2 = a[i + 2] * b[i + 2];
      const float dot3 = a[i + 3] * b[i + 3];
      result += dot0 + dot1 + dot2 + dot3;
    }
    /* Process the last 0-3 items. */
    for (; i < size; ++i) {
      result += a[i] * b[i];
    }
#endif

    return result;
  }
};

/// Inner-product based variant that adds a caller-supplied norm term.
///
/// `norm()` computes the sum of squares of one vector; the four-argument
/// `compare()` returns `norm - 2 * dot(a, b)`. The three-argument `compare()`
/// inherited from DistanceInnerProduct remains available.
class DistanceFastL2 : public DistanceInnerProduct {
 public:
  /// @param a     vector, at least `size` floats
  /// @param size  number of elements (0 yields 0)
  /// @return the sum of a[i]^2 (squared Euclidean norm)
  float norm(const float* a, unsigned size) const {
    float result = 0;

#if defined(__GNUC__) && defined(__AVX__)

#define AVX_L2NORM(addr, dest, tmp) \
  tmp = _mm256_loadu_ps(addr);      \
  tmp = _mm256_mul_ps(tmp, tmp);    \
  dest = _mm256_add_ps(dest, tmp);

    __m256 sum = _mm256_setzero_ps();
    __m256 l0, l1;
    // Round DOWN to the vector width; the tail is handled in scalar code.
    const unsigned D = size & ~7U;
    const unsigned DR = D % 16;
    const unsigned DD = D - DR;
    const float* l = a;
    const float* e_l = l + DD;
    float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};

    if (DR) {
      AVX_L2NORM(e_l, sum, l0);
    }
    for (unsigned i = 0; i < DD; i += 16, l += 16) {
      AVX_L2NORM(l, sum, l0);
      AVX_L2NORM(l + 8, sum, l1);
    }
    _mm256_storeu_ps(unpack, sum);
    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] +
             unpack[4] + unpack[5] + unpack[6] + unpack[7];

    for (unsigned i = D; i < size; ++i) {
      result += a[i] * a[i];
    }

#elif defined(__GNUC__) && defined(__SSE2__)

// `_mm_` is the correct prefix for 128-bit intrinsics (`_mm128_` does not exist).
#define SSE_L2NORM(addr, dest, tmp) \
  tmp = _mm_loadu_ps(addr);         \
  tmp = _mm_mul_ps(tmp, tmp);       \
  dest = _mm_add_ps(dest, tmp);

    __m128 sum = _mm_setzero_ps();
    __m128 l0, l1, l2, l3;
    // Round DOWN to the vector width; the tail is handled in scalar code.
    const unsigned D = size & ~3U;
    const unsigned DR = D % 16;
    const unsigned DD = D - DR;
    const float* l = a;
    const float* e_l = l + DD;
    float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};

    // Leftover 4-wide blocks, highest offset first (intentional fall-through).
    switch (DR) {
      case 12:
        SSE_L2NORM(e_l + 8, sum, l2);
        // fall through
      case 8:
        SSE_L2NORM(e_l + 4, sum, l1);
        // fall through
      case 4:
        SSE_L2NORM(e_l, sum, l0);
        // fall through
      default:
        break;
    }
    for (unsigned i = 0; i < DD; i += 16, l += 16) {
      SSE_L2NORM(l, sum, l0);
      SSE_L2NORM(l + 4, sum, l1);
      SSE_L2NORM(l + 8, sum, l2);
      SSE_L2NORM(l + 12, sum, l3);
    }
    _mm_storeu_ps(unpack, sum);
    result += unpack[0] + unpack[1] + unpack[2] + unpack[3];

    for (unsigned i = D; i < size; ++i) {
      result += a[i] * a[i];
    }

#else
    // Scalar fallback, also used by compilers that do not define __GNUC__.
    const unsigned unrolled = size & ~3U;
    unsigned i = 0;

    /* Process 4 items with each loop for efficiency. */
    for (; i < unrolled; i += 4) {
      const float dot0 = a[i] * a[i];
      const float dot1 = a[i + 1] * a[i + 1];
      const float dot2 = a[i + 2] * a[i + 2];
      const float dot3 = a[i + 3] * a[i + 3];
      result += dot0 + dot1 + dot2 + dot3;
    }
    /* Process the last 0-3 items. */
    for (; i < size; ++i) {
      result += a[i] * a[i];
    }
#endif

    return result;
  }

  // Keep the inherited three-argument overload visible next to the
  // four-argument one declared below.
  using DistanceInnerProduct::compare;

  /// Returns `norm - 2 * dot(a, b)` over the first `size` elements.
  ///
  /// NOTE(review): the original author marked this overload "not implement";
  /// `norm` is presumably the precomputed squared norm of one of the vectors
  /// (see norm()) — confirm with the callers.
  float compare(const float* a, const float* b, float norm, unsigned size) const {
    float result = -2 * DistanceInnerProduct::compare(a, b, size);
    result += norm;
    return result;
  }
};

}  // namespace efanna2e



#endif //EFANNA2E_DISTANCE_H
Loading

0 comments on commit b3b1e3b

Please sign in to comment.