Skip to content

Commit

Permalink
Merge pull request facebookresearch#3 from iotamudelta/wf32
Browse files Browse the repository at this point in the history
Add FAISS_HIP_WF32 configure option.
  • Loading branch information
ItsPitt authored Nov 9, 2023
2 parents 8ba1772 + 3a64602 commit 3d42863
Show file tree
Hide file tree
Showing 8 changed files with 116 additions and 13 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ list(APPEND CMAKE_MODULE_PATH "/opt/rocm/hsa/lib")
option(FAISS_OPT_LEVEL "" "generic")
option(FAISS_ENABLE_GPU "Enable support for GPU indexes." OFF)
option(FAISS_ENABLE_HIP "Enable support for HIP-based GPU indexes." ON)
option(FAISS_HIP_WF32 "Enable wf32 for HIP (default wf64)." OFF)
option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF)
option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
option(FAISS_ENABLE_C_API "Build C API." OFF)
Expand All @@ -81,6 +82,9 @@ if(FAISS_ENABLE_HIP)
find_package(hip REQUIRED)
find_package(hsa-runtime64 REQUIRED)
find_package(hipblas REQUIRED)
if (FAISS_HIP_WF32)
add_compile_definitions("HIP_WF32")
endif()
endif()

if(FAISS_ENABLE_RAFT)
Expand Down
15 changes: 14 additions & 1 deletion faiss/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -186,16 +186,29 @@ function(generate_ivf_interleaved_code)
)

# Used for SUB_THREADS, SUB_NUM_WARP_Q, SUB_NUM_THREAD_Q
if(FAISS_HIP_WF32)
set(THREADS_AND_WARPS
"128|1024|8"
"128|1|1"
"128|128|3"
"128|256|4"
"128|32|2" #"128|32|2" TODO won't compile with a warpsize of 64. Changed to 64
"128|32|2"
"128|512|8"
"128|64|3"
"64|2048|8"
)
else()
set(THREADS_AND_WARPS
"128|1024|8"
"128|1|1"
"128|128|3"
"128|256|4"
"128|64|2" #"128|32|2" TODO won't compile with a warpsize of 64. Changed to 64
"128|512|8"
"128|64|3"
"64|2048|8"
)
endif()

# Traverse through the Cartesian product of X and Y
foreach(sub_codec ${SUB_CODEC_TYPE})
Expand Down
43 changes: 41 additions & 2 deletions faiss/hip/impl/IVFUtilsSelect1.hip
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ void runPass1SelectLists(
#if GPU_MAX_SELECTION_K >= 2048

// block size 128 for k <= 1024, 64 for k = 2048
#define RUN_PASS_DIR(INDEX_T, DIR) \
#ifdef HIP_WF32
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
Expand All @@ -154,8 +154,29 @@ void runPass1SelectLists(
} while (0)

#else

#define RUN_PASS_DIR(INDEX_T, DIR) \
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
} else if (k <= 64) { \
RUN_PASS(INDEX_T, 128, 64, 3, DIR); \
} else if (k <= 128) { \
RUN_PASS(INDEX_T, 128, 128, 3, DIR); \
} else if (k <= 256) { \
RUN_PASS(INDEX_T, 128, 256, 4, DIR); \
} else if (k <= 512) { \
RUN_PASS(INDEX_T, 128, 512, 8, DIR); \
} else if (k <= 1024) { \
RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \
} else if (k <= 2048) { \
RUN_PASS(INDEX_T, 64, 2048, 8, DIR); \
} \
} while (0)
#endif

#else

#ifdef HIP_WF32
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
Expand All @@ -173,6 +194,24 @@ void runPass1SelectLists(
RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \
} \
} while (0)
#else
#define RUN_PASS_DIR(INDEX_T, DIR) \
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
} else if (k <= 64) { \
RUN_PASS(INDEX_T, 128, 64, 3, DIR); \
} else if (k <= 128) { \
RUN_PASS(INDEX_T, 128, 128, 3, DIR); \
} else if (k <= 256) { \
RUN_PASS(INDEX_T, 128, 256, 4, DIR); \
} else if (k <= 512) { \
RUN_PASS(INDEX_T, 128, 512, 8, DIR); \
} else if (k <= 1024) { \
RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \
} \
} while (0)
#endif

#endif // GPU_MAX_SELECTION_K

Expand Down
43 changes: 41 additions & 2 deletions faiss/hip/impl/IVFUtilsSelect2.hip
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,8 @@ void runPass2SelectLists(
#if GPU_MAX_SELECTION_K >= 2048

// block size 128 for k <= 1024, 64 for k = 2048
#define RUN_PASS_DIR(INDEX_T, DIR) \

#ifdef HIP_WF32
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
Expand All @@ -210,10 +211,30 @@ void runPass2SelectLists(
RUN_PASS(INDEX_T, 64, 2048, 8, DIR); \
} \
} while (0)
#else
#define RUN_PASS_DIR(INDEX_T, DIR) \
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
} else if (k <= 64) { \
RUN_PASS(INDEX_T, 128, 64, 3, DIR); \
} else if (k <= 128) { \
RUN_PASS(INDEX_T, 128, 128, 3, DIR); \
} else if (k <= 256) { \
RUN_PASS(INDEX_T, 128, 256, 4, DIR); \
} else if (k <= 512) { \
RUN_PASS(INDEX_T, 128, 512, 8, DIR); \
} else if (k <= 1024) { \
RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \
} else if (k <= 2048) { \
RUN_PASS(INDEX_T, 64, 2048, 8, DIR); \
} \
} while (0)
#endif

#else

#define RUN_PASS_DIR(INDEX_T, DIR) \
#ifdef HIP_WF32
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
Expand All @@ -231,6 +252,24 @@ void runPass2SelectLists(
RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \
} \
} while (0)
#else
#define RUN_PASS_DIR(INDEX_T, DIR) \
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
} else if (k <= 64) { \
RUN_PASS(INDEX_T, 128, 64, 3, DIR); \
} else if (k <= 128) { \
RUN_PASS(INDEX_T, 128, 128, 3, DIR); \
} else if (k <= 256) { \
RUN_PASS(INDEX_T, 128, 256, 4, DIR); \
} else if (k <= 512) { \
RUN_PASS(INDEX_T, 128, 512, 8, DIR); \
} else if (k <= 1024) { \
RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \
} \
} while (0)
#endif

#endif // GPU_MAX_SELECTION_K

Expand Down
8 changes: 7 additions & 1 deletion faiss/hip/utils/DeviceDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ namespace faiss {
namespace hip {

// We validate this against the actual architecture in device initialization
constexpr int kWarpSize = 32;//__AMDGCN_WAVEFRONT_SIZE; // either = 32 or = 64 (Defined in hip_runtime.h)
#ifdef HIP_WF32
constexpr int kWarpSize = 32; // either = 32 or = 64 (Defined in hip_runtime.h)
#else
constexpr int kWarpSize = 64;
#endif

// This is a memory barrier for intra-warp writes to shared memory.
__forceinline__ __device__ void warpFence() {
Expand All @@ -24,11 +28,13 @@ __forceinline__ __device__ void warpFence() {
}

#if CUDA_VERSION > 9000
#warning("CUDA > 9000, somehow")
// Based on the CUDA version (we assume what version of nvcc/ptxas we were
// compiled with), the register allocation algorithm is much better, so only
// enable the 2048 selection code if we are above 9.0 (9.2 seems to be ok)
#define GPU_MAX_SELECTION_K 2048
#else
#warning("CUDA small")
#define GPU_MAX_SELECTION_K 1024
#endif

Expand Down
6 changes: 2 additions & 4 deletions faiss/hip/utils/MergeNetworkWarp.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,10 +535,8 @@ struct BitonicSortStep<K, V, 1, Dir, Comp> {
warpBitonicMergeLE16<K, V, 4, Dir, Comp, false>(k[0], v[0]);
warpBitonicMergeLE16<K, V, 8, Dir, Comp, false>(k[0], v[0]);
warpBitonicMergeLE16<K, V, 16, Dir, Comp, false>(k[0], v[0]);
#if !(__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__)
//TODO needs to be fixed
//#warning(including wider merge)
// warpBitonicMergeLE16<K, V, 32, Dir, Comp, false>(k[0], v[0]);
#ifndef HIP_WF32
warpBitonicMergeLE16<K, V, 32, Dir, Comp, false>(k[0], v[0]);
#endif
}
};
Expand Down
6 changes: 5 additions & 1 deletion faiss/hip/utils/PtxUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ getBitfield(uint64_t val, int pos, int len) {
}

__device__ __forceinline__ int getLaneId() {
return threadIdx.x & (32 - 1); //(__AMDGCN_WAVEFRONT_SIZE - 1);
#ifdef HIP_WF32
return threadIdx.x & (32 - 1);
#else
return threadIdx.x & (64 - 1);
#endif
}

} // namespace hip
Expand Down
4 changes: 2 additions & 2 deletions faiss/hip/utils/WarpSelectFloat.hip
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void runWarpSelect(
if (dir) {
if (k == 1) {
WARP_SELECT_CALL(float, true, 1);
#if (__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__)
#ifdef HIP_WF32
} else if (k <= 32) {
WARP_SELECT_CALL(float, true, 32);
#endif
Expand All @@ -77,7 +77,7 @@ void runWarpSelect(
} else {
if (k == 1) {
WARP_SELECT_CALL(float, false, 1);
#if (__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__)
#ifdef HIP_WF32
} else if (k <= 32) {
WARP_SELECT_CALL(float, false, 32);
#endif
Expand Down

0 comments on commit 3d42863

Please sign in to comment.