Improve efficiency of sparse queries (#94)
* Fix for gcc-13

* Add personalised makefile

* Replace sort with priority queue

* Missing semicolon

* Type fixes for SparseDist

* Add debug flag to makefile

* Debug mode for Makefiles

* Revert "Debug mode for Makefiles"

This reverts commit 266742b.

* Set debug another way

* Allow debug to be turned off

* Put the sort in the right place (after all dists done)

* Replace hash map

* Use C++17

* Correct header name

* Typo in map header name

* Replace class with struct

* Try to fix order in sparse loop

Definitely wrong! Was confused by brace levels

* Change order to match with tests

* Pass C++17 onto nvcc

* Correct c++17 definition for CUDA

* Add license for hashmap

* Rename local build env var
johnlees authored Mar 22, 2024
1 parent d267077 commit adf033f
Showing 20 changed files with 2,279 additions and 2,134 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.16)
project(pp_sketchlib)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17)

if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")
cmake_policy(SET CMP0104 OLD) # Can't get CUDA_ARCHITECTURES to work with NEW
22 changes: 22 additions & 0 deletions LICENSE_unordered_dense
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2022 Martin Leitner-Ankerl

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1 change: 1 addition & 0 deletions README.md
@@ -334,6 +334,7 @@ You can set an environment variable `SKETCHLIB_INSTALL` to affect `python setup.
- Empty: uses cmake
- `conda`: sets library location to the conda environment, and uses `src/Makefile` (used to be used in conda-forge recipe)
- `azure`: Uses `src/Makefile`
- `local`: Uses `src/Makefile_fedora38`

### cmake
Now requires v3.19. If nvcc version is 11.0 or higher, sm8.6 with device link time optimisation will be used.
7 changes: 7 additions & 0 deletions setup.py
@@ -78,6 +78,7 @@ def build_extension(self, ext):
env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
self.distribution.get_version())


if not os.path.exists(self.build_temp):
os.makedirs(self.build_temp)

@@ -88,6 +89,12 @@ def build_extension(self, ext):
elif target == 'azure':
subprocess.check_call(['make', 'python'], cwd=ext.sourcedir + '/src', env=env)
subprocess.check_call(['make', 'install_python', 'PYTHON_LIB_PATH=' + extdir], cwd=ext.sourcedir + '/src', env=env)
elif target == 'local':
debug = "DEBUG="
if cfg == 'Debug':
debug = "DEBUG=1"
subprocess.check_call(['make', '-f', 'Makefile_fedora38', 'python', debug], cwd=ext.sourcedir + '/src', env=env)
subprocess.check_call(['make', '-f', 'Makefile_fedora38', 'install_python', 'PYTHON_LIB_PATH=' + extdir, debug], cwd=ext.sourcedir + '/src', env=env)
else:
subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
5 changes: 3 additions & 2 deletions src/Makefile
@@ -1,4 +1,4 @@
CXXFLAGS+=-Wall -Wextra -std=c++14 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
CXXFLAGS+=-Wall -Wextra -std=c++17 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
ifdef DEBUG
CXXFLAGS+= -O0 -g
CUDAFLAGS = -g -G
@@ -7,6 +7,7 @@ else ifdef PROFILE
CUDAFLAGS = -O2 -pg -lineinfo
else
CXXFLAGS+= -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS = -O3
endif

UNAME_S := $(shell uname -s)
@@ -29,7 +30,7 @@ LDFLAGS+= -L$(LIBLOC)/lib
CUDA_LDLIBS=-lcudadevrt -lcudart_static $(LDLIBS)

CUDA_LDFLAGS =-L$(LIBLOC)/lib -L${CUDA_HOME}/targets/x86_64-linux/lib/stubs -L${CUDA_HOME}/targets/x86_64-linux/lib
CUDAFLAGS +=-Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
CUDAFLAGS +=-std=c++17 -Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
ifdef GPU
CXXFLAGS += -DGPU_AVAILABLE
CUDAFLAGS += -gencode arch=compute_86,code=sm_86
142 changes: 142 additions & 0 deletions src/Makefile_fedora38
@@ -0,0 +1,142 @@
CXX=gcc-11
CC=gcc-11
CFLAGS+=-Wall -Wextra -fPIC
CXXFLAGS+=-Wall -Wextra -std=c++17 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
ifdef DEBUG
CXXFLAGS+= -O0 -g
CUDAFLAGS = -g -G
else ifdef PROFILE
CXXFLAGS+= -O2 -g -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS = -O2 -pg -lineinfo
else
CXXFLAGS+= -march=native -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CFLAGS+= -march=native -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS+= -O3
endif

UNAME_S := $(shell uname -s)
LIBLOC = ${CONDA_PREFIX}
LDLIBS = -lz -lhdf5_cpp -lhdf5 -lopenblas -lgomp
ifeq ($(UNAME_S),Linux)
CXXFLAGS+= -m64
ifdef PROFILE
CXXFLAGS+= -Wl,--compress-debug-sections=none
endif
LDLIBS+= -lpthread -lgfortran -lm -ldl -lrt
LDFLAGS=-Wl,-as-needed
endif
ifeq ($(UNAME_S),Darwin)
LDLIBS+= -pthread
endif

CPPFLAGS+=-I"/home/linuxbrew/.linuxbrew/include" -I"." -I"../vendor/highfive/include" -I$(LIBLOC)/include -I$(LIBLOC)/include/eigen3
LDFLAGS+= -L$(LIBLOC)/lib -L"/home/linuxbrew/.linuxbrew/lib" -L/usr/local/cuda-12.3/lib64
CUDA_LDLIBS=-lcudadevrt -lcudart_static $(LDLIBS)

CUDA_LDFLAGS =-L$(LIBLOC)/lib -L${CUDA_HOME}/targets/x86_64-linux/lib/stubs -L${CUDA_HOME}/targets/x86_64-linux/lib
CUDAFLAGS +=-ccbin /home/linuxbrew/.linuxbrew/bin/g++-11 -std=c++17 -Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
ifdef GPU
CXXFLAGS += -DGPU_AVAILABLE
CUDAFLAGS += -gencode arch=compute_86,code=sm_86
CUDA_LDFLAGS += -L/usr/local/cuda-12.3/lib64
endif

PYTHON_LIB = pp_sketchlib$(shell python3-config --extension-suffix)

# python specific options
python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS $(shell python3 -m pybind11 --includes)

PROGRAMS=sketch_test matrix_test read_test gpu_dist_test

SKETCH_OBJS=dist/dist.o dist/matrix_ops.o reference.o sketch/seqio.o sketch/sketch.o database/database.o sketch/countmin.o api.o dist/linear_regression.o random/rng.o random/random_match.o random/kmeans/KMeansRexCore.o random/kmeans/mersenneTwister2002.o
GPU_SKETCH_OBJS=gpu/gpu_api.o
CUDA_OBJS=gpu/dist.cu.o gpu/sketch.cu.o gpu/device_reads.cu.o gpu/gpu_countmin.cu.o gpu/device_memory.cu.o

# web specific options
web: CXX = em++
# optimised compile options
# NB turn exceptions back on for testing
# NB `--closure 1` can be used to reduce size of js file (this minifies variable names!)
web: CXXFLAGS = -O3 -s ASSERTIONS=1 \
-DNOEXCEPT \
-DJSON_NOEXCEPTION \
-s DISABLE_EXCEPTION_CATCHING=1 \
-fno-exceptions \
-flto --bind -s STRICT=1 \
-s ALLOW_MEMORY_GROWTH=1 \
-s USE_ZLIB=1 \
-s MODULARIZE=1 \
-s "EXPORTED_FUNCTIONS=['_malloc']" \
-s 'EXPORTED_RUNTIME_METHODS=["FS"]' \
-s EXPORT_NAME=WebSketch \
-Wall -Wextra -std=c++14
web: CPPFLAGS += -DWEB_SKETCH
web: LDFLAGS = -lnodefs.js -lworkerfs.js

WEB_OUT=web/web_sketch
WEB_OBJS=${WEB_OUT}.js ${WEB_OUT}.html ${WEB_OUT}.wasm

web: web/web_sketch.o sketch/seqio.o sketch/sketch.o sketch/countmin.o
$(LINK.cpp) $^ -o ${WEB_OUT}.js
sed -i.old '1s;^;\/* eslint-disable *\/;' ${WEB_OUT}.js

all: $(PROGRAMS)

clean:
$(RM) $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) $(WEB_OBJS) *.o *.so version.h ~* $(PROGRAMS)

install: all
install -d $(BINDIR)
install $(PROGRAMS) $(BINDIR)

sketch_test: $(SKETCH_OBJS) test/main.o
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)

matrix_test: $(SKETCH_OBJS) test/matrix_test.o
$(LINK.cpp) $^ -o $@ $(LDLIBS)

read_test: $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) test/read_test.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ device_link.o -o $@ $(CUDA_LDLIBS)

gpu_dist_test: $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) test/gpu_dist_test.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ device_link.o -o $@ $(CUDA_LDLIBS)

version.h:
cat sketch/*.cpp sketch/*.hpp gpu/sketch.cu | openssl sha1 | awk '{print "#define SKETCH_VERSION \"" $$2 "\""}' > version.h

database/database.o: version.h

web/web_sketch.o: version.h

python: $(PYTHON_LIB)

$(PYTHON_LIB): $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) sketchlib_bindings.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) -shared $^ device_link.o -o $(PYTHON_LIB) $(CUDA_LDLIBS)

install_python: python
install -d $(PYTHON_LIB_PATH)
install $(PYTHON_LIB) $(PYTHON_LIB_PATH)

gpu/dist.cu.o:
echo ${CUDAFLAGS}
echo ${CPPFLAGS}
echo ${CXXFLAGS}
echo ${CFLAGS}
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/dist.cu -o $@

gpu/sketch.cu.o:
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/sketch.cu -o $@

gpu/device_memory.cu.o:
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_memory.cu -o $@

gpu/device_reads.cu.o:
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_reads.cu -o $@

gpu/gpu_countmin.cu.o:
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/gpu_countmin.cu -o $@

.PHONY: all clean install python install_python web
54 changes: 39 additions & 15 deletions src/api.cpp
@@ -6,6 +6,7 @@

#include <algorithm>
#include <limits>
#include <queue>

#include <H5Cpp.h>
#include <omp.h>
@@ -314,6 +315,20 @@ void check_sparse_inputs(const std::vector<Reference> &ref_sketches,
}
}

// Struct that allows sorting by dist but also keeping index
struct SparseDist {
float dist;
long j;
};
bool operator<(SparseDist const &a, SparseDist const &b)
{
return a.dist < b.dist;
}
bool operator==(SparseDist const &a, SparseDist const &b)
{
return a.dist == b.dist;
}

sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
const std::vector<size_t> &kmer_lengths,
RandomMC &random_chance, const bool jaccard,
@@ -344,27 +359,35 @@ sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
Eigen::MatrixXf kmer_mat = kmer2mat(kmer_lengths);
#pragma omp parallel for schedule(static) num_threads(num_threads) shared(progress)
for (size_t i = 0; i < ref_sketches.size(); i++) {
std::vector<float> row_dists(ref_sketches.size());
// Use a priority queue to efficiently track the smallest N dists
std::priority_queue<SparseDist> min_dists;
if (!interrupt) {
for (size_t j = 0; j < ref_sketches.size(); j++) {
float row_dist = std::numeric_limits<float>::infinity();
if (i != j) {
if (jaccard) {
// Need 1-J here to sort correctly
row_dists[j] = 1.0f - ref_sketches[i].jaccard_dist(
row_dist = 1.0f - ref_sketches[i].jaccard_dist(
ref_sketches[j], kmer_lengths[dist_col], random_chance);
} else {
float core, acc;
std::tie(core, acc) =
ref_sketches[i].core_acc_dist<RandomMC>(
ref_sketches[j], kmer_mat, random_chance);
if (dist_col == 0) {
row_dists[j] = core;
row_dist = core;
} else {
row_dists[j] = acc;
row_dist = acc;
}
}
} else {
row_dists[j] = std::numeric_limits<float>::infinity();
}
// Add dist if it is in the smallest k
if (min_dists.size() < kNN || row_dist < min_dists.top().dist) {
SparseDist new_min = {row_dist, j};
min_dists.push(new_min);
if (min_dists.size() > kNN) {
min_dists.pop();
}
}
if ((i * ref_sketches.size() + j) % update_every == 0) {
#pragma omp critical
@@ -376,16 +399,17 @@ sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
}
}
}
long offset = i * kNN;
std::vector<long> ordered_dists = sort_indexes(row_dists);
std::fill_n(i_vec.begin() + offset, kNN, i);
// std::copy_n(ordered_dists.begin(), kNN, j_vec.begin() + offset);

for (int k = 0; k < kNN; ++k) {
j_vec[offset + k] = ordered_dists[k];
dists[offset + k] = row_dists[ordered_dists[k]];
}
}

// For each sample/row/i, fill the ijk vectors
// This goes 'backwards' for compatibility with numpy (so dists are ascending)
long offset = i * kNN;
std::fill_n(i_vec.begin() + offset, kNN, i);
for (int k = kNN - 1; k >= 0; --k) {
SparseDist entry = min_dists.top();
j_vec[offset + k] = entry.j;
dists[offset + k] = entry.dist;
min_dists.pop();
}
}
}
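The core change above replaces a full per-row sort with a bounded max-heap that keeps only the kNN smallest distances per row. Below is a minimal, self-contained sketch of that pattern, assuming random values stand in for the real Jaccard/core/accessory distances; the `SparseDist` struct, the size-capped `std::priority_queue`, and the backwards fill mirror the diff, while the sample count, RNG, and printing are illustrative only.

```cpp
// Sketch of the bounded max-heap kNN selection used in query_db_sparse.
// Assumption: distances are random here; the real code computes sketch distances.
#include <cstddef>
#include <iostream>
#include <limits>
#include <queue>
#include <random>
#include <vector>

struct SparseDist {
  float dist;
  long j;
};

// Largest dist sits at the top of the heap, so it can be popped when a smaller one arrives
bool operator<(SparseDist const &a, SparseDist const &b) {
  return a.dist < b.dist;
}

int main() {
  const size_t n_samples = 10;
  const size_t kNN = 3;
  std::mt19937 gen(42);
  std::uniform_real_distribution<float> runif(0.0f, 1.0f);

  std::vector<long> j_vec(n_samples * kNN);
  std::vector<float> dists(n_samples * kNN);

  for (size_t i = 0; i < n_samples; ++i) {
    std::priority_queue<SparseDist> min_dists;
    for (size_t j = 0; j < n_samples; ++j) {
      // Self-distances are skipped by setting them to infinity, as in the diff
      float row_dist = (i == j) ? std::numeric_limits<float>::infinity()
                                : runif(gen);
      // Keep the heap at size kNN: push only if smaller than the current worst
      if (min_dists.size() < kNN || row_dist < min_dists.top().dist) {
        min_dists.push({row_dist, static_cast<long>(j)});
        if (min_dists.size() > kNN) {
          min_dists.pop();
        }
      }
    }
    // Pop from worst to best, filling backwards so each row ends up ascending
    long offset = i * kNN;
    for (int k = kNN - 1; k >= 0; --k) {
      j_vec[offset + k] = min_dists.top().j;
      dists[offset + k] = min_dists.top().dist;
      min_dists.pop();
    }
  }

  // Print the three nearest neighbours of sample 0
  for (size_t k = 0; k < kNN; ++k) {
    std::cout << "0 -> " << j_vec[k] << " : " << dists[k] << "\n";
  }
  return 0;
}
```

With kNN much smaller than the number of samples, each row costs roughly O(n log kNN) rather than O(n log n), and a full row of distances never needs to be materialised.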
6 changes: 2 additions & 4 deletions src/database/database.cpp
@@ -12,8 +12,6 @@
#include "hdf5_funcs.hpp"
#include "random/random_match.hpp"

#include "robin_hood.h"

// const int deflate_level = 9;

// Helper function prototypes
@@ -200,9 +198,9 @@ RandomMC Database::load_random(const bool use_rc_default) {
HighFive::Group random_group = _h5_file.getGroup("/random");

// Flattened hashes
robin_hood::unordered_node_map<std::string, uint16_t> cluster_table =
ankerl::unordered_dense::map<std::string, uint16_t> cluster_table =
load_hash<std::string, uint16_t>(random_group, "table");
robin_hood::unordered_node_map<size_t, NumpyMatrix> matches =
ankerl::unordered_dense::map<size_t, NumpyMatrix> matches =
load_hash<size_t, NumpyMatrix>(random_group, "matches");

// Centroid matrix
2 changes: 1 addition & 1 deletion src/database/database.hpp
@@ -12,7 +12,7 @@
#include <cstring>
#include <vector>
#include <string>
#include "robin_hood.h"
#include "unordered_dense.hpp"

#include <highfive/H5File.hpp>

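The database code swaps `robin_hood::unordered_node_map` for `ankerl::unordered_dense::map` (the MIT licence added above covers it). A minimal usage sketch follows, assuming the vendored single header is reachable as "unordered_dense.hpp" as included in database.hpp (upstream distributes it as ankerl/unordered_dense.h); the sample keys and values are illustrative only.

```cpp
// Sketch of ankerl::unordered_dense::map as a drop-in hash map.
// Assumption: the vendored header name matches the include in database.hpp.
#include <cstdint>
#include <iostream>
#include <string>

#include "unordered_dense.hpp"

int main() {
  // Same shape as the cluster table loaded from HDF5 in Database::load_random
  ankerl::unordered_dense::map<std::string, uint16_t> cluster_table;
  cluster_table["sample_1"] = 0;
  cluster_table["sample_2"] = 1;

  auto it = cluster_table.find("sample_2");
  if (it != cluster_table.end()) {
    std::cout << it->first << " -> " << it->second << "\n";
  }
  return 0;
}
```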
