Improve efficiency of sparse queries (#94)
* Fix for gcc-13

* Add personalised makefile

* Replace sort with priority queue

* Missing semicolon

* Type fixes for SparseDist

* Add debug flag to makefile

* Debug mode for Makefiles

* Revert "Debug mode for Makefiles"

This reverts commit 266742b.

* Set debug another way

* Allow debug to be turned off

* Put the sort in the right place (after all dists done)

* Replace hash map

* Use C++17

* Correct header name

* Typo in map header name

* Replace class with struct

* Try to fix order in sparse loop

Definitely wrong! Was confused by brace levels

* Change order to match with tests

* Pass C++17 onto nvcc

* Correct c++17 definition for CUDA

* Add license for hashmap

* Rename local build env var
johnlees authored Mar 22, 2024
1 parent d267077 commit adf033f
Showing 20 changed files with 2,279 additions and 2,134 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.16)
project(pp_sketchlib)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17)

if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")
cmake_policy(SET CMP0104 OLD) # Can't get CUDA_ARCHITECTURES to work with NEW
22 changes: 22 additions & 0 deletions LICENSE_unordered_dense
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2022 Martin Leitner-Ankerl

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1 change: 1 addition & 0 deletions README.md
@@ -334,6 +334,7 @@ You can set an environment variable `SKETCHLIB_INSTALL` to affect `python setup.
- Empty: uses cmake
- `conda`: sets library location to the conda environment, and uses `src/Makefile` (used to be used in conda-forge recipe)
- `azure`: Uses `src/Makefile`
- `local`: Uses `src/Makefile_fedora38`

### cmake
Now requires v3.19. If nvcc version is 11.0 or higher, sm8.6 with device link time optimisation will be used.
7 changes: 7 additions & 0 deletions setup.py
@@ -78,6 +78,7 @@ def build_extension(self, ext):
env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
self.distribution.get_version())


if not os.path.exists(self.build_temp):
os.makedirs(self.build_temp)

@@ -88,6 +89,12 @@ def build_extension(self, ext):
elif target == 'azure':
subprocess.check_call(['make', 'python'], cwd=ext.sourcedir + '/src', env=env)
subprocess.check_call(['make', 'install_python', 'PYTHON_LIB_PATH=' + extdir], cwd=ext.sourcedir + '/src', env=env)
elif target == 'local':
debug = "DEBUG="
if cfg == 'Debug':
debug = "DEBUG=1"
subprocess.check_call(['make', '-f', 'Makefile_fedora38', 'python', debug], cwd=ext.sourcedir + '/src', env=env)
subprocess.check_call(['make', '-f', 'Makefile_fedora38', 'install_python', 'PYTHON_LIB_PATH=' + extdir, debug], cwd=ext.sourcedir + '/src', env=env)
else:
subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
5 changes: 3 additions & 2 deletions src/Makefile
@@ -1,4 +1,4 @@
CXXFLAGS+=-Wall -Wextra -std=c++14 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
CXXFLAGS+=-Wall -Wextra -std=c++17 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
ifdef DEBUG
CXXFLAGS+= -O0 -g
CUDAFLAGS = -g -G
@@ -7,6 +7,7 @@ else ifdef PROFILE
CUDAFLAGS = -O2 -pg -lineinfo
else
CXXFLAGS+= -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS = -O3
endif

UNAME_S := $(shell uname -s)
@@ -29,7 +30,7 @@ LDFLAGS+= -L$(LIBLOC)/lib
CUDA_LDLIBS=-lcudadevrt -lcudart_static $(LDLIBS)

CUDA_LDFLAGS =-L$(LIBLOC)/lib -L${CUDA_HOME}/targets/x86_64-linux/lib/stubs -L${CUDA_HOME}/targets/x86_64-linux/lib
CUDAFLAGS +=-Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
CUDAFLAGS +=-std=c++17 -Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
ifdef GPU
CXXFLAGS += -DGPU_AVAILABLE
CUDAFLAGS += -gencode arch=compute_86,code=sm_86
142 changes: 142 additions & 0 deletions src/Makefile_fedora38
@@ -0,0 +1,142 @@
CXX=gcc-11
CC=gcc-11
CFLAGS+=-Wall -Wextra -fPIC
CXXFLAGS+=-Wall -Wextra -std=c++17 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
ifdef DEBUG
CXXFLAGS+= -O0 -g
CUDAFLAGS = -g -G
else ifdef PROFILE
CXXFLAGS+= -O2 -g -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS = -O2 -pg -lineinfo
else
CXXFLAGS+= -march=native -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CFLAGS+= -march=native -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS+= -O3
endif

UNAME_S := $(shell uname -s)
LIBLOC = ${CONDA_PREFIX}
LDLIBS = -lz -lhdf5_cpp -lhdf5 -lopenblas -lgomp
ifeq ($(UNAME_S),Linux)
CXXFLAGS+= -m64
ifdef PROFILE
CXXFLAGS+= -Wl,--compress-debug-sections=none
endif
LDLIBS+= -lpthread -lgfortran -lm -ldl -lrt
LDFLAGS=-Wl,-as-needed
endif
ifeq ($(UNAME_S),Darwin)
LDLIBS+= -pthread
endif

CPPFLAGS+=-I"/home/linuxbrew/.linuxbrew/include" -I"." -I"../vendor/highfive/include" -I$(LIBLOC)/include -I$(LIBLOC)/include/eigen3
LDFLAGS+= -L$(LIBLOC)/lib -L"/home/linuxbrew/.linuxbrew/lib" -L/usr/local/cuda-12.3/lib64
CUDA_LDLIBS=-lcudadevrt -lcudart_static $(LDLIBS)

CUDA_LDFLAGS =-L$(LIBLOC)/lib -L${CUDA_HOME}/targets/x86_64-linux/lib/stubs -L${CUDA_HOME}/targets/x86_64-linux/lib
CUDAFLAGS +=-ccbin /home/linuxbrew/.linuxbrew/bin/g++-11 -std=c++17 -Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
ifdef GPU
CXXFLAGS += -DGPU_AVAILABLE
CUDAFLAGS += -gencode arch=compute_86,code=sm_86
CUDA_LDFLAGS += -L/usr/local/cuda-12.3/lib64
endif

PYTHON_LIB = pp_sketchlib$(shell python3-config --extension-suffix)

# python specific options
python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS $(shell python3 -m pybind11 --includes)

PROGRAMS=sketch_test matrix_test read_test gpu_dist_test

SKETCH_OBJS=dist/dist.o dist/matrix_ops.o reference.o sketch/seqio.o sketch/sketch.o database/database.o sketch/countmin.o api.o dist/linear_regression.o random/rng.o random/random_match.o random/kmeans/KMeansRexCore.o random/kmeans/mersenneTwister2002.o
GPU_SKETCH_OBJS=gpu/gpu_api.o
CUDA_OBJS=gpu/dist.cu.o gpu/sketch.cu.o gpu/device_reads.cu.o gpu/gpu_countmin.cu.o gpu/device_memory.cu.o

# web specific options
web: CXX = em++
# optimised compile options
# NB turn exceptions back on for testing
# NB `--closure 1` can be used to reduce size of js file (this minifies variable names!)
web: CXXFLAGS = -O3 -s ASSERTIONS=1 \
-DNOEXCEPT \
-DJSON_NOEXCEPTION \
-s DISABLE_EXCEPTION_CATCHING=1 \
-fno-exceptions \
-flto --bind -s STRICT=1 \
-s ALLOW_MEMORY_GROWTH=1 \
-s USE_ZLIB=1 \
-s MODULARIZE=1 \
-s "EXPORTED_FUNCTIONS=['_malloc']" \
-s 'EXPORTED_RUNTIME_METHODS=["FS"]' \
-s EXPORT_NAME=WebSketch \
-Wall -Wextra -std=c++14
web: CPPFLAGS += -DWEB_SKETCH
web: LDFLAGS = -lnodefs.js -lworkerfs.js

WEB_OUT=web/web_sketch
WEB_OBJS=${WEB_OUT}.js ${WEB_OUT}.html ${WEB_OUT}.wasm

web: web/web_sketch.o sketch/seqio.o sketch/sketch.o sketch/countmin.o
$(LINK.cpp) $^ -o ${WEB_OUT}.js
sed -i.old '1s;^;\/* eslint-disable *\/;' ${WEB_OUT}.js

all: $(PROGRAMS)

clean:
$(RM) $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) $(WEB_OBJS) *.o *.so version.h ~* $(PROGRAMS)

install: all
install -d $(BINDIR)
install $(PROGRAMS) $(BINDIR)

sketch_test: $(SKETCH_OBJS) test/main.o
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)

matrix_test: $(SKETCH_OBJS) test/matrix_test.o
$(LINK.cpp) $^ -o $@ $(LDLIBS)

read_test: $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) test/read_test.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ device_link.o -o $@ $(CUDA_LDLIBS)

gpu_dist_test: $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) test/gpu_dist_test.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ device_link.o -o $@ $(CUDA_LDLIBS)

version.h:
cat sketch/*.cpp sketch/*.hpp gpu/sketch.cu | openssl sha1 | awk '{print "#define SKETCH_VERSION \"" $$2 "\""}' > version.h

database/database.o: version.h

web/web_sketch.o: version.h

python: $(PYTHON_LIB)

$(PYTHON_LIB): $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) sketchlib_bindings.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) -shared $^ device_link.o -o $(PYTHON_LIB) $(CUDA_LDLIBS)

install_python: python
install -d $(PYTHON_LIB_PATH)
install $(PYTHON_LIB) $(PYTHON_LIB_PATH)

gpu/dist.cu.o:
echo ${CUDAFLAGS}
echo ${CPPFLAGS}
echo ${CXXFLAGS}
echo ${CFLAGS}
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/dist.cu -o $@

gpu/sketch.cu.o:
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/sketch.cu -o $@

gpu/device_memory.cu.o:
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_memory.cu -o $@

gpu/device_reads.cu.o:
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_reads.cu -o $@

gpu/gpu_countmin.cu.o:
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/gpu_countmin.cu -o $@

.PHONY: all clean install python install_python web
54 changes: 39 additions & 15 deletions src/api.cpp
@@ -6,6 +6,7 @@

#include <algorithm>
#include <limits>
#include <queue>

#include <H5Cpp.h>
#include <omp.h>
@@ -314,6 +315,20 @@ void check_sparse_inputs(const std::vector<Reference> &ref_sketches,
}
}

// Struct that allows sorting by dist but also keeping index
struct SparseDist {
float dist;
long j;
};
bool operator<(SparseDist const &a, SparseDist const &b)
{
return a.dist < b.dist;
}
bool operator==(SparseDist const &a, SparseDist const &b)
{
return a.dist == b.dist;
}

sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
const std::vector<size_t> &kmer_lengths,
RandomMC &random_chance, const bool jaccard,
@@ -344,27 +359,35 @@ sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
Eigen::MatrixXf kmer_mat = kmer2mat(kmer_lengths);
#pragma omp parallel for schedule(static) num_threads(num_threads) shared(progress)
for (size_t i = 0; i < ref_sketches.size(); i++) {
std::vector<float> row_dists(ref_sketches.size());
// Use a priority queue to efficiently track the smallest N dists
std::priority_queue<SparseDist> min_dists;
if (!interrupt) {
for (size_t j = 0; j < ref_sketches.size(); j++) {
float row_dist = std::numeric_limits<float>::infinity();
if (i != j) {
if (jaccard) {
// Need 1-J here to sort correctly
row_dists[j] = 1.0f - ref_sketches[i].jaccard_dist(
row_dist = 1.0f - ref_sketches[i].jaccard_dist(
ref_sketches[j], kmer_lengths[dist_col], random_chance);
} else {
float core, acc;
std::tie(core, acc) =
ref_sketches[i].core_acc_dist<RandomMC>(
ref_sketches[j], kmer_mat, random_chance);
if (dist_col == 0) {
row_dists[j] = core;
row_dist = core;
} else {
row_dists[j] = acc;
row_dist = acc;
}
}
} else {
row_dists[j] = std::numeric_limits<float>::infinity();
}
// Add dist if it is in the smallest k
if (min_dists.size() < kNN || row_dist < min_dists.top().dist) {
SparseDist new_min = {row_dist, j};
min_dists.push(new_min);
if (min_dists.size() > kNN) {
min_dists.pop();
}
}
if ((i * ref_sketches.size() + j) % update_every == 0) {
#pragma omp critical
@@ -376,16 +399,17 @@ sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
}
}
}
long offset = i * kNN;
std::vector<long> ordered_dists = sort_indexes(row_dists);
std::fill_n(i_vec.begin() + offset, kNN, i);
// std::copy_n(ordered_dists.begin(), kNN, j_vec.begin() + offset);

for (int k = 0; k < kNN; ++k) {
j_vec[offset + k] = ordered_dists[k];
dists[offset + k] = row_dists[ordered_dists[k]];
}
}

// For each sample/row/i, fill the ijk vectors
// This goes 'backwards' for compatibility with numpy (so dists are ascending)
long offset = i * kNN;
std::fill_n(i_vec.begin() + offset, kNN, i);
for (int k = kNN - 1; k >= 0; --k) {
SparseDist entry = min_dists.top();
j_vec[offset + k] = entry.j;
dists[offset + k] = entry.dist;
min_dists.pop();
}
}
}
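The core change above replaces a full per-row sort with a bounded max-heap that keeps only the kNN smallest distances per row. Below is a minimal, self-contained sketch of that pattern, assuming random values stand in for the real Jaccard/core/accessory distances; the `SparseDist` struct, the size-capped `std::priority_queue`, and the backwards fill mirror the diff, while the sample count, RNG, and printing are illustrative only.

```cpp
// Sketch of the bounded max-heap kNN selection used in query_db_sparse.
// Assumption: distances are random here; the real code computes sketch distances.
#include <cstddef>
#include <iostream>
#include <limits>
#include <queue>
#include <random>
#include <vector>

struct SparseDist {
  float dist;
  long j;
};

// Largest dist sits at the top of the heap, so it can be popped when a smaller one arrives
bool operator<(SparseDist const &a, SparseDist const &b) {
  return a.dist < b.dist;
}

int main() {
  const size_t n_samples = 10;
  const size_t kNN = 3;
  std::mt19937 gen(42);
  std::uniform_real_distribution<float> runif(0.0f, 1.0f);

  std::vector<long> j_vec(n_samples * kNN);
  std::vector<float> dists(n_samples * kNN);

  for (size_t i = 0; i < n_samples; ++i) {
    std::priority_queue<SparseDist> min_dists;
    for (size_t j = 0; j < n_samples; ++j) {
      // Self-distances are skipped by setting them to infinity, as in the diff
      float row_dist = (i == j) ? std::numeric_limits<float>::infinity()
                                : runif(gen);
      // Keep the heap at size kNN: push only if smaller than the current worst
      if (min_dists.size() < kNN || row_dist < min_dists.top().dist) {
        min_dists.push({row_dist, static_cast<long>(j)});
        if (min_dists.size() > kNN) {
          min_dists.pop();
        }
      }
    }
    // Pop from worst to best, filling backwards so each row ends up ascending
    long offset = i * kNN;
    for (int k = kNN - 1; k >= 0; --k) {
      j_vec[offset + k] = min_dists.top().j;
      dists[offset + k] = min_dists.top().dist;
      min_dists.pop();
    }
  }

  // Print the three nearest neighbours of sample 0
  for (size_t k = 0; k < kNN; ++k) {
    std::cout << "0 -> " << j_vec[k] << " : " << dists[k] << "\n";
  }
  return 0;
}
```

With kNN much smaller than the number of samples, each row costs roughly O(n log kNN) rather than O(n log n), and a full row of distances never needs to be materialised.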
6 changes: 2 additions & 4 deletions src/database/database.cpp
@@ -12,8 +12,6 @@
#include "hdf5_funcs.hpp"
#include "random/random_match.hpp"

#include "robin_hood.h"

// const int deflate_level = 9;

// Helper function prototypes
@@ -200,9 +198,9 @@ RandomMC Database::load_random(const bool use_rc_default) {
HighFive::Group random_group = _h5_file.getGroup("/random");

// Flattened hashes
robin_hood::unordered_node_map<std::string, uint16_t> cluster_table =
ankerl::unordered_dense::map<std::string, uint16_t> cluster_table =
load_hash<std::string, uint16_t>(random_group, "table");
robin_hood::unordered_node_map<size_t, NumpyMatrix> matches =
ankerl::unordered_dense::map<size_t, NumpyMatrix> matches =
load_hash<size_t, NumpyMatrix>(random_group, "matches");

// Centroid matrix
2 changes: 1 addition & 1 deletion src/database/database.hpp
@@ -12,7 +12,7 @@
#include <cstring>
#include <vector>
#include <string>
#include "robin_hood.h"
#include "unordered_dense.hpp"

#include <highfive/H5File.hpp>

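The database code swaps `robin_hood::unordered_node_map` for `ankerl::unordered_dense::map` (the MIT licence added above covers it). A minimal usage sketch follows, assuming the vendored single header is reachable as "unordered_dense.hpp" as included in database.hpp (upstream distributes it as ankerl/unordered_dense.h); the sample keys and values are illustrative only.

```cpp
// Sketch of ankerl::unordered_dense::map as a drop-in hash map.
// Assumption: the vendored header name matches the include in database.hpp.
#include <cstdint>
#include <iostream>
#include <string>

#include "unordered_dense.hpp"

int main() {
  // Same shape as the cluster table loaded from HDF5 in Database::load_random
  ankerl::unordered_dense::map<std::string, uint16_t> cluster_table;
  cluster_table["sample_1"] = 0;
  cluster_table["sample_2"] = 1;

  auto it = cluster_table.find("sample_2");
  if (it != cluster_table.end()) {
    std::cout << it->first << " -> " << it->second << "\n";
  }
  return 0;
}
```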
