Merge pull request #20 from prophyle/optimizations

Optimizations
prophyle · Sep 27, 2024 · f7b520b · f7b520b
2 parents 2523914 + d7b48f2
commit f7b520b
Show file tree

Hide file tree

Showing 18 changed files with 1,287 additions and 191 deletions.
diff --git a/Makefile b/Makefile
@@ -1,9 +1,10 @@
 .PHONY: all clean test cpptest verify quick-verify
 
 CXX=         g++
-CXXFLAGS=    -g -Wall -Wno-unused-function -std=c++17 -O2
+CXXFLAGS=    -g -Wall -Wno-unused-function -std=c++17 -O3
 LDFLAGS=     -lz -lpthread
 SRC=         src
+UINT256=     $(SRC)/uint256_t
 SCRIPTS=     scripts
 DATA=        data
 TESTS=       tests
@@ -24,13 +25,13 @@ verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa
 quick-verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa
 	python $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa --quick --interpath $(DATA)/spyogenes.fa
 
-$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
+$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h $(wildcard $(UINT256)/*.cpp $(UINT256)/*.h $(UINT256)/*.include)
 	./create-version.sh
 	$(CXX) $(CXXFLAGS) $(SRC)/main.cpp $(SRC)/kthread.c -o $@ $(LDFLAGS)
 
 
 prophasmtest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
-	$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp  gtest-all.o -pthread -o $@ $(LDFLAGS)
 
 gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp)
 	$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@

diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ ProphAsm2 is a versatile tool for computing simplitigs/SPSS
 from *k-mer sets* and for *k-mer set operations*.
 The new features compared to the original [ProphAsm](https://github.com/prophyle/prophasm)
 include a largely speed and memory optimization, parallelization,
-support for k-mer sizes up to 64 and support for minimum abundances.
+support for k-mer sizes up to 128 and support for minimum abundances.
 
 Various types of sequencing datasets can be used as the input for
 ProphAsm, including genomes, pan-genomes, metagenomes or sequencing reads.
@@ -74,8 +74,19 @@ Set operations:
 USAGE-BEGIN
 -->
 ```
-
-
+Usage:    prophasm2 [options]
+Command-line parameters:
+ -k INT   K-mer size.
+ -i FILE  Input FASTA file (can be used multiple times).
+ -o FILE  Output FASTA file (if used, must be used as many times as -i).
+ -x FILE  Compute intersection, subtract it, save it.
+ -s FILE  Output file with k-mer statistics.
+ -t INT   Number of threads (default 1).
+ -m INT   Minimum abundance of k-mers to appear in the assembly (default 1).
+ -S       Silent mode.
+ -u       Do not consider k-mer and its reverse complement as equivalent.
+
+Note that '-' can be used for standard input/output. 
 ```
 <!---
 USAGE-END

diff --git a/scripts/verify.py b/scripts/verify.py
@@ -102,14 +102,14 @@ def main():
         print("Testing ProphAsm2 outputs valid intersection on files " + args.path + " and " + args.interpath)
         for complements in [True]:
             for m in range(1, 3):
-                for k in range(2, 65, 23 if args.quick else 1):
+                for k in range(2, 129, 23 if args.quick else 1):
                     success &= verify_intersection(args.path, args.interpath, k, complements, m)
                 print("")
 
     print("Testing ProphAsm2 outputs valid simplitigs on file " + args.path)
     for complements in [True, False]:
         for m in range(1, 4):
-            for k in range(2, 65, 11 if args.quick else 1):
+            for k in range(2, 129, 11 if args.quick else 1):
                 success &= verify_instance(args.path, k, complements, m)
             print("")
 

diff --git a/src/khash.h b/src/khash.h
@@ -159,7 +159,7 @@ typedef unsigned long long khint64_t;
 #endif
 #endif /* klib_unused */
 
-typedef khint32_t khint_t;
+typedef khint64_t khint_t;
 typedef khint_t khiter_t;
 
 #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
@@ -176,6 +176,10 @@ typedef khint_t khiter_t;
 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
 #endif
 
+#ifndef kroundup64
+#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
+#endif
+
 #ifndef kcalloc
 #define kcalloc(N,Z) calloc(N,Z)
 #endif
@@ -246,7 +250,7 @@ static const double __ac_HASH_UPPER = 0.77;
 		khint32_t *new_flags = 0;										\
 		khint_t j = 1;													\
 		{																\
-			kroundup32(new_n_buckets); 									\
+			kroundup64(new_n_buckets); 									\
 			if (new_n_buckets < 4) new_n_buckets = 4;					\
 			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
 			else { /* hash table size to be changed (shrink or expand); rehash */ \
@@ -382,7 +386,8 @@ static const double __ac_HASH_UPPER = 0.77;
   @param  key   The integer [khint64_t]
   @return       The hash value [khint_t]
  */
-#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+//#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+#define kh_int64_hash_func(key) (khint64_t)(__ac_Wang_hash(key))
 /*! @function
   @abstract     64-bit integer comparison function
  */

diff --git a/src/khash_utils.h b/src/khash_utils.h
@@ -6,21 +6,22 @@
 #include "kmers.h"
 #include "khash.h"
 
+typedef unsigned char byte;
 
 #define kh_int128_hash_func(key) kh_int64_hash_func((khint64_t)((key)>>65^(key)^(key)<<21))
 #define kh_int128_hash_equal(a, b) ((a) == (b))
+#define kh_int256_hash_func(key) kh_int128_hash_func((__uint128_t)((key)>>129^(key)^(key)<<35))
+#define kh_int256_hash_equal(a, b) ((a) == (b))
 
 #define KHASH_MAP_INIT_INT128(name, khval_t)								\
 	KHASH_INIT(name, __uint128_t, khval_t, 1, kh_int128_hash_func, kh_int128_hash_equal)
 
-#define KHASH_SET_INIT_INT128(name)										\
-	KHASH_INIT(name, __uint128_t, char, 0, kh_int128_hash_func, kh_int128_hash_equal)
-
-typedef unsigned char byte;
+#define KHASH_MAP_INIT_INT256(name, khval_t)								\
+	KHASH_INIT(name, uint256_t, khval_t, 1, kh_int256_hash_func, kh_int128_hash_equal)
 
+KHASH_MAP_INIT_INT256(S256M, byte)
 KHASH_MAP_INIT_INT128(S128M, byte)
 KHASH_MAP_INIT_INT64(S64M, byte)
-KHASH_SET_INIT_INT128(S128S)
 KHASH_SET_INIT_INT64(S64S)
 
 byte MINIMUM_ABUNDANCE = 1;
@@ -40,26 +41,35 @@ struct DifferenceInPlaceData##variant {
 };                                                                                                                  \
                                                                                                                     \
 /* Determine whether the canonical k-mer is present.*/                                                              \
-bool containsKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) {                         \
-    if (complements) kMer = CanonicalKMer(kMer, k);                                                                 \
+inline bool containsCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer) {                                  \
     bool contains_key = kh_get_S##variant(kMers, kMer) != kh_end(kMers);                                            \
     if (MINIMUM_ABUNDANCE == 1) return contains_key;                                                                \
     if (!contains_key) return false;                                                                                \
     return kh_val(kMers, kh_get_S##variant(kMers, kMer)) >= MINIMUM_ABUNDANCE;                                      \
 }                                                                                                                   \
                                                                                                                     \
-/* Remove the canonical k-mer from the set.*/                                                                       \
-void eraseKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) {                            \
+/* Determine whether the canonical form of a k-mer is present.*/                                                    \
+inline bool containsKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) {                  \
     if (complements) kMer = CanonicalKMer(kMer, k);                                                                 \
+    return containsCanonicalKMer(kMers, kMer);                                                                      \
+}                                                                                                                   \
+                                                                                                                    \
+/* Remove the canonical k-mer from the set.*/                                                                       \
+inline void eraseCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer) {                                     \
     auto key = kh_get_S##variant(kMers, kMer);                                                                      \
     if (key != kh_end(kMers)) {                                                                                     \
         kh_del_S##variant(kMers, key);                                                                              \
     }                                                                                                               \
 }                                                                                                                   \
                                                                                                                     \
-/* Insert the canonical k-mer into the set. */                                                                      \
-void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements, bool force = false) {       \
+/* Remove the canonical form of a k-mer from the set.*/                                                             \
+inline void eraseKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) {                     \
     if (complements) kMer = CanonicalKMer(kMer, k);                                                                 \
+    eraseCanonicalKMer(kMers, kMer);                                                                                \
+}                                                                                                                   \
+                                                                                                                    \
+/* Insert the canonical k-mer into the set. */                                                                      \
+inline void insertCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, bool force=false) {                  \
     int ret;                                                                                                        \
     if (MINIMUM_ABUNDANCE == (byte)1) {                                                                             \
         kh_put_S##variant(kMers, kMer, &ret);                                                                       \
@@ -76,6 +86,12 @@ void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool compl
     }                                                                                                               \
 }                                                                                                                   \
                                                                                                                     \
+/* Insert the canonical k-mer into the set. */                                                                      \
+inline void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements, bool force=false) {  \
+    if (complements) kMer = CanonicalKMer(kMer, k);                                                                 \
+    insertCanonicalKMer(kMers, kMer, force);                                                                        \
+}                                                                                                                   \
+                                                                                                                    \
 /* Parallel wrapper for differenceInPlace. */                                                                       \
 void DifferenceInPlaceThread##variant(void *arg, long i, int _) {                                                   \
     auto data = (DifferenceInPlaceData##variant*)arg;                                                               \
@@ -84,12 +100,12 @@ void DifferenceInPlaceThread##variant(void *arg, long i, int _) {
 
 INIT_KHASH_UTILS(64, 64S)
 INIT_KHASH_UTILS(64, 64M)
-INIT_KHASH_UTILS(128, 128S)
 INIT_KHASH_UTILS(128, 128M)
+INIT_KHASH_UTILS(256, 256M)
 
 /// Return the next k-mer in the k-mer set and update the index.
 template <typename KHT, typename kmer_t>
-kmer_t nextKMer(KHT *kMers, size_t &lastIndex, kmer_t &kMer) {
+inline kmer_t nextKMer(KHT *kMers, size_t &lastIndex, kmer_t &kMer) {
     for (size_t i = kh_begin(kMers) + lastIndex; i != kh_end(kMers); ++i, ++lastIndex) {
         if (!kh_exist(kMers, i)) continue;
         kMer = kh_key(kMers, i);
@@ -116,7 +132,7 @@ std::vector<kmer64_t> kMersToVec(kh_S64M_t *kMers) {
 
 /// Compute the intersection of several k-mer sets.
 template <typename KHT>
-KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets, int k, bool complements) {
+KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets) {
     if (kMerSets.size() < 2) return result;
     KHT* smallestSet = kMerSets[0];
     for (size_t i = 1; i < kMerSets.size(); ++i) {
@@ -125,16 +141,16 @@ KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets, int k, bool compl
     for (auto i = kh_begin(smallestSet); i != kh_end(smallestSet); ++i) {
         if (!kh_exist(smallestSet, i)) continue;
         auto kMer = kh_key(smallestSet, i);
-        if (!containsKMer(smallestSet, kMer, k, complements)) continue;
+        if (!containsCanonicalKMer(smallestSet, kMer)) continue;
         bool everywhere = true;
         for (size_t i = 0; i < kMerSets.size(); ++i) if (kMerSets[i] != smallestSet) {
-            if (!containsKMer(kMerSets[i], kMer, k, complements)) {
+            if (!containsCanonicalKMer(kMerSets[i], kMer)) {
                 everywhere = false;
                 break;
             }
         }
         if (everywhere) {
-            insertKMer(result, kMer, k, complements, true);
+            insertCanonicalKMer(result, kMer, true);
         }
     }
     return result;

diff --git a/src/kmers.h b/src/kmers.h
@@ -4,12 +4,15 @@
 #include <iostream>
 #include <cstdint>
 
+#include "uint256_t/uint256_t.h"
+
 typedef uint64_t kmer64_t;
 typedef __uint128_t kmer128_t;
+typedef uint256_t kmer256_t;
 
 /// Convert the given basic nucleotide to int so it can be used for indexing in AC.
 /// If non-existing nucleotide is given, return -1.
-int NucleotideToInt (char c) {
+inline int NucleotideToInt (char c) {
     switch (c) {
         case 'A': return 0;
         case 'C': return 1;
@@ -25,13 +28,13 @@ int NucleotideToInt (char c) {
 
 /// Compute the prefix of size d of the given k-mer.
 template <typename kmer_t>
-kmer_t BitPrefix(kmer_t kMer, int k, int d) {
+inline kmer_t BitPrefix(kmer_t kMer, int k, int d) {
     return kMer >> ((k - d) << kmer_t(1));
 }
 
 /// Compute the suffix of size d of the given k-mer.
 template <typename kmer_t>
-kmer_t BitSuffix(kmer_t kMer, int d) {
+inline kmer_t BitSuffix(kmer_t kMer, int d) {
     return kMer & ((kmer_t(1) << (d << kmer_t(1))) - kmer_t(1));
 }
 
@@ -71,9 +74,17 @@ inline kmer128_t word_reverse_complement(kmer128_t w) {
     w = ( w >> 64                   ) | ( w                    << 64);
     return ((U)-1) - w;
 }
+/// Compute the reverse complement of a word.
+/// Copyright: Jellyfish GPL-3.0
+inline kmer256_t word_reverse_complement(kmer256_t w) {
+    kmer128_t low = word_reverse_complement(w.lower());
+    kmer128_t high = word_reverse_complement(w.upper());
+    return kmer256_t(low, high);
+}
 
 constexpr int KMER_SIZE_64 = 64;
 constexpr int KMER_SIZE_128 = 128;
+constexpr int KMER_SIZE_256 = 256;
 #define INIT_KMERS(type)                                                                                                              \
                                                                                                                                       \
 /* Get the mask to mask k-mers. */                                                                                                    \
@@ -89,6 +100,7 @@ inline kmer##type##_t ReverseComplement(kmer##type##_t kMer, int k) {
 
 INIT_KMERS(64)
 INIT_KMERS(128)
+INIT_KMERS(256)
 
 /// Return the lexicographically smaller of the k-mer and its reverse complement.
 template <typename kmer_t>
@@ -102,7 +114,7 @@ const char letters[4] {'A', 'C', 'G', 'T'};
 /// Return the index-th nucleotide from the encoded k-mer.
 template <typename kmer_t>
 inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
-    return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
+    return letters[(uint32_t)(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
 }
 
 /// Convert the encoded KMer representation to string.
@@ -111,7 +123,7 @@ std::string NumberToKMer(kmer_t encoded, int length) {
     std::string ret(length, 'N');
     for (int i = 0; i < length; ++i) {
         // The last two bits correspond to one nucleotide.
-        ret[length - i -1] = letters[encoded & 3];
+        ret[length - i -1] = letters[(uint32_t)(encoded & 3)];
         // Move to the next letter.
         encoded >>= 2;
     }