Skip to content

Commit

Permalink
Added support for k-mers up to 128.
Browse files Browse the repository at this point in the history
  • Loading branch information
OndrejSladky committed Mar 21, 2024
1 parent bcadb86 commit f35e399
Show file tree
Hide file tree
Showing 11 changed files with 1,486 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ quick-verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa

$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
./create-version.sh
$(CXX) $(CXXFLAGS) $(SRC)/main.cpp $(SRC)/kthread.c -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $(SRC)/main.cpp $(SRC)/uint256_t.cpp $(SRC)/kthread.c -o $@ $(LDFLAGS)


prophasmtest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
Expand Down
15 changes: 13 additions & 2 deletions src/khash_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,29 @@
#include "kmers.h"
#include "khash.h"

typedef unsigned char byte;

#define kh_int128_hash_func(key) kh_int64_hash_func((khint64_t)((key)>>65^(key)^(key)<<21))
#define kh_int128_hash_equal(a, b) ((a) == (b))
#define kh_int256_hash_func(key) kh_int128_hash_func((__uint128_t)((key)>>129^(key)^(key)<<35))
#define kh_int256_hash_equal(a, b) ((a) == (b))

#define KHASH_MAP_INIT_INT128(name, khval_t) \
KHASH_INIT(name, __uint128_t, khval_t, 1, kh_int128_hash_func, kh_int128_hash_equal)

#define KHASH_SET_INIT_INT128(name) \
KHASH_INIT(name, __uint128_t, char, 0, kh_int128_hash_func, kh_int128_hash_equal)
KHASH_INIT(name, __uint128_t, byte, 0, kh_int128_hash_func, kh_int256_hash_equal)

typedef unsigned char byte;
#define KHASH_MAP_INIT_INT256(name, khval_t) \
KHASH_INIT(name, uint256_t, khval_t, 1, kh_int256_hash_func, kh_int128_hash_equal)

#define KHASH_SET_INIT_INT256(name) \
KHASH_INIT(name, uint256_t, byte, 0, kh_int256_hash_func, kh_int256_hash_equal)

KHASH_MAP_INIT_INT256(S256M, byte)
KHASH_MAP_INIT_INT128(S128M, byte)
KHASH_MAP_INIT_INT64(S64M, byte)
KHASH_SET_INIT_INT256(S256S)
KHASH_SET_INIT_INT128(S128S)
KHASH_SET_INIT_INT64(S64S)

Expand Down Expand Up @@ -86,6 +95,8 @@ INIT_KHASH_UTILS(64, 64S)
INIT_KHASH_UTILS(64, 64M)
INIT_KHASH_UTILS(128, 128S)
INIT_KHASH_UTILS(128, 128M)
INIT_KHASH_UTILS(256, 256S)
INIT_KHASH_UTILS(256, 256M)

/// Return the next k-mer in the k-mer set and update the index.
template <typename KHT, typename kmer_t>
Expand Down
16 changes: 14 additions & 2 deletions src/kmers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
#include <iostream>
#include <cstdint>

#include "uint256_t.h"

typedef uint64_t kmer64_t;
typedef __uint128_t kmer128_t;
typedef uint256_t kmer256_t;

/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
/// If non-existing nucleotide is given, return -1.
Expand Down Expand Up @@ -71,9 +74,17 @@ inline kmer128_t word_reverse_complement(kmer128_t w) {
w = ( w >> 64 ) | ( w << 64);
return ((U)-1) - w;
}
/// Compute the reverse complement of a word.
/// Copyright: Jellyfish GPL-3.0
inline kmer256_t word_reverse_complement(kmer256_t w) {
kmer128_t low = word_reverse_complement(w.lower());
kmer128_t high = word_reverse_complement(w.upper());
return kmer256_t(high, low);
}

constexpr int KMER_SIZE_64 = 64;
constexpr int KMER_SIZE_128 = 128;
constexpr int KMER_SIZE_256 = 256;
#define INIT_KMERS(type) \
\
/* Get the mask to mask k-mers. */ \
Expand All @@ -89,6 +100,7 @@ inline kmer##type##_t ReverseComplement(kmer##type##_t kMer, int k) {

INIT_KMERS(64)
INIT_KMERS(128)
INIT_KMERS(256)

/// Return the lexicographically smaller of the k-mer and its reverse complement.
template <typename kmer_t>
Expand All @@ -102,7 +114,7 @@ const char letters[4] {'A', 'C', 'G', 'T'};
/// Return the index-th nucleotide from the encoded k-mer.
template <typename kmer_t>
inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
return letters[(unsigned long)(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
}

/// Convert the encoded KMer representation to string.
Expand All @@ -111,7 +123,7 @@ std::string NumberToKMer(kmer_t encoded, int length) {
std::string ret(length, 'N');
for (int i = 0; i < length; ++i) {
// The last two bits correspond to one nucleotide.
ret[length - i -1] = letters[encoded & 3];
ret[length - i -1] = letters[(unsigned long)encoded & 3];
// Move to the next letter.
encoded >>= 2;
}
Expand Down
12 changes: 10 additions & 2 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include "khash_utils.h"


constexpr int MAX_K = 64;
constexpr int MAX_K = 128;


int Help() {
Expand Down Expand Up @@ -200,6 +200,8 @@ INIT_RUN(64, 64S)
INIT_RUN(64, 64M)
INIT_RUN(128, 128S)
INIT_RUN(128, 128M)
INIT_RUN(256, 256S)
INIT_RUN(256, 256M)

int main(int argc, char **argv) {
int32_t k = -1;
Expand Down Expand Up @@ -329,11 +331,17 @@ int main(int argc, char **argv) {
} else {
return run64M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
}
} else {
} else if (k <= 64) {
if (MINIMUM_ABUNDANCE == (byte)1) {
return run128S(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
} else {
return run128M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
}
} else {
if (MINIMUM_ABUNDANCE == (byte)1) {
return run256S(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
} else {
return run256M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
}
}
}
2 changes: 2 additions & 0 deletions src/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,5 @@ INIT_PARSER(64, 64S)
INIT_PARSER(64, 64M)
INIT_PARSER(128, 128S)
INIT_PARSER(128, 128M)
INIT_PARSER(256, 256S)
INIT_PARSER(256, 256M)
6 changes: 4 additions & 2 deletions src/prophasm.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void NextSimplitig(KHT *kMers, kmer_t begin, std::ostream& of, int k, bool comp
} else {
// Extend the simplitig to the right.
eraseKMer(kMers, next, k, complements);
simplitig.emplace_back(letters[ext]);
simplitig.emplace_back(letters[(unsigned int)ext]);
last = next;
}
} else {
Expand All @@ -71,7 +71,7 @@ void NextSimplitig(KHT *kMers, kmer_t begin, std::ostream& of, int k, bool comp
} else {
// Extend the simplitig to the left.
eraseKMer(kMers, next, k, complements);
simplitig.emplace_front(letters[ext]);
simplitig.emplace_front(letters[(unsigned int)ext]);
first = next;
}
}
Expand Down Expand Up @@ -121,3 +121,5 @@ INIT_PROPHASM(64, 64S)
INIT_PROPHASM(64, 64M)
INIT_PROPHASM(128, 128S)
INIT_PROPHASM(128, 128M)
INIT_PROPHASM(256, 256S)
INIT_PROPHASM(256, 256M)
17 changes: 17 additions & 0 deletions src/uint256_t.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// IMPLEMENTATION BUILD HEADER

// We need uint128_t symbols as plain "extern", neither import nor export
// because we're linking the 128 and 256 object files into a single library
// So we can only have one export for symbol in any translation unit
#define UINT256_T_EXTERN
typedef __uint128_t uint128_t;
#undef UINT256_T_EXTERN

#ifndef _UNIT256_T_BUILD
#define _UINT256_T_BUILD
#include "uint256_t_config.include"
const uint128_t uint128_0(0);
const uint128_t uint128_1(1);
#define UINT256_T_EXTERN _UINT256_T_EXPORT
#endif
#include "uint256_t.include"
Loading

0 comments on commit f35e399

Please sign in to comment.