Skip to content

Commit

Permalink
KmerCamel supports k up to 127. (#74)
Browse files Browse the repository at this point in the history
* KmerCamel supports k up to 127.

* Presort: explicit conversion to int64 frmo large k-mers.

* Presort: explicit conversion to size_t everywhere.

* Presort: size_t's changed to uint64_t's.

* Presort: additional macos fixes.
  • Loading branch information
OndrejSladky authored Aug 22, 2024
1 parent 2c96272 commit 13c2127
Show file tree
Hide file tree
Showing 10 changed files with 1,113 additions and 21 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ They come in two different implementations (their results may differ due to the
Note that at this point only the implementations with hash table are optimized and that the Aho-Corasick automaton
based versions of the algorithms are only experimental.

The hashing based implementations of KmerCamel🐫 support $k$-mer with $k$ at most 63,
The hashing based implementations of KmerCamel🐫 support $k$-mer with $k$ at most 127,

All algorithms can be used to either work in the unidirectional model or in the bidirectional model
(i.e. treat $k$-mer and its reverse complement as the same; in this case either of them appears in the result).
Expand Down Expand Up @@ -62,7 +62,7 @@ on macOS.
The program has the following arguments:

- `-p path_to_fasta` - the path to fasta file (can be `gzip`ed). This is a required argument.
- `-k value_of_k` - the size of one k-mer (up to 63). This is a required argument.
- `-k value_of_k` - the size of one k-mer (up to 127). This is a required argument.
- `-a algorithm` - the algorithm which should be run. Either `global` or `globalAC` for Global Greedy, `local` or `localAC` for Local Greedy.
The versions with AC use Aho-Corasick automaton. Default `global`.
- `-o output_path` - the path to output file. If not specified, output is printed to stdout.
Expand Down
14 changes: 7 additions & 7 deletions src/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,17 @@ typedef std::pair<std::vector<size_t>, std::vector<unsigned char>> overlapPath;
template <typename kmer_t>
void PartialPreSort(std::vector<kmer_t> &vals, int k) {
int SORT_FIRST_BITS = std::min(2 * k, SORT_FIRST_BITS_DEFAULT);
kmer_t DIFFERENT_PREFIXES_COUNT = kmer_t(1) << SORT_FIRST_BITS;
uint64_t DIFFERENT_PREFIXES_COUNT = 1ULL << SORT_FIRST_BITS;
kmer_t PREFIX_MASK = DIFFERENT_PREFIXES_COUNT - kmer_t(1);
std::vector<size_t> counts(DIFFERENT_PREFIXES_COUNT, 0);
int shift = (2 * k) - SORT_FIRST_BITS;
kmer_t mask = PREFIX_MASK << shift;
for (auto &&kMer : vals) counts[(kMer & mask) >> shift]++;
for (auto &&kMer : vals) counts[(uint64_t)((kMer & mask) >> shift)]++;
std::vector<std::vector<kmer_t>> distributed(DIFFERENT_PREFIXES_COUNT);
for (kmer_t i = 0; i < DIFFERENT_PREFIXES_COUNT; ++i) distributed[i] = std::vector<kmer_t> (counts[i]);
for (kmer_t i = 0; i < DIFFERENT_PREFIXES_COUNT; ++i) counts[i] = 0;
for (uint64_t i = 0; i < DIFFERENT_PREFIXES_COUNT; ++i) distributed[i] = std::vector<kmer_t> (counts[i]);
for (uint64_t i = 0; i < DIFFERENT_PREFIXES_COUNT; ++i) counts[i] = 0;
for (auto &&kMer : vals) {
kmer_t index = (kMer & mask) >> shift;
uint64_t index = (kMer & mask) >> shift;
distributed[index][counts[index]++] = kMer;
}
size_t index = 0;
Expand Down Expand Up @@ -164,7 +164,7 @@ void SuperstringFromPath(const overlapPath &hamiltonianPath, const std::vector<k
for (; start < kMersCount && !isStart[start]; ++start);

kmer_t last = BitSuffix(access(kMers, start), k-1);
of << letters[BitPrefix(access(kMers, start), k, 1)];
of << letters[(uint64_t)BitPrefix(access(kMers, start), k, 1)];

// Move from the first k-mer to the last which has no successor.
while(edgeFrom[start] != size_t(-1)) {
Expand All @@ -175,7 +175,7 @@ void SuperstringFromPath(const overlapPath &hamiltonianPath, const std::vector<k
of << unmaskedNucleotides;
}
last = BitSuffix(access(kMers, edgeFrom[start]), k-1);
of << letters[BitPrefix(access(kMers, edgeFrom[start]), k, 1)];
of << letters[(uint64_t)BitPrefix(access(kMers, edgeFrom[start]), k, 1)];
start = edgeFrom[start];
}

Expand Down
18 changes: 15 additions & 3 deletions src/khash_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,28 @@

#define kh_int128_hash_func(key) kh_int64_hash_func((khint64_t)((key)>>65^(key)^(key)<<21))
#define kh_int128_hash_equal(a, b) ((a) == (b))
#define kh_int256_hash_func(key) kh_int128_hash_func((__uint128_t)((key)>>129^(key)^(key)<<35))
#define kh_int256_hash_equal(a, b) ((a) == (b))

#define KHASH_MAP_INIT_INT128(name, khval_t) \
KHASH_INIT(name, __uint128_t, khval_t, 1, kh_int128_hash_func, kh_int128_hash_equal)

#define KHASH_SET_INIT_INT128(name) \
KHASH_INIT(name, __uint128_t, char, 0, kh_int128_hash_func, kh_int128_hash_equal)
KHASH_INIT(name, __uint128_t, char, 0, kh_int128_hash_func, kh_int128_hash_equal)

// Use 128-bit integers for k-mers to allow for larger k.
#define KHASH_MAP_INIT_INT256(name, khval_t) \
KHASH_INIT(name, uint256_t, khval_t, 1, kh_int256_hash_func, kh_int256_hash_equal)

#define KHASH_SET_INIT_INT256(name) \
KHASH_INIT(name, uint256_t, char, 0, kh_int256_hash_func, kh_int256_hash_equal)

// Use 128-bit integers for extra large k-mers to allow for larger k.
KHASH_SET_INIT_INT256(S256)
KHASH_MAP_INIT_INT256(P256, size_t)
// Use 128-bit integers for large k-mers to allow for larger k.
KHASH_SET_INIT_INT128(S128)
KHASH_MAP_INIT_INT128(P128, size_t)
// Use 64-bits integers for k-mers for faster operations and less memory usage.
// Use 64-bits integers for small k-mers for faster operations and less memory usage.
KHASH_SET_INIT_INT64(S64)
KHASH_MAP_INIT_INT64(P64, size_t)

Expand Down Expand Up @@ -62,6 +73,7 @@ KHASH_MAP_INIT_INT64(P64, size_t)

INIT_KHASH_WRAPPER(64)
INIT_KHASH_WRAPPER(128)
INIT_KHASH_WRAPPER(256)

/// Determine whether the k-mer or its reverse complement is present.
template <typename kmer_t, typename kh_S_t, typename kh_wrapper_t>
Expand Down
15 changes: 12 additions & 3 deletions src/kmers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
#include <iostream>
#include <cstdint>

#include "uint256_t/uint256_t.h"

#include "ac/kmers_ac.h"

typedef __uint128_t kmer128_t;
typedef uint64_t kmer64_t;
typedef uint256_t kmer256_t;

static const uint8_t nucleotideToInt[] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
Expand Down Expand Up @@ -65,7 +68,6 @@ inline kmer64_t word_reverse_complement(kmer64_t w) {
return ((U)-1) - w;
}


/// Compute the reverse complement of a word.
/// Copyright: Jellyfish GPL-3.0
inline kmer128_t word_reverse_complement(kmer128_t w) {
Expand All @@ -79,6 +81,13 @@ inline kmer128_t word_reverse_complement(kmer128_t w) {
return ((U)-1) - w;
}

/// Compute the reverse complement of a word.
inline kmer256_t word_reverse_complement(kmer256_t w) {
kmer128_t low = word_reverse_complement(w.lower());
kmer128_t high = word_reverse_complement(w.upper());
return kmer256_t(low, high);
}

/// Compute the reverse complement of the given k-mer.
template <typename kmer_t>
kmer_t ReverseComplement(kmer_t kMer, int k) {
Expand All @@ -90,7 +99,7 @@ const char letters[4] {'A', 'C', 'G', 'T'};
/// Return the index-th nucleotide from the encoded k-mer.
template <typename kmer_t>
inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
return letters[(uint64_t)((encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3))];
}

/// Convert the encoded KMer representation to string.
Expand All @@ -99,7 +108,7 @@ std::string NumberToKMer(kmer_t encoded, int length) {
std::string ret(length, 'N');
for (int i = 0; i < length; ++i) {
// The last two bits correspond to one nucleotide.
ret[length - i -1] = letters[encoded & 3];
ret[length - i -1] = letters[(uint64_t)(encoded & 3)];
// Move to the next letter.
encoded >>= 2;
}
Expand Down
13 changes: 8 additions & 5 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ int Help() {
std::cerr << "KmerCamel version " << VERSION << std::endl;
std::cerr << "Accepted arguments:" << std::endl;
std::cerr << " -p path_to_fasta - required; valid path to fasta file (can be gziped)" << std::endl;
std::cerr << " -k k_value - required; integer value for k (up to 63)" << std::endl;
std::cerr << " -k k_value - required; integer value for k (up to 127)" << std::endl;
std::cerr << " -a algorithm - the algorithm to be run [global (default), globalAC, local, localAC, streaming]" << std::endl;
std::cerr << " -o output_path - if not specified, the output is printed to stdout" << std::endl;
std::cerr << " -d d_value - integer value for d_max; default 5" << std::endl;
Expand All @@ -31,7 +31,7 @@ int Help() {
std::cerr << "For optimization of masks use `kmercamel optimize`." << std::endl;
std::cerr << "Accepted arguments:" << std::endl;
std::cerr << " -p path_to_fasta - required; valid path to fasta file (can be gziped)" << std::endl;
std::cerr << " -k k_value - required; integer value for k (up to 63)" << std::endl;
std::cerr << " -k k_value - required; integer value for k (up to 127)" << std::endl;
std::cerr << " -a algorithm - the algorithm to be run [ones (default), runs, runsapprox, zeros]" << std::endl;
std::cerr << " -o output_path - if not specified, the output is printed to stdout" << std::endl;
std::cerr << " -c - treat k-mer and its reverse complement as equal" << std::endl;
Expand All @@ -40,7 +40,7 @@ int Help() {
return 1;
}

constexpr int MAX_K = 63;
constexpr int MAX_K = 127;

void Version() {
std::cerr << VERSION << std::endl;
Expand All @@ -50,7 +50,7 @@ void Version() {
#define INIT_KMERCAMEL(type) \
int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, std::string algorithm, bool optimize_memory) { \
kmer_dict##type##_t wrapper; \
kmer##type##_t kmer_type; \
kmer##type##_t kmer_type = 0; \
if (masks) { \
int ret = Optimize(wrapper, kmer_type, algorithm, path, *of, k, complements); \
if (ret) Help(); \
Expand Down Expand Up @@ -108,6 +108,7 @@ int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool c

INIT_KMERCAMEL(64)
INIT_KMERCAMEL(128)
INIT_KMERCAMEL(256)

int main(int argc, char **argv) {
std::string path;
Expand Down Expand Up @@ -198,7 +199,9 @@ int main(int argc, char **argv) {
}
if (k < 32) {
return kmercamel64(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
} else {
} else if (k < 64) {
return kmercamel128(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
} else {
return kmercamel256(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
}
}
17 changes: 17 additions & 0 deletions src/uint256_t/uint256_t.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// IMPLEMENTATION BUILD HEADER

// We need uint128_t symbols as plain "extern", neither import nor export
// because we're linking the 128 and 256 object files into a single library
// So we can only have one export for symbol in any translation unit
#define UINT256_T_EXTERN
typedef __uint128_t uint128_t;
#undef UINT256_T_EXTERN

#ifndef _UNIT256_T_BUILD
#define _UINT256_T_BUILD
#include "uint256_t_config.include"
const uint128_t uint128_0(0);
const uint128_t uint128_1(1);
#define UINT256_T_EXTERN _UINT256_T_EXPORT
#endif
#include "uint256_t.include"
Loading

0 comments on commit 13c2127

Please sign in to comment.