Skip to content

Commit

Permalink
Merge pull request #15 from prophyle/large_kmers
Browse files Browse the repository at this point in the history
Support for large k-mers
  • Loading branch information
OndrejSladky authored Mar 21, 2024
2 parents a377262 + 04f69a6 commit b8cc910
Show file tree
Hide file tree
Showing 11 changed files with 425 additions and 351 deletions.
20 changes: 11 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

ProphAsm2 is a versatile tool for computing simplitigs/SPSS
from *k-mer sets* and for *k-mer set operations*.
The new features compared to the original [ProphAsm](https://github.com/prophyle/prophasm) include a largely speed and memory optimization, parallelization, and support for minimum abundancies.
The new features compared to the original [ProphAsm](https://github.com/prophyle/prophasm)
include a largely speed and memory optimization, parallelization,
support for k-mer sizes up to 64 and support for minimum abundances.

Various types of sequencing datasets can be used as the input for
ProphAsm, including genomes, pan-genomes, metagenomes or sequencing reads.
Expand All @@ -36,7 +38,7 @@ also set differences.



## Prerequisities
## Prerequisites

* GCC 4.8+ or equivalent
* ZLib
Expand Down Expand Up @@ -90,30 +92,30 @@ def extend_simplitig_forward (K, simplitig):
while extending:
extending = False
q = simplitig[-k+1:]
for x in [‘A’, ‘C’, ‘G’, ‘T’]:
for x in ['A', 'C', 'G', 'T']:
kmer = q + x
if kmer in K:
extending = True
simplitig = simplitig + x
S.remove (kmer)
S.remove (reverse_complement (kmer))
K.remove (kmer)
K.remove (reverse_complement (kmer))
break
return S, s
return K, kmer

def get_maximal_simplitig (K, initial_kmer):
simplitig = initial_kmer
K.remove (initial_kmer)
K.remove (reverse_completement (initial_kmer))
K.remove (reverse_complement (initial_kmer))
K, simplitig = extend_simplitig_forward (K, simplitig)
simplitig = reverse_completent (simplitig)
simplitig = reverse_complement (simplitig)
K, simplitig = extend_simplitig_forward (K, simplitig)
return K, simplitig

def compute_simplitigs (kmers):
K = set()
for kmer in kmers:
K.add (kmer)
K.add (reverse_completement(kmer))
K.add (reverse_complement(kmer))
simplitigs = set()
while |K|>0:
initial_kmer = K.random()
Expand Down
4 changes: 2 additions & 2 deletions scripts/verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,14 @@ def main():
print("Testing ProphAsm2 outputs valid intersection on files " + args.path + " and " + args.interpath)
for complements in [True]:
for m in range(1, 3):
for k in range(2, 33, 7 if args.quick else 1):
for k in range(2, 65, 23 if args.quick else 1):
success &= verify_intersection(args.path, args.interpath, k, complements, m)
print("")

print("Testing ProphAsm2 outputs valid simplitigs on file " + args.path)
for complements in [True, False]:
for m in range(1, 4):
for k in range(2, 33, 3 if args.quick else 1):
for k in range(2, 65, 11 if args.quick else 1):
success &= verify_instance(args.path, k, complements, m)
print("")

Expand Down
151 changes: 79 additions & 72 deletions src/khash_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,60 +18,78 @@

typedef unsigned char byte;

#ifdef LARGE_KMERS
// Use 128-bit integers for k-mers to allow for larger k.
KHASH_SET_INIT_INT128(S64)
KHASH_MAP_INIT_INT128(P64, size_t)
KHASH_MAP_INIT_INT128(O64, size_t)
#else
// Use 64-bits integers for k-mers for faster operations and less memory usage.
KHASH_MAP_INIT_INT64(S64, byte)
KHASH_MAP_INIT_INT64(P64, size_t)
KHASH_MAP_INIT_INT64(O64, size_t)
#endif
KHASH_MAP_INIT_INT128(S128M, byte)
KHASH_MAP_INIT_INT64(S64M, byte)
KHASH_SET_INIT_INT128(S128S)
KHASH_SET_INIT_INT64(S64S)

byte MINIMUM_ABUNDANCE = 1;

/// Determine whether the canonical k-mer is present.
bool containsKMer(kh_S64_t *kMers, kmer_t kMer, int k, bool complements) {
if (complements) kMer = CanonicalKMer(kMer, k);
bool contains_key = kh_get_S64(kMers, kMer) != kh_end(kMers);
if (MINIMUM_ABUNDANCE == 1) return contains_key;
if (!contains_key) return false;
return kh_val(kMers, kh_get_S64(kMers, kMer)) >= MINIMUM_ABUNDANCE;
}

/// Remove the canonical k-mer from the set.
void eraseKMer(kh_S64_t *kMers, kmer_t kMer, int k, bool complements) {
if (complements) kMer = CanonicalKMer(kMer, k);
auto key = kh_get_S64(kMers, kMer);
if (key != kh_end(kMers)) {
kh_del_S64(kMers, key);
}
}

/// Insert the canonical k-mer into the set.
void insertKMer(kh_S64_t *kMers, kmer_t kMer, int k, bool complements, bool force = false) {
if (complements) kMer = CanonicalKMer(kMer, k);
int ret;
if (MINIMUM_ABUNDANCE == (byte)1) {
kh_put_S64(kMers, kMer, &ret);
} else {
byte value = 0;
khint_t key;
if (kh_get_S64(kMers, kMer) != kh_end(kMers)) {
key = kh_get_S64(kMers, kMer);
value = kh_val(kMers, key);
} else {
key = kh_put_S64(kMers, kMer, &ret);
}
if (force || value == (byte)255) kh_value(kMers, key) = (byte)255;
else kh_value(kMers, key) = value + 1;
}
}
// Forward definition for the macro.
template <typename KHT>
void differenceInPlace(KHT* kMerSet, KHT* intersection, int k, bool complements);

#define INIT_KHASH_UTILS(type, variant) \
\
/* Data for parallel computation of set differences. */ \
struct DifferenceInPlaceData##variant { \
std::vector<kh_S##variant##_t*> kMerSets; \
kh_S##variant##_t* intersection; \
int k; \
bool complements; \
}; \
\
/* Determine whether the canonical k-mer is present.*/ \
bool containsKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
bool contains_key = kh_get_S##variant(kMers, kMer) != kh_end(kMers); \
if (MINIMUM_ABUNDANCE == 1) return contains_key; \
if (!contains_key) return false; \
return kh_val(kMers, kh_get_S##variant(kMers, kMer)) >= MINIMUM_ABUNDANCE; \
} \
\
/* Remove the canonical k-mer from the set.*/ \
void eraseKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
auto key = kh_get_S##variant(kMers, kMer); \
if (key != kh_end(kMers)) { \
kh_del_S##variant(kMers, key); \
} \
} \
\
/* Insert the canonical k-mer into the set. */ \
void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements, bool force = false) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
int ret; \
if (MINIMUM_ABUNDANCE == (byte)1) { \
kh_put_S##variant(kMers, kMer, &ret); \
} else { \
byte value = 0; \
khint_t key = kh_get_S##variant(kMers, kMer); \
if (key != kh_end(kMers)) { \
value = kh_val(kMers, key); \
} else { \
key = kh_put_S##variant(kMers, kMer, &ret); \
} \
if (force || value == (byte)255) kh_value(kMers, key) = (byte)255; \
else kh_value(kMers, key) = value + 1; \
} \
} \
\
/* Parallel wrapper for differenceInPlace. */ \
void DifferenceInPlaceThread##variant(void *arg, long i, int _) { \
auto data = (DifferenceInPlaceData##variant*)arg; \
differenceInPlace(data->kMerSets[i], data->intersection, data->k, data->complements); \
} \

INIT_KHASH_UTILS(64, 64S)
INIT_KHASH_UTILS(64, 64M)
INIT_KHASH_UTILS(128, 128S)
INIT_KHASH_UTILS(128, 128M)

/// Return the next k-mer in the k-mer set and update the index.
kmer_t nextKMer(kh_S64_t *kMers, size_t &lastIndex, kmer_t &kMer) {
template <typename KHT, typename kmer_t>
kmer_t nextKMer(KHT *kMers, size_t &lastIndex, kmer_t &kMer) {
for (size_t i = kh_begin(kMers) + lastIndex; i != kh_end(kMers); ++i, ++lastIndex) {
if (!kh_exist(kMers, i)) continue;
kMer = kh_key(kMers, i);
Expand All @@ -83,24 +101,24 @@ kmer_t nextKMer(kh_S64_t *kMers, size_t &lastIndex, kmer_t &kMer) {
return false;
}

/// Construct a vector of the k-mer set in an arbitrary order.
std::vector<kmer_t> kMersToVec(kh_S64_t *kMers) {
std::vector<kmer_t> res(kh_size(kMers));
/// Construct a vector of the k-mer set in an arbitrary order. Only for testing.
std::vector<kmer64_t> kMersToVec(kh_S64M_t *kMers) {
std::vector<kmer64_t> result(kh_size(kMers));
size_t index = 0;
for (auto i = kh_begin(kMers); i != kh_end(kMers); ++i) {
if (!kh_exist(kMers, i)) continue;
if (MINIMUM_ABUNDANCE > 1) if (!containsKMer(kMers, kh_key(kMers, i), 0, false)) continue;
res[index++] = kh_key(kMers, i);
result[index++] = kh_key(kMers, i);
}
res.resize(index);
return res;
result.resize(index);
return result;
}

/// Compute the intersection of several k-mer sets.
kh_S64_t *getIntersection(std::vector<kh_S64_t*> &kMerSets, int k, bool complements) {
kh_S64_t* result = kh_init_S64();
template <typename KHT>
KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets, int k, bool complements) {
if (kMerSets.size() < 2) return result;
kh_S64_t* smallestSet = kMerSets[0];
KHT* smallestSet = kMerSets[0];
for (size_t i = 1; i < kMerSets.size(); ++i) {
if (kh_size(kMerSets[i]) < kh_size(smallestSet)) smallestSet = kMerSets[i];
}
Expand All @@ -123,24 +141,13 @@ kh_S64_t *getIntersection(std::vector<kh_S64_t*> &kMerSets, int k, bool compleme
}

/// Subtract the intersection from each k-mer set.
void differenceInPlace(kh_S64_t* kMerSet, kh_S64_t* intersection, int k, bool complements) {
template <typename KHT>
void differenceInPlace(KHT* kMerSet, KHT* intersection, int k, bool complements) {
for (auto i = kh_begin(intersection); i != kh_end(intersection); ++i) {
if (!kh_exist(intersection, i)) continue;
auto kMer = kh_key(intersection, i);
eraseKMer(kMerSet, kMer, k, complements);
}
}

/// Data for parallel computation of set differences.
struct DifferenceInPlaceData {
std::vector<kh_S64_t*> kMerSets;
kh_S64_t* intersection;
int k;
bool complements;
};

/// Parallel wrapper for differenceInPlace.
void DifferenceInPlaceThread(void *arg, long i, int _) {
auto data = (DifferenceInPlaceData*)arg;
differenceInPlace(data->kMerSets[i], data->intersection, data->k, data->complements);
}

58 changes: 36 additions & 22 deletions src/kmers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,8 @@
#include <iostream>
#include <cstdint>

#ifdef LARGE_KMERS
typedef __uint128_t kmer_t;
constexpr int KMER_T_SIZE = 128;
#else
typedef uint64_t kmer_t;
constexpr int KMER_T_SIZE = 64;
#endif
typedef uint64_t kmer64_t;
typedef __uint128_t kmer128_t;

/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
/// If non-existing nucleotide is given, return -1.
Expand All @@ -29,11 +24,13 @@ int NucleotideToInt (char c) {
}

/// Compute the prefix of size d of the given k-mer.
template <typename kmer_t>
kmer_t BitPrefix(kmer_t kMer, int k, int d) {
return kMer >> ((k - d) << kmer_t(1));
}

/// Compute the suffix of size d of the given k-mer.
template <typename kmer_t>
kmer_t BitSuffix(kmer_t kMer, int d) {
return kMer & ((kmer_t(1) << (d << kmer_t(1))) - kmer_t(1));
}
Expand All @@ -53,33 +50,48 @@ struct cmask<U, len, 0> {

/// Compute the reverse complement of a word.
/// Copyright: Jellyfish GPL-3.0
inline kmer_t word_reverse_complement(kmer_t w) {
typedef kmer_t U;
inline kmer64_t word_reverse_complement(kmer64_t w) {
typedef kmer64_t U;
w = ((w >> 2) & cmask<U, 2 >::v) | ((w & cmask<U, 2 >::v) << 2);
w = ((w >> 4) & cmask<U, 4 >::v) | ((w & cmask<U, 4 >::v) << 4);
w = ((w >> 8) & cmask<U, 8 >::v) | ((w & cmask<U, 8 >::v) << 8);
w = ((w >> 16) & cmask<U, 16>::v) | ((w & cmask<U, 16>::v) << 16);
w = ( w >> 32 ) | ( w << 32);
return ((U)-1) - w;
}
/// Compute the reverse complement of a word.
/// Copyright: Jellyfish GPL-3.0
inline kmer128_t word_reverse_complement(kmer128_t w) {
typedef kmer128_t U;
w = ((w >> 2) & cmask<U, 2 >::v) | ((w & cmask<U, 2 >::v) << 2);
w = ((w >> 4) & cmask<U, 4 >::v) | ((w & cmask<U, 4 >::v) << 4);
w = ((w >> 8) & cmask<U, 8 >::v) | ((w & cmask<U, 8 >::v) << 8);
w = ((w >> 16) & cmask<U, 16>::v) | ((w & cmask<U, 16>::v) << 16);
#ifdef LARGE_KMERS
w = ((w >> 32) & cmask<U, 32>::v) | ((w & cmask<U, 32>::v) << 32);
w = ( w >> 64 ) | ( w << 64);
#else
w = ( w >> 32 ) | ( w << 32);
#endif
return ((U)-1) - w;
}

/// Get the mask to mask k-mers.
inline kmer_t MaskForK(int k) {
kmer_t mask = (((kmer_t) 1) << ((k << 1) - 1));
return mask | (mask - 1);
}
constexpr int KMER_SIZE_64 = 64;
constexpr int KMER_SIZE_128 = 128;
#define INIT_KMERS(type) \
\
/* Get the mask to mask k-mers. */ \
inline kmer##type##_t MaskForK##type(int k) { \
kmer##type##_t mask = (((kmer##type##_t) 1) << ((k << 1) - 1)); \
return mask | (mask - 1); \
} \
\
/* Compute the reverse complement of the given k-mer. */ \
inline kmer##type##_t ReverseComplement(kmer##type##_t kMer, int k) { \
return (((kmer##type##_t)word_reverse_complement(kMer)) >> (KMER_SIZE_##type - (k << kmer##type##_t(1)))) & MaskForK##type(k); \
} \

/// Compute the reverse complement of the given k-mer.
inline kmer_t ReverseComplement(kmer_t kMer, int k) {
return (((kmer_t)word_reverse_complement(kMer)) >> (KMER_T_SIZE - (k << kmer_t(1)))) & MaskForK(k);
}
INIT_KMERS(64)
INIT_KMERS(128)

/// Return the lexicographically smaller of the k-mer and its reverse complement.
template <typename kmer_t>
inline kmer_t CanonicalKMer(kmer_t kMer, int k) {
kmer_t rev = ReverseComplement(kMer, k);
return kMer < rev ? kMer : rev;
Expand All @@ -88,11 +100,13 @@ inline kmer_t CanonicalKMer(kmer_t kMer, int k) {
const char letters[4] {'A', 'C', 'G', 'T'};

/// Return the index-th nucleotide from the encoded k-mer.
template <typename kmer_t>
inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
}

/// Convert the encoded KMer representation to string.
template <typename kmer_t>
std::string NumberToKMer(kmer_t encoded, int length) {
std::string ret(length, 'N');
for (int i = 0; i < length; ++i) {
Expand Down
Loading

0 comments on commit b8cc910

Please sign in to comment.