diff --git a/README.md b/README.md index 6555833..73fdd41 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,9 @@ ProphAsm2 is a versatile tool for computing simplitigs/SPSS from *k-mer sets* and for *k-mer set operations*. -The new features compared to the original [ProphAsm](https://github.com/prophyle/prophasm) include a largely speed and memory optimization, parallelization, and support for minimum abundancies. +The new features compared to the original [ProphAsm](https://github.com/prophyle/prophasm) +include a largely speed and memory optimization, parallelization, +support for k-mer sizes up to 64 and support for minimum abundances. Various types of sequencing datasets can be used as the input for ProphAsm, including genomes, pan-genomes, metagenomes or sequencing reads. @@ -36,7 +38,7 @@ also set differences. -## Prerequisities +## Prerequisites * GCC 4.8+ or equivalent * ZLib @@ -90,22 +92,22 @@ def extend_simplitig_forward (K, simplitig): while extending: extending = False q = simplitig[-k+1:] - for x in [‘A’, ‘C’, ‘G’, ‘T’]: + for x in ['A', 'C', 'G', 'T']: kmer = q + x if kmer in K: extending = True simplitig = simplitig + x - S.remove (kmer) - S.remove (reverse_complement (kmer)) + K.remove (kmer) + K.remove (reverse_complement (kmer)) break - return S, s + return K, kmer def get_maximal_simplitig (K, initial_kmer): simplitig = initial_kmer K.remove (initial_kmer) - K.remove (reverse_completement (initial_kmer)) + K.remove (reverse_complement (initial_kmer)) K, simplitig = extend_simplitig_forward (K, simplitig) - simplitig = reverse_completent (simplitig) + simplitig = reverse_complement (simplitig) K, simplitig = extend_simplitig_forward (K, simplitig) return K, simplitig @@ -113,7 +115,7 @@ def compute_simplitigs (kmers): K = set() for kmer in kmers: K.add (kmer) - K.add (reverse_completement(kmer)) + K.add (reverse_complement(kmer)) simplitigs = set() while |K|>0: initial_kmer = K.random() diff --git a/scripts/verify.py b/scripts/verify.py index c621480..e96abbd 100755 --- a/scripts/verify.py +++ b/scripts/verify.py @@ -102,14 +102,14 @@ def main(): print("Testing ProphAsm2 outputs valid intersection on files " + args.path + " and " + args.interpath) for complements in [True]: for m in range(1, 3): - for k in range(2, 33, 7 if args.quick else 1): + for k in range(2, 65, 23 if args.quick else 1): success &= verify_intersection(args.path, args.interpath, k, complements, m) print("") print("Testing ProphAsm2 outputs valid simplitigs on file " + args.path) for complements in [True, False]: for m in range(1, 4): - for k in range(2, 33, 3 if args.quick else 1): + for k in range(2, 65, 11 if args.quick else 1): success &= verify_instance(args.path, k, complements, m) print("") diff --git a/src/khash_utils.h b/src/khash_utils.h index 3e5bfe0..648faaa 100644 --- a/src/khash_utils.h +++ b/src/khash_utils.h @@ -18,60 +18,78 @@ typedef unsigned char byte; -#ifdef LARGE_KMERS - // Use 128-bit integers for k-mers to allow for larger k. - KHASH_SET_INIT_INT128(S64) - KHASH_MAP_INIT_INT128(P64, size_t) - KHASH_MAP_INIT_INT128(O64, size_t) -#else - // Use 64-bits integers for k-mers for faster operations and less memory usage. - KHASH_MAP_INIT_INT64(S64, byte) - KHASH_MAP_INIT_INT64(P64, size_t) - KHASH_MAP_INIT_INT64(O64, size_t) -#endif +KHASH_MAP_INIT_INT128(S128M, byte) +KHASH_MAP_INIT_INT64(S64M, byte) +KHASH_SET_INIT_INT128(S128S) +KHASH_SET_INIT_INT64(S64S) byte MINIMUM_ABUNDANCE = 1; -/// Determine whether the canonical k-mer is present. -bool containsKMer(kh_S64_t *kMers, kmer_t kMer, int k, bool complements) { - if (complements) kMer = CanonicalKMer(kMer, k); - bool contains_key = kh_get_S64(kMers, kMer) != kh_end(kMers); - if (MINIMUM_ABUNDANCE == 1) return contains_key; - if (!contains_key) return false; - return kh_val(kMers, kh_get_S64(kMers, kMer)) >= MINIMUM_ABUNDANCE; -} - -/// Remove the canonical k-mer from the set. -void eraseKMer(kh_S64_t *kMers, kmer_t kMer, int k, bool complements) { - if (complements) kMer = CanonicalKMer(kMer, k); - auto key = kh_get_S64(kMers, kMer); - if (key != kh_end(kMers)) { - kh_del_S64(kMers, key); - } -} - -/// Insert the canonical k-mer into the set. -void insertKMer(kh_S64_t *kMers, kmer_t kMer, int k, bool complements, bool force = false) { - if (complements) kMer = CanonicalKMer(kMer, k); - int ret; - if (MINIMUM_ABUNDANCE == (byte)1) { - kh_put_S64(kMers, kMer, &ret); - } else { - byte value = 0; - khint_t key; - if (kh_get_S64(kMers, kMer) != kh_end(kMers)) { - key = kh_get_S64(kMers, kMer); - value = kh_val(kMers, key); - } else { - key = kh_put_S64(kMers, kMer, &ret); - } - if (force || value == (byte)255) kh_value(kMers, key) = (byte)255; - else kh_value(kMers, key) = value + 1; - } -} +// Forward definition for the macro. +template +void differenceInPlace(KHT* kMerSet, KHT* intersection, int k, bool complements); + +#define INIT_KHASH_UTILS(type, variant) \ + \ +/* Data for parallel computation of set differences. */ \ +struct DifferenceInPlaceData##variant { \ + std::vector kMerSets; \ + kh_S##variant##_t* intersection; \ + int k; \ + bool complements; \ +}; \ + \ +/* Determine whether the canonical k-mer is present.*/ \ +bool containsKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \ + if (complements) kMer = CanonicalKMer(kMer, k); \ + bool contains_key = kh_get_S##variant(kMers, kMer) != kh_end(kMers); \ + if (MINIMUM_ABUNDANCE == 1) return contains_key; \ + if (!contains_key) return false; \ + return kh_val(kMers, kh_get_S##variant(kMers, kMer)) >= MINIMUM_ABUNDANCE; \ +} \ + \ +/* Remove the canonical k-mer from the set.*/ \ +void eraseKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \ + if (complements) kMer = CanonicalKMer(kMer, k); \ + auto key = kh_get_S##variant(kMers, kMer); \ + if (key != kh_end(kMers)) { \ + kh_del_S##variant(kMers, key); \ + } \ +} \ + \ +/* Insert the canonical k-mer into the set. */ \ +void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements, bool force = false) { \ + if (complements) kMer = CanonicalKMer(kMer, k); \ + int ret; \ + if (MINIMUM_ABUNDANCE == (byte)1) { \ + kh_put_S##variant(kMers, kMer, &ret); \ + } else { \ + byte value = 0; \ + khint_t key = kh_get_S##variant(kMers, kMer); \ + if (key != kh_end(kMers)) { \ + value = kh_val(kMers, key); \ + } else { \ + key = kh_put_S##variant(kMers, kMer, &ret); \ + } \ + if (force || value == (byte)255) kh_value(kMers, key) = (byte)255; \ + else kh_value(kMers, key) = value + 1; \ + } \ +} \ + \ +/* Parallel wrapper for differenceInPlace. */ \ +void DifferenceInPlaceThread##variant(void *arg, long i, int _) { \ + auto data = (DifferenceInPlaceData##variant*)arg; \ + differenceInPlace(data->kMerSets[i], data->intersection, data->k, data->complements); \ +} \ + +INIT_KHASH_UTILS(64, 64S) +INIT_KHASH_UTILS(64, 64M) +INIT_KHASH_UTILS(128, 128S) +INIT_KHASH_UTILS(128, 128M) /// Return the next k-mer in the k-mer set and update the index. -kmer_t nextKMer(kh_S64_t *kMers, size_t &lastIndex, kmer_t &kMer) { +template +kmer_t nextKMer(KHT *kMers, size_t &lastIndex, kmer_t &kMer) { for (size_t i = kh_begin(kMers) + lastIndex; i != kh_end(kMers); ++i, ++lastIndex) { if (!kh_exist(kMers, i)) continue; kMer = kh_key(kMers, i); @@ -83,24 +101,24 @@ kmer_t nextKMer(kh_S64_t *kMers, size_t &lastIndex, kmer_t &kMer) { return false; } -/// Construct a vector of the k-mer set in an arbitrary order. -std::vector kMersToVec(kh_S64_t *kMers) { - std::vector res(kh_size(kMers)); +/// Construct a vector of the k-mer set in an arbitrary order. Only for testing. +std::vector kMersToVec(kh_S64M_t *kMers) { + std::vector result(kh_size(kMers)); size_t index = 0; for (auto i = kh_begin(kMers); i != kh_end(kMers); ++i) { if (!kh_exist(kMers, i)) continue; if (MINIMUM_ABUNDANCE > 1) if (!containsKMer(kMers, kh_key(kMers, i), 0, false)) continue; - res[index++] = kh_key(kMers, i); + result[index++] = kh_key(kMers, i); } - res.resize(index); - return res; + result.resize(index); + return result; } /// Compute the intersection of several k-mer sets. -kh_S64_t *getIntersection(std::vector &kMerSets, int k, bool complements) { - kh_S64_t* result = kh_init_S64(); +template +KHT *getIntersection(KHT* result, std::vector &kMerSets, int k, bool complements) { if (kMerSets.size() < 2) return result; - kh_S64_t* smallestSet = kMerSets[0]; + KHT* smallestSet = kMerSets[0]; for (size_t i = 1; i < kMerSets.size(); ++i) { if (kh_size(kMerSets[i]) < kh_size(smallestSet)) smallestSet = kMerSets[i]; } @@ -123,7 +141,8 @@ kh_S64_t *getIntersection(std::vector &kMerSets, int k, bool compleme } /// Subtract the intersection from each k-mer set. -void differenceInPlace(kh_S64_t* kMerSet, kh_S64_t* intersection, int k, bool complements) { +template +void differenceInPlace(KHT* kMerSet, KHT* intersection, int k, bool complements) { for (auto i = kh_begin(intersection); i != kh_end(intersection); ++i) { if (!kh_exist(intersection, i)) continue; auto kMer = kh_key(intersection, i); @@ -131,16 +150,4 @@ void differenceInPlace(kh_S64_t* kMerSet, kh_S64_t* intersection, int k, bool co } } -/// Data for parallel computation of set differences. -struct DifferenceInPlaceData { - std::vector kMerSets; - kh_S64_t* intersection; - int k; - bool complements; -}; - -/// Parallel wrapper for differenceInPlace. -void DifferenceInPlaceThread(void *arg, long i, int _) { - auto data = (DifferenceInPlaceData*)arg; - differenceInPlace(data->kMerSets[i], data->intersection, data->k, data->complements); -} \ No newline at end of file + diff --git a/src/kmers.h b/src/kmers.h index 034800a..996d115 100644 --- a/src/kmers.h +++ b/src/kmers.h @@ -4,13 +4,8 @@ #include #include -#ifdef LARGE_KMERS - typedef __uint128_t kmer_t; - constexpr int KMER_T_SIZE = 128; -#else - typedef uint64_t kmer_t; - constexpr int KMER_T_SIZE = 64; -#endif +typedef uint64_t kmer64_t; +typedef __uint128_t kmer128_t; /// Convert the given basic nucleotide to int so it can be used for indexing in AC. /// If non-existing nucleotide is given, return -1. @@ -29,11 +24,13 @@ int NucleotideToInt (char c) { } /// Compute the prefix of size d of the given k-mer. +template kmer_t BitPrefix(kmer_t kMer, int k, int d) { return kMer >> ((k - d) << kmer_t(1)); } /// Compute the suffix of size d of the given k-mer. +template kmer_t BitSuffix(kmer_t kMer, int d) { return kMer & ((kmer_t(1) << (d << kmer_t(1))) - kmer_t(1)); } @@ -53,33 +50,48 @@ struct cmask { /// Compute the reverse complement of a word. /// Copyright: Jellyfish GPL-3.0 -inline kmer_t word_reverse_complement(kmer_t w) { - typedef kmer_t U; +inline kmer64_t word_reverse_complement(kmer64_t w) { + typedef kmer64_t U; + w = ((w >> 2) & cmask::v) | ((w & cmask::v) << 2); + w = ((w >> 4) & cmask::v) | ((w & cmask::v) << 4); + w = ((w >> 8) & cmask::v) | ((w & cmask::v) << 8); + w = ((w >> 16) & cmask::v) | ((w & cmask::v) << 16); + w = ( w >> 32 ) | ( w << 32); + return ((U)-1) - w; +} +/// Compute the reverse complement of a word. +/// Copyright: Jellyfish GPL-3.0 +inline kmer128_t word_reverse_complement(kmer128_t w) { + typedef kmer128_t U; w = ((w >> 2) & cmask::v) | ((w & cmask::v) << 2); w = ((w >> 4) & cmask::v) | ((w & cmask::v) << 4); w = ((w >> 8) & cmask::v) | ((w & cmask::v) << 8); w = ((w >> 16) & cmask::v) | ((w & cmask::v) << 16); -#ifdef LARGE_KMERS w = ((w >> 32) & cmask::v) | ((w & cmask::v) << 32); w = ( w >> 64 ) | ( w << 64); -#else - w = ( w >> 32 ) | ( w << 32); -#endif return ((U)-1) - w; } -/// Get the mask to mask k-mers. -inline kmer_t MaskForK(int k) { - kmer_t mask = (((kmer_t) 1) << ((k << 1) - 1)); - return mask | (mask - 1); -} +constexpr int KMER_SIZE_64 = 64; +constexpr int KMER_SIZE_128 = 128; +#define INIT_KMERS(type) \ + \ +/* Get the mask to mask k-mers. */ \ +inline kmer##type##_t MaskForK##type(int k) { \ + kmer##type##_t mask = (((kmer##type##_t) 1) << ((k << 1) - 1)); \ + return mask | (mask - 1); \ +} \ + \ +/* Compute the reverse complement of the given k-mer. */ \ +inline kmer##type##_t ReverseComplement(kmer##type##_t kMer, int k) { \ + return (((kmer##type##_t)word_reverse_complement(kMer)) >> (KMER_SIZE_##type - (k << kmer##type##_t(1)))) & MaskForK##type(k); \ +} \ -/// Compute the reverse complement of the given k-mer. -inline kmer_t ReverseComplement(kmer_t kMer, int k) { - return (((kmer_t)word_reverse_complement(kMer)) >> (KMER_T_SIZE - (k << kmer_t(1)))) & MaskForK(k); -} +INIT_KMERS(64) +INIT_KMERS(128) /// Return the lexicographically smaller of the k-mer and its reverse complement. +template inline kmer_t CanonicalKMer(kmer_t kMer, int k) { kmer_t rev = ReverseComplement(kMer, k); return kMer < rev ? kMer : rev; @@ -88,11 +100,13 @@ inline kmer_t CanonicalKMer(kmer_t kMer, int k) { const char letters[4] {'A', 'C', 'G', 'T'}; /// Return the index-th nucleotide from the encoded k-mer. +template inline char NucleotideAtIndex(kmer_t encoded, int k, int index) { return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)]; } /// Convert the encoded KMer representation to string. +template std::string NumberToKMer(kmer_t encoded, int length) { std::string ret(length, 'N'); for (int i = 0; i < length; ++i) { diff --git a/src/main.cpp b/src/main.cpp index 1ec1f11..8334951 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -9,12 +9,7 @@ #include "khash_utils.h" -#ifdef LARGE_KMERS - constexpr int MAX_K = 64; - const std::string VARIANT = "(128bit k-mer variant)"; -#else - constexpr int MAX_K = 32; -#endif +constexpr int MAX_K = 64; int Help() { @@ -62,6 +57,150 @@ void TestFile(FILE *fo, std::string fn) { } } +#define INIT_RUN(type, version) \ + \ +int run##version(int32_t k, \ + std::string intersectionPath, \ + std::vector inPaths, \ + std::vector outPaths, \ + std::string statsPath, \ + FILE *fstats, \ + bool computeIntersection, \ + bool computeOutput, \ + bool verbose, \ + bool complements, \ + int threads, \ + size_t setCount) { \ + \ + if (verbose) { \ + std::cerr << "=====================" << std::endl; \ + std::cerr << "1) Loading references" << std::endl; \ + std::cerr << "=====================" << std::endl; \ + } \ + std::vector fullSets(setCount); \ + std::vector inSizes = std::vector(setCount); \ + std::vector outSizes; \ + \ + for (size_t i = 0; i < setCount; i++) { \ + fullSets[i] = kh_init_S##version(); \ + } \ + \ + ReadKMersData##version data = {fullSets, inPaths, k, complements}; \ + kt_for(threads, ReadKMersThread##version, (void*)&data, setCount); \ + \ + for (size_t i = 0; i < setCount; i++) { \ + if (verbose) { \ + std::cerr << "Loaded " << inPaths[i] << std::endl; \ + } \ + inSizes[i] = kh_size(fullSets[i]); \ + if (fstats != nullptr) { \ + fprintf(fstats,"%s\t%lu\n", inPaths[i].c_str(), inSizes[i]); \ + } \ + } \ + \ + if (verbose) { \ + std::cerr << "===============" << std::endl; \ + std::cerr << "2) Intersecting" << std::endl; \ + std::cerr << "===============" << std::endl; \ + } \ + kh_S##version##_t* intersection = kh_init_S##version(); \ + size_t intersectionSize = 0; \ + if (computeIntersection) { \ + if (verbose) { \ + std::cerr << "2.1) Computing intersection" << std::endl; \ + } \ + getIntersection(intersection, fullSets, k, complements); \ + intersectionSize = kh_size(intersection); \ + if (verbose) { \ + std::cerr << " intersection size: " << intersectionSize << std::endl; \ + } \ + if (computeOutput) { \ + if (verbose) { \ + std::cerr << "2.2) Removing this intersection from all k-mer sets" << std::endl; \ + } \ + DifferenceInPlaceData##version data = {fullSets, intersection, k, complements}; \ + kt_for(threads, DifferenceInPlaceThread##version, (void*)&data, setCount); \ + } \ + } \ + if (computeOutput) { \ + for (size_t i = 0; i < setCount; i++) { \ + outSizes.push_back(kh_size(fullSets[i])); \ + if (inSizes[i] != outSizes[i] + intersectionSize) { \ + std::cerr << "Internal error: k-mer set sizes do not correspond " \ + << inSizes[i] << " != " << outSizes[i] << " + " << intersectionSize << std::endl; \ + return 1; \ + } \ + if (verbose){ \ + std::cerr << inSizes[i] << " " << outSizes[i] << " ...inter:" << intersectionSize << std::endl; \ + } \ + } \ + } \ + \ + if (verbose){ \ + std::cerr << "=============" << std::endl; \ + std::cerr << "3) Assembling" << std::endl; \ + std::cerr << "=============" << std::endl; \ + } \ + if (computeOutput) { \ + std::vector ofs (setCount); \ + std::vector filestreams (setCount); \ + \ + for (size_t i = 0; i < setCount; i++) { \ + if (outPaths[i] != "-") { \ + filestreams[i] = std::ofstream(outPaths[i]); \ + ofs[i] = &filestreams[i]; \ + } else { \ + ofs[i] = &std::cout; \ + } \ + if (fstats) { \ + fprintf(fstats,"%s\t%lu\n", outPaths[i].c_str(), outSizes[i]); \ + } \ + } \ + ComputeSimplitigsData##version data = {fullSets, ofs, k, complements, std::vector(setCount)}; \ + kt_for(threads, ComputeSimplitigsThread##version, (void*)&data, setCount); \ + \ + for (size_t i = 0; i < setCount; i++) { \ + if (verbose) { \ + std::cerr << " assembly finished (" << data.simplitigsCounts[i] << " contigs)" << std::endl; \ + } \ + if (filestreams[i].is_open()) { \ + filestreams[i].close(); \ + } \ + } \ + } \ + if (computeIntersection) { \ + std::ostream *of; \ + std::ofstream filestream; \ + if (intersectionPath != "-") { \ + filestream = std::ofstream(intersectionPath); \ + of = &filestream; \ + } \ + else { \ + of = &std::cout; \ + } \ + if (fstats) { \ + fprintf(fstats,"%s\t%lu\n", intersectionPath.c_str(), intersectionSize); \ + } \ + int simplitigCount = ComputeSimplitigs(intersection, *of, k, complements); \ + if (verbose) { \ + std::cerr << " assembly finished (" << simplitigCount << " contigs)" << std::endl; \ + } \ + if (filestream.is_open()) { \ + filestream.close(); \ + } \ + } \ + if (fstats){ \ + fclose(fstats); \ + } \ + return 0; \ +} \ + + +INIT_RUN(64, 64S) +INIT_RUN(64, 64M) +INIT_RUN(128, 128S) +INIT_RUN(128, 128M) + int main(int argc, char **argv) { int32_t k = -1; @@ -176,137 +315,25 @@ int main(int argc, char **argv) { std::cerr << "Number of threads is greater than the number of input sets. Using " << threads << " threads instead." << std::endl; } - if (fstats) { fprintf(fstats,"# cmd: %s",argv[0]); - for (int32_t i = 1; i < argc; i++){ fprintf(fstats," %s",argv[i]); } fprintf(fstats,"\n"); } - if (verbose) { - std::cerr << "=====================" << std::endl; - std::cerr << "1) Loading references" << std::endl; - std::cerr << "=====================" << std::endl; - } - std::vector fullSets(setCount); - std::vector inSizes = std::vector(setCount); - std::vector outSizes; - - - - - for (size_t i = 0; i < setCount; i++) { - fullSets[i] = kh_init_S64(); - } - - ReadKMersData data = {fullSets, inPaths, k, complements}; - kt_for(threads, ReadKMersThread, (void*)&data, setCount); - - for (size_t i = 0; i < setCount; i++) { - if (verbose) { - std::cerr << "Loaded " << inPaths[i] << std::endl; - } - inSizes[i] = kh_size(fullSets[i]); - if (fstats != nullptr) { - fprintf(fstats,"%s\t%lu\n", inPaths[i].c_str(), inSizes[i]); + if (k <= 32) { + if (MINIMUM_ABUNDANCE == (byte)1) { + return run64S(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount); + } else { + return run64M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount); } - } - - if (verbose) { - std::cerr << "===============" << std::endl; - std::cerr << "2) Intersecting" << std::endl; - std::cerr << "===============" << std::endl; - } - kh_S64_t* intersection = nullptr; - size_t intersectionSize = 0; - if (computeIntersection) { - if (verbose) { - std::cerr << "2.1) Computing intersection" << std::endl; + } else { + if (MINIMUM_ABUNDANCE == (byte)1) { + return run128S(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount); + } else { + return run128M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount); } - intersection = getIntersection(fullSets, k, complements); - intersectionSize = kh_size(intersection); - if (verbose) { - std::cerr << " intersection size: " << intersectionSize << std::endl; - } - if (computeOutput) { - if (verbose) { - std::cerr << "2.2) Removing this intersection from all k-mer sets" << std::endl; - } - DifferenceInPlaceData data = {fullSets, intersection, k, complements}; - kt_for(threads, DifferenceInPlaceThread, (void*)&data, setCount); - } - } - if (computeOutput) { - for (size_t i = 0; i < setCount; i++) { - outSizes.push_back(kh_size(fullSets[i])); - if (inSizes[i] != outSizes[i] + intersectionSize) { - std::cerr << "Internal error: k-mer set sizes do not correspond " << inSizes[i] << " != " << outSizes[i] << " + " << intersectionSize << std::endl; - return 1; - } - if (verbose){ - std::cerr << inSizes[i] << " " << outSizes[i] << " ...inter:" << intersectionSize << std::endl; - } - } - } - - if (verbose){ - std::cerr << "=============" << std::endl; - std::cerr << "3) Assembling" << std::endl; - std::cerr << "=============" << std::endl; - } - if (computeOutput) { - std::vector ofs (setCount); - std::vector filestreams (setCount); - - for (size_t i = 0; i < setCount; i++) { - if (outPaths[i] != "-") { - filestreams[i] = std::ofstream(outPaths[i]); - ofs[i] = &filestreams[i]; - } else { - ofs[i] = &std::cout; - } - if (fstats) { - fprintf(fstats,"%s\t%lu\n", outPaths[i].c_str(), outSizes[i]); - } - } - ComputeSimplitigsData data = {fullSets, ofs, k, complements, std::vector(setCount)}; - kt_for(threads, ComputeSimplitigsThread, (void*)&data, setCount); - - for (size_t i = 0; i < setCount; i++) { - if (verbose) { - std::cerr << " assembly finished (" << data.simplitigsCounts[i] << " contigs)" << std::endl; - } - if (filestreams[i].is_open()) { - filestreams[i].close(); - } - } - } - if (computeIntersection) { - std::ostream *of; - std::ofstream filestream; - if (intersectionPath != "-") { - filestream = std::ofstream(intersectionPath); - of = &filestream; - } - else { - of = &std::cout; - } - if (fstats) { - fprintf(fstats,"%s\t%lu\n", intersectionPath.c_str(), intersectionSize); - } - int simplitigCount = ComputeSimplitigs(intersection, *of, k, complements); - if (verbose) { - std::cerr << " assembly finished (" << simplitigCount << " contigs)" << std::endl; - } - if (filestream.is_open()) { - filestream.close(); - } - } - if (fstats){ - fclose(fstats); } - return 0; } diff --git a/src/parser.h b/src/parser.h index e58f74f..97aaf42 100644 --- a/src/parser.h +++ b/src/parser.h @@ -9,67 +9,76 @@ #include "khash_utils.h" -/// Read encoded k-mers from the given fasta file. -/// Return unique k-mers in no particular order. -/// If complements is set to true, the result contains only one of the complementary k-mers - it is not guaranteed which one. -/// This runs in O(sequence length) expected time. -void ReadKMers(kh_S64_t *kMers, std::string &path, int k, bool complements) { - std::ifstream filestream; - std::istream *fasta; - if (path == "-") { - fasta = &std::cin; - } else { - filestream.open(path); - fasta = &filestream; - if (!filestream.is_open()) { - std::cerr << "Error: file '" << path << "' could not be open." << std::endl; - exit(1); - } - } - char c; - int beforeKMerEnd = k; - kmer_t currentKMer = 0; - // mask that works even for k=32. - kmer_t mask = (((kmer_t) 1) << (2 * k - 1)); - mask |= mask - 1; - bool readingHeader = false; - while ((*fasta) >> std::noskipws >> c) { - if (c == '>') { - readingHeader = true; - currentKMer = 0; - beforeKMerEnd = k; - } - else if (c == '\n') readingHeader = false; - if (readingHeader) continue; - auto data = NucleotideToInt(c); - // Disregard white space. - if (c == '\n' || c == '\r' || c == ' ') continue; - if (data == -1) { - currentKMer = 0; - beforeKMerEnd = k; - continue; - } - currentKMer <<= 2; - currentKMer &= mask; - currentKMer |= data; - if(beforeKMerEnd > 0) --beforeKMerEnd; - if (beforeKMerEnd == 0) { - insertKMer(kMers, currentKMer, k, complements); - } - } - if (filestream.is_open()) filestream.close(); -} +#define INIT_PARSER(type, variant) \ +/* Read encoded k-mers from the given fasta file. \ + * Return unique k-mers in no particular order. \ + * If complements is set to true, the result contains only one of the complementary k-mers \ + * - it is not guaranteed which one. \ + * This runs in O(sequence length) expected time. \ + */ \ +void ReadKMers(kh_S##variant##_t *kMers, std::string &path, int k, bool complements) { \ + std::ifstream filestream; \ + std::istream *fasta; \ + if (path == "-") { \ + fasta = &std::cin; \ + } else { \ + filestream.open(path); \ + fasta = &filestream; \ + if (!filestream.is_open()) { \ + std::cerr << "Error: file '" << path << "' could not be open." << std::endl; \ + exit(1); \ + } \ + } \ + char c; \ + int beforeKMerEnd = k; \ + kmer##type##_t currentKMer = 0; \ + /* mask that works even for k=32. */ \ + kmer##type##_t mask = (((kmer##type##_t) 1) << (2 * k - 1)); \ + mask |= mask - 1; \ + bool readingHeader = false; \ + while ((*fasta) >> std::noskipws >> c) { \ + if (c == '>') { \ + readingHeader = true; \ + currentKMer = 0; \ + beforeKMerEnd = k; \ + } \ + else if (c == '\n') readingHeader = false; \ + if (readingHeader) continue; \ + auto data = NucleotideToInt(c); \ + /* Disregard white space. */ \ + if (c == '\n' || c == '\r' || c == ' ') continue; \ + if (data == -1) { \ + currentKMer = 0; \ + beforeKMerEnd = k; \ + continue; \ + } \ + currentKMer <<= 2; \ + currentKMer &= mask; \ + currentKMer |= data; \ + if(beforeKMerEnd > 0) --beforeKMerEnd; \ + if (beforeKMerEnd == 0) { \ + insertKMer(kMers, currentKMer, k, complements); \ + } \ + } \ + if (filestream.is_open()) filestream.close(); \ +} \ + \ + \ +/* Data for parallel reading of k-mers. */ \ +struct ReadKMersData##variant { \ + std::vector kMers; \ + std::vector paths; \ + int k; \ + bool complements; \ +}; \ + \ +/* Parallel wrapper for ReadKMers. */ \ +void ReadKMersThread##variant(void *arg, long i, int _) { \ + auto *data = (ReadKMersData##variant *) arg; \ + ReadKMers(data->kMers[i], data->paths[i], data->k, data->complements); \ +} \ -/// Data for parallel reading of k-mers. -struct ReadKMersData { - std::vector kMers; - std::vector paths; - int k; - bool complements; -}; - -/// Parallel wrapper for ReadKMers. -void ReadKMersThread(void *arg, long i, int _) { - auto *data = (ReadKMersData *) arg; - ReadKMers(data->kMers[i], data->paths[i], data->k, data->complements); -} +INIT_PARSER(64, 64S) +INIT_PARSER(64, 64M) +INIT_PARSER(128, 128S) +INIT_PARSER(128, 128M) diff --git a/src/prophasm.h b/src/prophasm.h index 6d01f1a..6f033d6 100644 --- a/src/prophasm.h +++ b/src/prophasm.h @@ -14,7 +14,8 @@ /// Find the right extension to the provided last k-mer from the kMers by trying to append each of {A, C, G, T}. /// Return the extension - that is the d chars extending the simplitig - and the extending kMer. -std::pair RightExtension(kmer_t last, kh_S64_t *kMers, int k, bool complements) { +template +std::pair RightExtension(kmer_t last, KHT *kMers, int k, bool complements) { for (kmer_t ext = 0; ext < 4; ++ext) { kmer_t next = (BitSuffix(last, k - 1) << 2) | ext; if (containsKMer(kMers, next, k, complements)) { @@ -26,7 +27,8 @@ std::pair RightExtension(kmer_t last, kh_S64_t *kMers, int k, bo /// Find the left extension to the provided first k-mer from the kMers by trying to prepend each of {A, C, G, T}. /// Return the extension - that is the d chars extending the simplitig - and the extending kMer. -std::pair LeftExtension(kmer_t first, kh_S64_t *kMers, int k, bool complements) { +template +std::pair LeftExtension(kmer_t first, KHT *kMers, int k, bool complements) { for (kmer_t ext = 0; ext < 4; ++ext) { kmer_t next = (ext << ((k - 1) << 1)) | BitPrefix(first, k, k - 1); if (containsKMer(kMers, next, k, complements)) { @@ -39,7 +41,8 @@ std::pair LeftExtension(kmer_t first, kh_S64_t *kMers, int k, b /// Find the next simplitig. /// Also remove the used k-mers from kMers. /// If complements are true, it is expected that kMers only contain one k-mer from a complementary pair. -void NextSimplitig(kh_S64_t *kMers, kmer_t begin, std::ostream& of, int k, bool complements, int simplitigID) { +template +void NextSimplitig(KHT *kMers, kmer_t begin, std::ostream& of, int k, bool complements, int simplitigID) { // Maintain the first and last k-mer in the simplitig. kmer_t last = begin, first = begin; std::string initialKMer = NumberToKMer(begin, k); @@ -77,36 +80,44 @@ void NextSimplitig(kh_S64_t *kMers, kmer_t begin, std::ostream& of, int k, bool of << std::string(simplitig.begin(), simplitig.end()) << std::endl; } -/// Heuristically compute simplitigs. -/// -/// If complements are provided, treat k-mer and its complement as identical. -/// If this is the case, k-mers are expected not to contain both k-mer and its complement. -/// Warning: this will destroy kMers. -int ComputeSimplitigs(kh_S64_t *kMers, std::ostream& of, int k, bool complements) { - size_t lastIndex = 0; - kmer_t begin = 0; - int simplitigID = 0; - while(true) { - bool found = nextKMer(kMers, lastIndex, begin); - // No more k-mers. - if (!found) return simplitigID; - NextSimplitig(kMers, begin, of, k, complements, simplitigID++); - } -} -/// Data for parallel computation of simplitigs. -struct ComputeSimplitigsData { - std::vector kMers; - std::vector ofs; - int k; - bool complements; - std::vector simplitigsCounts; -}; - -/// Parallel wrapper for ComputeSimplitigs. -void ComputeSimplitigsThread(void *arg, long i, int _) { - auto *data = (ComputeSimplitigsData *) arg; - data->simplitigsCounts[i] = ComputeSimplitigs(data->kMers[i], *data->ofs[i], data->k, data->complements); -} +#define INIT_PROPHASM(type, variant) \ + \ +/* Heuristically compute simplitigs. \ + * \ + * If complements are provided, treat k-mer and its complement as identical. \ + * If this is the case, k-mers are expected not to contain both k-mer and its complement. \ + * Warning: this will destroy kMers. \ + */ \ +int ComputeSimplitigs(kh_S##variant##_t *kMers, std::ostream& of, int k, bool complements) { \ + size_t lastIndex = 0; \ + kmer##type##_t begin = 0; \ + int simplitigID = 0; \ + while(true) { \ + bool found = nextKMer(kMers, lastIndex, begin); \ + /* No more k-mers. */ \ + if (!found) return simplitigID; \ + NextSimplitig(kMers, begin, of, k, complements, simplitigID++); \ + } \ +} \ + \ +/* Data for parallel computation of simplitigs. */ \ +struct ComputeSimplitigsData##variant { \ + std::vector kMers; \ + std::vector ofs; \ + int k; \ + bool complements; \ + std::vector simplitigsCounts; \ +}; \ + \ +/* Parallel wrapper for ComputeSimplitigs. */ \ +void ComputeSimplitigsThread##variant(void *arg, long i, int _) { \ + auto *data = (ComputeSimplitigsData##variant *) arg; \ + data->simplitigsCounts[i] = ComputeSimplitigs(data->kMers[i], *data->ofs[i], data->k, data->complements); \ +} \ +INIT_PROPHASM(64, 64S) +INIT_PROPHASM(64, 64M) +INIT_PROPHASM(128, 128S) +INIT_PROPHASM(128, 128M) diff --git a/tests/khash_utils_unittest.h b/tests/khash_utils_unittest.h index 0543f35..49d9741 100644 --- a/tests/khash_utils_unittest.h +++ b/tests/khash_utils_unittest.h @@ -16,9 +16,9 @@ namespace { }; for (auto t: tests) { - auto kMers = kh_init_S64(); + auto kMers = kh_init_S64M(); int ret; - for (auto &&kMer : t.kMers) kh_put_S64(kMers, kMer, &ret); + for (auto &&kMer : t.kMers) kh_put_S64M(kMers, kMer, &ret); auto got = kMersToVec(kMers); sort(t.kMers.begin(), t.kMers.end()); @@ -45,14 +45,15 @@ namespace { }; for (auto t : tests) { - std::vector input (t.kMerSets.size()); + std::vector input (t.kMerSets.size()); for (size_t i = 0; i < t.kMerSets.size(); ++i) { - input[i] = kh_init_S64(); + input[i] = kh_init_S64M(); int ret; - for (auto &&kMer : t.kMerSets[i]) kh_put_S64(input[i], kMer, &ret); + for (auto &&kMer : t.kMerSets[i]) kh_put_S64M(input[i], kMer, &ret); } - auto gotResult = getIntersection(input, t.k, t.complements); + kh_S64M_t * gotResult = kh_init_S64M(); + getIntersection(gotResult, input, t.k, t.complements); auto gotResultVec = kMersToVec(gotResult); sort(t.wantResult.begin(), t.wantResult.end()); sort(gotResultVec.begin(), gotResultVec.end()); @@ -84,11 +85,11 @@ namespace { }; for (auto t : tests) { - kh_S64_t * input = kh_init_S64(); + kh_S64M_t * input = kh_init_S64M(); int ret; - for (auto &&kMer : t.kMers) kh_put_S64(input, kMer, &ret); - auto intersection = kh_init_S64(); - for (auto &&kMer : t.toRemove) kh_put_S64(intersection, kMer, &ret); + for (auto &&kMer : t.kMers) kh_put_S64M(input, kMer, &ret); + auto intersection = kh_init_S64M(); + for (auto &&kMer : t.toRemove) kh_put_S64M(intersection, kMer, &ret); differenceInPlace(input, intersection, t.k, t.complements); @@ -119,7 +120,7 @@ namespace { for (auto t : tests) { // Turn off optimizations for MINIMUM_ABUNDANCE being 1. MINIMUM_ABUNDANCE = 255; - kh_S64_t * input = kh_init_S64(); + kh_S64M_t * input = kh_init_S64M(); for (auto &&[abundance, kMer] : t.kMers) for (byte i = 0; i < abundance; ++i) insertKMer(input, kMer, t.k, t.complements); diff --git a/tests/kmers_unittest.h b/tests/kmers_unittest.h index 2252f26..b71d776 100644 --- a/tests/kmers_unittest.h +++ b/tests/kmers_unittest.h @@ -49,7 +49,7 @@ namespace { TEST(KMers, NumberToKMer) { struct TestCase { - kmer_t encoded; + kmer_t encoded ; int d; std::string wantResult; }; @@ -67,7 +67,7 @@ namespace { } } - TEST(KMers, MaskForK) { + TEST(KMers, MaskForK64) { struct TestCase { int k; kmer_t wantResult; @@ -80,7 +80,7 @@ namespace { }; for (auto t: tests) { - kmer_t gotResult = MaskForK(t.k); + kmer_t gotResult = MaskForK64(t.k); EXPECT_EQ(t.wantResult, gotResult); } diff --git a/tests/prophasm_unittest.h b/tests/prophasm_unittest.h index 1e53a93..f73a2c9 100644 --- a/tests/prophasm_unittest.h +++ b/tests/prophasm_unittest.h @@ -21,9 +21,9 @@ namespace { }; for (auto t: tests) { - auto kMers = kh_init_S64(); + auto kMers = kh_init_S64M(); int ret; - for (auto &&kMer : t.kMers) kh_put_S64(kMers, kMer, &ret); + for (auto &&kMer : t.kMers) kh_put_S64M(kMers, kMer, &ret); auto got = RightExtension(t.last, kMers, t.k, t.complements); auto gotExt = got.first; @@ -51,9 +51,9 @@ namespace { }; for (auto t: tests) { - auto kMers = kh_init_S64(); + auto kMers = kh_init_S64M(); int ret; - for (auto &&kMer : t.kMers) kh_put_S64(kMers, kMer, &ret); + for (auto &&kMer : t.kMers) kh_put_S64M(kMers, kMer, &ret); auto got = LeftExtension(t.first, kMers, t.k, t.complements); auto gotExt = got.first; @@ -85,9 +85,9 @@ namespace { for (auto &&t: tests) { std::stringstream of; - auto kMers = kh_init_S64(); + auto kMers = kh_init_S64M(); int ret; - for (auto &&kMer : t.kMers) kh_put_S64(kMers, kMer, &ret); + for (auto &&kMer : t.kMers) kh_put_S64M(kMers, kMer, &ret); NextSimplitig(kMers, t.kMers.front(), of, t.k, t.complements, t.simplitigID); auto remainingKmers = kMersToVec(kMers); @@ -116,9 +116,9 @@ namespace { for (auto t: tests) { std::stringstream of; - auto kMers = kh_init_S64(); + auto kMers = kh_init_S64M(); int ret; - for (auto &&kMer : t.kMers) kh_put_S64(kMers, kMer, &ret); + for (auto &&kMer : t.kMers) kh_put_S64M(kMers, kMer, &ret); ComputeSimplitigs(kMers, of, t.k, t.complements); diff --git a/tests/unittest.cpp b/tests/unittest.cpp index 8fa1759..dca659b 100644 --- a/tests/unittest.cpp +++ b/tests/unittest.cpp @@ -1,3 +1,6 @@ +#include "../src/kmers.h" +typedef kmer64_t kmer_t; + #include "kmers_unittest.h" #include "prophasm_unittest.h" #include "khash_utils_unittest.h"