Merge pull request #72 from OndrejSladky/refactor-ac

Refactoring AC code to a separate directory to simplify code structure.
OndrejSladky · Aug 21, 2024 · b067518 · b067518
2 parents 21c262f + 0d311e2
commit b067518
Show file tree

Hide file tree

Showing 19 changed files with 144 additions and 148 deletions.
diff --git a/src/ac_automaton.h → src/ac/ac_automaton.h b/src/ac_automaton.h → src/ac/ac_automaton.h
@@ -3,7 +3,7 @@
 #include <list>
 #include <vector>
 
-#include "models.h"
+#include "kmers_ac.h"
 
 constexpr int INVALID_STATE = -1;
 struct ACState {

diff --git a/src/global_ac.h → src/ac/global_ac.h b/src/global_ac.h → src/ac/global_ac.h
@@ -10,8 +10,7 @@
 #include <fstream>
 #include <algorithm>
 
-#include "models.h"
-#include "kmers.h"
+#include "kmers_ac.h"
 #include "ac_automaton.h"
 
 

diff --git a/src/ac/kmers_ac.h b/src/ac/kmers_ac.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <string>
+#include <vector>
+
+struct KMer {
+    std::string value;
+    size_t length() const{ return value.size(); }
+};
+
+/// Return the complementary nucleotide for the given one.
+char ComplementaryNucleotide(const char nucleotide) {
+    if (nucleotide == 'A') return 'T';
+    else if (nucleotide == 'T') return 'A';
+    else if (nucleotide == 'G') return 'C';
+    else if (nucleotide == 'C') return 'G';
+    throw std::invalid_argument("cannot find complementary nucleotide for letter " + std::string(1, nucleotide));
+}
+
+/// Compute the reverse complement of the given k-mer.
+KMer ReverseComplement(const KMer &kMer) {
+    KMer ans;
+    ans.value.reserve(kMer.length());
+    for (int i = (int)kMer.length() - 1; i >= 0; --i) {
+        ans.value += ComplementaryNucleotide(kMer.value[i]);
+    }
+    return ans;
+}
+
+/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
+/// If non-existing nucleotide is given, return -1.
+int NucleotideToInt (char c) {
+    switch (c) {
+        case 'A': return 0;
+        case 'C': return 1;
+        case 'G': return 2;
+        case 'T': return 3;
+        case 'a': return 0;
+        case 'c': return 1;
+        case 'g': return 2;
+        case 't': return 3;
+        default: return -1;
+    }
+}
+
diff --git a/src/local_ac.h → src/ac/local_ac.h b/src/local_ac.h → src/ac/local_ac.h
@@ -7,7 +7,7 @@
 #include <algorithm>
 #include <fstream>
 
-#include "models.h"
+#include "kmers_ac.h"
 #include "ac_automaton.h"
 
 /// Find the index of the first extending k-mer from the incidentKMers which is not forbidden.

diff --git a/src/ac/parser_ac.h b/src/ac/parser_ac.h
@@ -0,0 +1,79 @@
+#pragma once
+
+// Implicitly initialize kseq.
+#include "../parser.h"
+
+#include "kmers_ac.h"
+
+/// Record of one fasta sequence.
+struct FastaRecord {
+    // The name of the record. Starts with '>'.
+    std::string name;
+    // The genetic sequence.
+    std::string sequence;
+};
+
+/// Read fasta file with given path.
+std::vector<FastaRecord> ReadFasta(std::string &path) {
+    gzFile fp;
+    kseq_t *seq;
+    std::vector<FastaRecord> records;
+
+    fp = gzopen(path.c_str(), "r");
+    if (fp == nullptr) {
+        throw std::invalid_argument("couldn't open file " + path);
+    }
+    seq = kseq_init(fp);
+    while (kseq_read(seq) >= 0) {
+        records.push_back(FastaRecord{
+                seq->name.s,
+                seq->seq.s,
+        });
+    }
+    kseq_destroy(seq);
+    gzclose(fp);
+    return records;
+}
+
+/// Create a list of unique k-mers in no particular order.
+/// This runs in O(k*data.size) expected time.
+void AddKMersFromSequence(std::unordered_set<std::string> &kMers, std::string &data, int k) {
+    // Convert the sequence to uppercase letters.
+    std::transform(data.begin(), data.end(), data.begin(), toupper);
+    size_t possibleKMerEnd = k;
+    for (size_t i = 1; i <= data.size(); ++i) {
+        if (data[i-1] != 'A' && data[i-1] != 'C' && data[i-1] != 'G' && data[i-1] != 'T') {
+            // Skip this and the next k-1 k-mers.
+            possibleKMerEnd = i + k;
+        }
+        if (i >= possibleKMerEnd) {
+            kMers.insert(data.substr(i - k, k));
+        }
+    }
+}
+
+/// Return only the subset of k-mers where no two k-mers are complements of one another.
+/// The k-mers are chosen with no particular property.
+std::unordered_set<std::string> FilterKMersWithComplement(std::unordered_set<std::string> &kMers) {
+    std::unordered_set<std::string> ret;
+    for (auto &&x : kMers) {
+        auto kMer = KMer{x};
+        if (ret.count(ReverseComplement(kMer).value) == 0) ret.insert(x);
+    }
+    return ret;
+}
+
+/// Create a list of unique k-mers in no particular order.
+/// This runs in O(k*data.size) expected time.
+std::vector<KMer> ConstructKMers(std::vector<FastaRecord> &data, int k, bool complements) {
+    std::unordered_set<std::string> uniqueKMers;
+    for (auto &&record : data) {
+        AddKMersFromSequence(uniqueKMers, record.sequence, k);
+    }
+    if (complements) uniqueKMers = FilterKMersWithComplement(uniqueKMers);
+    std::vector<KMer> result;
+    for (auto &kMer : uniqueKMers) {
+        result.push_back(KMer{kMer});
+    }
+    return result;
+}
diff --git a/src/streaming.h → src/ac/streaming.h b/src/streaming.h → src/ac/streaming.h
@@ -5,7 +5,7 @@
 #include <cassert>
 #include <algorithm>
 
-#include "kmers.h"
+#include "../kmers.h"
 
 void Push(std::ostream &of, const std::string &current, int k, int32_t used) {
     if (current.size() == static_cast<size_t>(k) && used != 0) {

diff --git a/src/global.h b/src/global.h
@@ -7,7 +7,6 @@
 #include <algorithm>
 #include <cstdint>
 
-#include "models.h"
 #include "kmers.h"
 #include "khash.h"
 #include "khash_utils.h"

diff --git a/src/kmers.h b/src/kmers.h
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <cstdint>
 
-#include "models.h"
+#include "ac/kmers_ac.h"
 
 #ifdef LARGE_KMERS
     typedef __uint128_t kmer_t;
@@ -14,22 +14,6 @@
     constexpr int KMER_T_SIZE = 64;
 #endif
 
-/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
-/// If non-existing nucleotide is given, return -1.
-int NucleotideToInt (char c) {
-    switch (c) {
-        case 'A': return 0;
-        case 'C': return 1;
-        case 'G': return 2;
-        case 'T': return 3;
-        case 'a': return 0;
-        case 'c': return 1;
-        case 'g': return 2;
-        case 't': return 3;
-        default: return -1;
-    }
-}
-
 
 static const uint8_t nucleotideToInt[] = {
 		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
@@ -105,32 +89,8 @@ kmer_t ReverseComplement(kmer_t kMer, int k) {
     return (((kmer_t)word_reverse_complement(kMer)) >> (KMER_T_SIZE - (k << kmer_t(1)))) & ((kmer_t(1) << (k << kmer_t (1))) - kmer_t(1));
 }
 
-/// Return the complementary nucleotide for the given one.
-char ComplementaryNucleotide(const char nucleotide) {
-    if (nucleotide == 'A') return 'T';
-    else if (nucleotide == 'T') return 'A';
-    else if (nucleotide == 'G') return 'C';
-    else if (nucleotide == 'C') return 'G';
-    throw std::invalid_argument("cannot find complementary nucleotide for letter " + std::string(1, nucleotide));
-}
-
-/// Compute the reverse complement of the given k-mer.
-KMer ReverseComplement(const KMer &kMer) {
-    KMer ans;
-    ans.value.reserve(kMer.length());
-    for (int i = (int)kMer.length() - 1; i >= 0; --i) {
-        ans.value += ComplementaryNucleotide(kMer.value[i]);
-    }
-    return ans;
-}
-
 const char letters[4] {'A', 'C', 'G', 'T'};
 
-/// Get the uppercase version of the character.
-inline char UpperToLower(char c) {
-    return c - 'A' + 'a';
-}
-
 /// Return the index-th nucleotide from the encoded k-mer.
 inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
     return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];

diff --git a/src/local.h b/src/local.h
@@ -8,7 +8,6 @@
 #include <algorithm>
 #include <fstream>
 
-#include "models.h"
 #include "kmers.h"
 #include "khash_utils.h"
 

diff --git a/src/main.cpp b/src/main.cpp
@@ -3,13 +3,13 @@
 
 #include "unistd.h"
 #include "version.h"
-#include "global_ac.h"
+#include "ac/global_ac.h"
 #include "global.h"
 #include "local.h"
-#include "local_ac.h"
+#include "ac/local_ac.h"
 #include "parser.h"
-#include "streaming.h"
-#include "output.h"
+#include "ac/parser_ac.h"
+#include "ac/streaming.h"
 #include "khash_utils.h"
 #include "masks.h"
 

diff --git a/src/models.h b/src/models.h
diff --git a/src/output.h b/src/output.h
diff --git a/src/parser.h b/src/parser.h
@@ -9,82 +9,9 @@
 #include "kseq.h"
 KSEQ_INIT(gzFile, gzread)
 
-#include "models.h"
 #include "kmers.h"
 #include "khash_utils.h"
 
-/// Record of one fasta sequence.
-struct FastaRecord {
-    // The name of the record. Starts with '>'.
-    std::string name;
-    // The genetic sequence.
-    std::string sequence;
-};
-
-/// Read fasta file with given path.
-std::vector<FastaRecord> ReadFasta(std::string &path) {
-    gzFile fp;
-    kseq_t *seq;
-    std::vector<FastaRecord> records;
-
-    fp = gzopen(path.c_str(), "r");
-    if (fp == nullptr) {
-        throw std::invalid_argument("couldn't open file " + path);
-    }
-    seq = kseq_init(fp);
-    while (kseq_read(seq) >= 0) {
-        records.push_back(FastaRecord{
-            seq->name.s,
-            seq->seq.s,
-        });
-    }
-    kseq_destroy(seq);
-    gzclose(fp);
-    return records;
-}
-
-/// Create a list of unique k-mers in no particular order.
-/// This runs in O(k*data.size) expected time.
-void AddKMersFromSequence(std::unordered_set<std::string> &kMers, std::string &data, int k) {
-    // Convert the sequence to uppercase letters.
-    std::transform(data.begin(), data.end(), data.begin(), toupper);
-    size_t possibleKMerEnd = k;
-    for (size_t i = 1; i <= data.size(); ++i) {
-        if (data[i-1] != 'A' && data[i-1] != 'C' && data[i-1] != 'G' && data[i-1] != 'T') {
-            // Skip this and the next k-1 k-mers.
-            possibleKMerEnd = i + k;
-        }
-        if (i >= possibleKMerEnd) {
-            kMers.insert(data.substr(i - k, k));
-        }
-    }
-}
-
-/// Return only the subset of k-mers where no two k-mers are complements of one another.
-/// The k-mers are chosen with no particular property.
-std::unordered_set<std::string> FilterKMersWithComplement(std::unordered_set<std::string> &kMers) {
-    std::unordered_set<std::string> ret;
-    for (auto &&x : kMers) {
-        auto kMer = KMer{x};
-        if (ret.count(ReverseComplement(kMer).value) == 0) ret.insert(x);
-    }
-    return ret;
-}
-
-/// Create a list of unique k-mers in no particular order.
-/// This runs in O(k*data.size) expected time.
-std::vector<KMer> ConstructKMers(std::vector<FastaRecord> &data, int k, bool complements) {
-    std::unordered_set<std::string> uniqueKMers;
-    for (auto &&record : data) {
-        AddKMersFromSequence(uniqueKMers, record.sequence, k);
-    }
-    if (complements) uniqueKMers = FilterKMersWithComplement(uniqueKMers);
-    std::vector<KMer> result;
-    for (auto &kMer : uniqueKMers) {
-        result.push_back(KMer{kMer});
-    }
-    return result;
-}
 
 /// Fill the k-mer dictionary with k-mers from the given sequence.
 /// If complements is true, always add the canonical k-mers.
@@ -161,3 +88,9 @@ void AssertEOF(kseq_t *seq, std::string message) {
         throw std::invalid_argument(message);
     }
 }
+
+/// Print the fasta file header.
+void WriteName(const int k, std::ostream &of) {
+    of << ">superstring ";
+    of << "k=" << k << std::endl;
+}
diff --git a/tests/ac_automaton_unittest.h b/tests/ac_automaton_unittest.h
@@ -1,5 +1,5 @@
 #pragma once
-#include "../src/ac_automaton.h"
+#include "../src/ac/ac_automaton.h"
 
 #include "gtest/gtest.h"
 

diff --git a/tests/global_ac_unittest.h b/tests/global_ac_unittest.h
@@ -1,5 +1,5 @@
 #pragma once
-#include "../src/global_ac.h"
+#include "../src/ac/global_ac.h"
 
 #include "gtest/gtest.h"
 

diff --git a/tests/kmers_unittest.h b/tests/kmers_unittest.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "../src/kmers.h"
+#include "../src/ac/kmers_ac.h"
 
 #include "gtest/gtest.h"