Skip to content

Commit

Permalink
Merge pull request #72 from OndrejSladky/refactor-ac
Browse files Browse the repository at this point in the history
Refactoring AC code to a separate directory to simplify code structure.
  • Loading branch information
OndrejSladky authored Aug 21, 2024
2 parents 21c262f + 0d311e2 commit b067518
Show file tree
Hide file tree
Showing 19 changed files with 144 additions and 148 deletions.
2 changes: 1 addition & 1 deletion src/ac_automaton.h → src/ac/ac_automaton.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include <list>
#include <vector>

#include "models.h"
#include "kmers_ac.h"

constexpr int INVALID_STATE = -1;
struct ACState {
Expand Down
3 changes: 1 addition & 2 deletions src/global_ac.h → src/ac/global_ac.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
#include <fstream>
#include <algorithm>

#include "models.h"
#include "kmers.h"
#include "kmers_ac.h"
#include "ac_automaton.h"


Expand Down
44 changes: 44 additions & 0 deletions src/ac/kmers_ac.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#pragma once
#include <string>
#include <vector>

struct KMer {
std::string value;
size_t length() const{ return value.size(); }
};

/// Return the complementary nucleotide for the given one.
char ComplementaryNucleotide(const char nucleotide) {
if (nucleotide == 'A') return 'T';
else if (nucleotide == 'T') return 'A';
else if (nucleotide == 'G') return 'C';
else if (nucleotide == 'C') return 'G';
throw std::invalid_argument("cannot find complementary nucleotide for letter " + std::string(1, nucleotide));
}

/// Compute the reverse complement of the given k-mer.
KMer ReverseComplement(const KMer &kMer) {
KMer ans;
ans.value.reserve(kMer.length());
for (int i = (int)kMer.length() - 1; i >= 0; --i) {
ans.value += ComplementaryNucleotide(kMer.value[i]);
}
return ans;
}

/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
/// If non-existing nucleotide is given, return -1.
int NucleotideToInt (char c) {
switch (c) {
case 'A': return 0;
case 'C': return 1;
case 'G': return 2;
case 'T': return 3;
case 'a': return 0;
case 'c': return 1;
case 'g': return 2;
case 't': return 3;
default: return -1;
}
}

2 changes: 1 addition & 1 deletion src/local_ac.h → src/ac/local_ac.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <algorithm>
#include <fstream>

#include "models.h"
#include "kmers_ac.h"
#include "ac_automaton.h"

/// Find the index of the first extending k-mer from the incidentKMers which is not forbidden.
Expand Down
79 changes: 79 additions & 0 deletions src/ac/parser_ac.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#pragma once

// Implicitly initialize kseq.
#include "../parser.h"

#include "kmers_ac.h"

/// Record of one fasta sequence.
struct FastaRecord {
// The name of the record. Starts with '>'.
std::string name;
// The genetic sequence.
std::string sequence;
};

/// Read fasta file with given path.
std::vector<FastaRecord> ReadFasta(std::string &path) {
gzFile fp;
kseq_t *seq;
std::vector<FastaRecord> records;

fp = gzopen(path.c_str(), "r");
if (fp == nullptr) {
throw std::invalid_argument("couldn't open file " + path);
}
seq = kseq_init(fp);
while (kseq_read(seq) >= 0) {
records.push_back(FastaRecord{
seq->name.s,
seq->seq.s,
});
}
kseq_destroy(seq);
gzclose(fp);
return records;
}

/// Create a list of unique k-mers in no particular order.
/// This runs in O(k*data.size) expected time.
void AddKMersFromSequence(std::unordered_set<std::string> &kMers, std::string &data, int k) {
// Convert the sequence to uppercase letters.
std::transform(data.begin(), data.end(), data.begin(), toupper);
size_t possibleKMerEnd = k;
for (size_t i = 1; i <= data.size(); ++i) {
if (data[i-1] != 'A' && data[i-1] != 'C' && data[i-1] != 'G' && data[i-1] != 'T') {
// Skip this and the next k-1 k-mers.
possibleKMerEnd = i + k;
}
if (i >= possibleKMerEnd) {
kMers.insert(data.substr(i - k, k));
}
}
}

/// Return only the subset of k-mers where no two k-mers are complements of one another.
/// The k-mers are chosen with no particular property.
std::unordered_set<std::string> FilterKMersWithComplement(std::unordered_set<std::string> &kMers) {
std::unordered_set<std::string> ret;
for (auto &&x : kMers) {
auto kMer = KMer{x};
if (ret.count(ReverseComplement(kMer).value) == 0) ret.insert(x);
}
return ret;
}

/// Create a list of unique k-mers in no particular order.
/// This runs in O(k*data.size) expected time.
std::vector<KMer> ConstructKMers(std::vector<FastaRecord> &data, int k, bool complements) {
std::unordered_set<std::string> uniqueKMers;
for (auto &&record : data) {
AddKMersFromSequence(uniqueKMers, record.sequence, k);
}
if (complements) uniqueKMers = FilterKMersWithComplement(uniqueKMers);
std::vector<KMer> result;
for (auto &kMer : uniqueKMers) {
result.push_back(KMer{kMer});
}
return result;
}
2 changes: 1 addition & 1 deletion src/streaming.h → src/ac/streaming.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <cassert>
#include <algorithm>

#include "kmers.h"
#include "../kmers.h"

void Push(std::ostream &of, const std::string &current, int k, int32_t used) {
if (current.size() == static_cast<size_t>(k) && used != 0) {
Expand Down
1 change: 0 additions & 1 deletion src/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include <algorithm>
#include <cstdint>

#include "models.h"
#include "kmers.h"
#include "khash.h"
#include "khash_utils.h"
Expand Down
42 changes: 1 addition & 41 deletions src/kmers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <iostream>
#include <cstdint>

#include "models.h"
#include "ac/kmers_ac.h"

#ifdef LARGE_KMERS
typedef __uint128_t kmer_t;
Expand All @@ -14,22 +14,6 @@
constexpr int KMER_T_SIZE = 64;
#endif

/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
/// If non-existing nucleotide is given, return -1.
int NucleotideToInt (char c) {
switch (c) {
case 'A': return 0;
case 'C': return 1;
case 'G': return 2;
case 'T': return 3;
case 'a': return 0;
case 'c': return 1;
case 'g': return 2;
case 't': return 3;
default: return -1;
}
}


static const uint8_t nucleotideToInt[] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
Expand Down Expand Up @@ -105,32 +89,8 @@ kmer_t ReverseComplement(kmer_t kMer, int k) {
return (((kmer_t)word_reverse_complement(kMer)) >> (KMER_T_SIZE - (k << kmer_t(1)))) & ((kmer_t(1) << (k << kmer_t (1))) - kmer_t(1));
}

/// Return the complementary nucleotide for the given one.
char ComplementaryNucleotide(const char nucleotide) {
if (nucleotide == 'A') return 'T';
else if (nucleotide == 'T') return 'A';
else if (nucleotide == 'G') return 'C';
else if (nucleotide == 'C') return 'G';
throw std::invalid_argument("cannot find complementary nucleotide for letter " + std::string(1, nucleotide));
}

/// Compute the reverse complement of the given k-mer.
KMer ReverseComplement(const KMer &kMer) {
KMer ans;
ans.value.reserve(kMer.length());
for (int i = (int)kMer.length() - 1; i >= 0; --i) {
ans.value += ComplementaryNucleotide(kMer.value[i]);
}
return ans;
}

const char letters[4] {'A', 'C', 'G', 'T'};

/// Get the uppercase version of the character.
inline char UpperToLower(char c) {
return c - 'A' + 'a';
}

/// Return the index-th nucleotide from the encoded k-mer.
inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
Expand Down
1 change: 0 additions & 1 deletion src/local.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include <algorithm>
#include <fstream>

#include "models.h"
#include "kmers.h"
#include "khash_utils.h"

Expand Down
8 changes: 4 additions & 4 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@

#include "unistd.h"
#include "version.h"
#include "global_ac.h"
#include "ac/global_ac.h"
#include "global.h"
#include "local.h"
#include "local_ac.h"
#include "ac/local_ac.h"
#include "parser.h"
#include "streaming.h"
#include "output.h"
#include "ac/parser_ac.h"
#include "ac/streaming.h"
#include "khash_utils.h"
#include "masks.h"

Expand Down
8 changes: 0 additions & 8 deletions src/models.h

This file was deleted.

11 changes: 0 additions & 11 deletions src/output.h

This file was deleted.

79 changes: 6 additions & 73 deletions src/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,82 +9,9 @@
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)

#include "models.h"
#include "kmers.h"
#include "khash_utils.h"

/// Record of one fasta sequence.
struct FastaRecord {
// The name of the record. Starts with '>'.
std::string name;
// The genetic sequence.
std::string sequence;
};

/// Read fasta file with given path.
std::vector<FastaRecord> ReadFasta(std::string &path) {
gzFile fp;
kseq_t *seq;
std::vector<FastaRecord> records;

fp = gzopen(path.c_str(), "r");
if (fp == nullptr) {
throw std::invalid_argument("couldn't open file " + path);
}
seq = kseq_init(fp);
while (kseq_read(seq) >= 0) {
records.push_back(FastaRecord{
seq->name.s,
seq->seq.s,
});
}
kseq_destroy(seq);
gzclose(fp);
return records;
}

/// Create a list of unique k-mers in no particular order.
/// This runs in O(k*data.size) expected time.
void AddKMersFromSequence(std::unordered_set<std::string> &kMers, std::string &data, int k) {
// Convert the sequence to uppercase letters.
std::transform(data.begin(), data.end(), data.begin(), toupper);
size_t possibleKMerEnd = k;
for (size_t i = 1; i <= data.size(); ++i) {
if (data[i-1] != 'A' && data[i-1] != 'C' && data[i-1] != 'G' && data[i-1] != 'T') {
// Skip this and the next k-1 k-mers.
possibleKMerEnd = i + k;
}
if (i >= possibleKMerEnd) {
kMers.insert(data.substr(i - k, k));
}
}
}

/// Return only the subset of k-mers where no two k-mers are complements of one another.
/// The k-mers are chosen with no particular property.
std::unordered_set<std::string> FilterKMersWithComplement(std::unordered_set<std::string> &kMers) {
std::unordered_set<std::string> ret;
for (auto &&x : kMers) {
auto kMer = KMer{x};
if (ret.count(ReverseComplement(kMer).value) == 0) ret.insert(x);
}
return ret;
}

/// Create a list of unique k-mers in no particular order.
/// This runs in O(k*data.size) expected time.
std::vector<KMer> ConstructKMers(std::vector<FastaRecord> &data, int k, bool complements) {
std::unordered_set<std::string> uniqueKMers;
for (auto &&record : data) {
AddKMersFromSequence(uniqueKMers, record.sequence, k);
}
if (complements) uniqueKMers = FilterKMersWithComplement(uniqueKMers);
std::vector<KMer> result;
for (auto &kMer : uniqueKMers) {
result.push_back(KMer{kMer});
}
return result;
}

/// Fill the k-mer dictionary with k-mers from the given sequence.
/// If complements is true, always add the canonical k-mers.
Expand Down Expand Up @@ -161,3 +88,9 @@ void AssertEOF(kseq_t *seq, std::string message) {
throw std::invalid_argument(message);
}
}

/// Print the fasta file header.
void WriteName(const int k, std::ostream &of) {
of << ">superstring ";
of << "k=" << k << std::endl;
}
2 changes: 1 addition & 1 deletion tests/ac_automaton_unittest.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#pragma once
#include "../src/ac_automaton.h"
#include "../src/ac/ac_automaton.h"

#include "gtest/gtest.h"

Expand Down
2 changes: 1 addition & 1 deletion tests/global_ac_unittest.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#pragma once
#include "../src/global_ac.h"
#include "../src/ac/global_ac.h"

#include "gtest/gtest.h"

Expand Down
1 change: 1 addition & 0 deletions tests/kmers_unittest.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include "../src/kmers.h"
#include "../src/ac/kmers_ac.h"

#include "gtest/gtest.h"

Expand Down
Loading

0 comments on commit b067518

Please sign in to comment.