Skip to content

Commit

Permalink
Merge pull request #60 from OndrejSladky/optimize-masks
Browse files Browse the repository at this point in the history
KmerCamel can optimize masks.
  • Loading branch information
OndrejSladky authored Dec 31, 2023
2 parents 900903f + e9e5c02 commit bc641bb
Show file tree
Hide file tree
Showing 9 changed files with 216 additions and 12 deletions.
5 changes: 5 additions & 0 deletions src/kmers.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ KMer ReverseComplement(const KMer &kMer) {

const char letters[4] {'A', 'C', 'G', 'T'};

/// Get the uppercase version of the character.
inline char UpperToLower(char c) {
return c - 'A' + 'a';
}

/// Return the index-th nucleotide from the encoded k-mer.
inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
Expand Down
28 changes: 28 additions & 0 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "streaming.h"
#include "output.h"
#include "khash_utils.h"
#include "masks.h"

#include <iostream>
#include <string>
Expand Down Expand Up @@ -36,6 +37,16 @@ void Help() {
std::cerr << " -v - print version" << std::endl;
std::cerr << "Example usage: ./kmercamel -p path_to_fasta -k 13 -d 5 -a local" << std::endl;
std::cerr << "Possible algorithms: global globalAC local localAC streaming" << std::endl;
std::cerr << std::endl;
std::cerr << "For optimization of masks use `kmercamel optimize`." << std::endl;
std::cerr << "Accepted arguments:" << std::endl;
std::cerr << " -p path_to_fasta - required; valid path to fasta file" << std::endl;
std::cerr << " -k k_value - required; integer value for k" << std::endl;
std::cerr << " -a algorithm - the algorithm to be run [global (default), globalAC, local, localAC, streaming]" << std::endl;
std::cerr << " -o output_path - if not specified, the output is printed to stdout" << std::endl;
std::cerr << " -c - treat k-mer and its reverse complement as equal" << std::endl;
std::cerr << " -h - print help" << std::endl;
std::cerr << " -v - print version" << std::endl;
}

void Version() {
Expand All @@ -48,7 +59,14 @@ int main(int argc, char **argv) {
int d_max = 5;
std::ofstream output;
std::ostream *of = &std::cout;
bool masks = false;
std::string algorithm = "global";
if (argc > 1 && std::string(argv[1]) == "optimize") {
masks = true;
argv++;
argc--;
algorithm = "ones";
}
bool complements = false;
bool optimize_memory = true;
bool d_set = false;
Expand Down Expand Up @@ -126,6 +144,16 @@ int main(int argc, char **argv) {
std::cerr << "Memory optimization turn-off only supported for hash table global." << std::endl;
Help();
return 1;
} else if (masks && (d_set || !optimize_memory)) {
std::cerr << "Not supported flags for optimize." << std::endl;
Help();
return 1;
}

if (masks) {
int ret = Optimize(algorithm, path, *of, k, complements);
if (ret) Help();
return ret;
}

// Handle streaming algorithm separately.
Expand Down
95 changes: 95 additions & 0 deletions src/masks.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#pragma once

#include <string>
#include <iostream>

#include "parser.h"
#include "khash_utils.h"
#include "kmers.h"

/// For the given masked superstring output the same superstring with mask with minimal/maximal number of ones.
void OptimizeOnes(std::ifstream &in, std::ostream &of, kh_S64_t *kMers, int k, bool complements, bool minimize) {
char c;
int beforeKMerEnd = k;
kmer_t currentKMer = 0;
kmer_t mask = (((kmer_t) 1) << (2 * k) ) - 1;
bool readingHeader = false;
while (in >> std::noskipws >> c) {
// Start of a header.
if (c == '>') {
readingHeader = true;
currentKMer &= mask >> 1;
for (char ch : NumberToKMer(currentKMer, k - beforeKMerEnd)) of << UpperToLower(ch);
currentKMer = 0;
beforeKMerEnd = k;
}
// Reprint the header.
if (readingHeader) of << c;
if (c == '\n') readingHeader = false;
if (readingHeader) continue;
auto data = NucleotideToInt(c);
// Disregard white space.
if (c == '\n' || c == '\r' || c == ' ') continue;
// Break at an unknown character.
if (data == -1) {
currentKMer &= mask >> 1;
for (char ch : NumberToKMer(currentKMer, k - beforeKMerEnd)) of << UpperToLower(ch);
currentKMer = 0;
beforeKMerEnd = k;
continue;
}
currentKMer <<= 2;
currentKMer &= mask;
currentKMer |= data;
--beforeKMerEnd;
kmer_t reverseComplement = ReverseComplement(currentKMer, k);
if (beforeKMerEnd == 0) {
++beforeKMerEnd;
char to_print = NucleotideAtIndex(currentKMer, k, 0);
auto key = kh_get_S64(kMers, currentKMer);
if (key != kh_end(kMers)) {
if (minimize) {
kh_del_S64(kMers, key);
}
} else if (complements && (key = kh_get_S64(kMers, reverseComplement)) != kh_end(kMers)) {
if (minimize) {
kh_del_S64(kMers, key);
}
} else {
to_print = UpperToLower(to_print);
}
of << to_print;
}
}
currentKMer &= mask >> 1;
for (char ch : NumberToKMer(currentKMer, k - beforeKMerEnd)) of << UpperToLower(ch);
of << "\n";
}

int Optimize(std::string &algorithm, std::string path, std::ostream &of, int k, bool complements) {
kh_S64_t *kMers = kh_init_S64();
ReadKMers(kMers, path, k, complements, true);

std::ifstream in(path);
if (in.is_open()) {
if (algorithm == "ones") {
OptimizeOnes(in, of, kMers, k, complements, false);
} else if (algorithm == "zeros") {
OptimizeOnes(in, of, kMers, k, complements, true);
} else {
if (algorithm == "runs") {
std::cerr
<< "Minimization of the number of runs of ones is not supported. Please use the Python script from supplement."
<< std::endl;
} else {
std::cerr << "Algorithm '" + algorithm + "' not recognized." << std::endl;
}
in.close();
return 1;
}
in.close();
} else {
throw std::invalid_argument("couldn't open file " + path);
}
return 0;
}
11 changes: 7 additions & 4 deletions src/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,13 @@ std::vector<KMer> ConstructKMers(std::vector<FastaRecord> &data, int k, bool com
/// Return unique k-mers in no particular order.
/// If complements is set to true, the result contains only one of the complementary k-mers - it is not guaranteed which one.
/// This runs in O(sequence length) expected time.
void ReadKMers(kh_S64_t *kMers, std::string &path, int k, bool complements) {
void ReadKMers(kh_S64_t *kMers, std::string &path, int k, bool complements, bool case_sensitive = false) {
std::ifstream fasta(path);
std::string sequence;
std::string line;
if (fasta.is_open()) {
char c;
int beforeKMerEnd = k;
kmer_t currentKMer = 0;
kmer_t cases = 0;
kmer_t mask = (((kmer_t) 1) << (2 * k) ) - 1;
bool readingHeader = false;
while (fasta >> std::noskipws >> c) {
Expand All @@ -119,12 +118,16 @@ void ReadKMers(kh_S64_t *kMers, std::string &path, int k, bool complements) {
continue;
}
currentKMer <<= 2;
// K-mer is present if it is upper case or case-insensitive.
cases |= !case_sensitive || c <= 'Z';
cases <<= 1;
currentKMer &= mask;
currentKMer |= data;
if(beforeKMerEnd > 0) --beforeKMerEnd;
if (beforeKMerEnd == 0 && (!complements || kh_get_S64(kMers, ReverseComplement(currentKMer, k)) == kh_end(kMers))) {
int ret;
kh_put_S64(kMers, currentKMer, &ret);
// If the k-mer was masked as present.
if (cases & (1 << k)) kh_put_S64(kMers, currentKMer, &ret);
}
}
fasta.close();
Expand Down
68 changes: 68 additions & 0 deletions tests/masks_unittest.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#pragma once
#include "../src/masks.h"

#include <algorithm>
#include <vector>
#include <string>
#include <filesystem>

#include "gtest/gtest.h"

namespace {

// Retrieving current path on Windows does not work as on linux
// therefore the following unittest is linux-specific.
#ifdef __unix__

TEST(Masks, OptimizeOnes) {
std::string path = std::filesystem::current_path();
path += "/tests/testdata/masktest.fa";

struct TestCase {
std::vector<kmer_t> kMers;
int k;
bool complements;
bool minimize;
std::string wantResult;
};
std::vector<TestCase> tests = {
{
{KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
3, false, true,
"> superstring\nACgTaacgt\n"
},
{
{KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
3, false, false,
"> superstring\nACgTaACgt\n"
},
{
{KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
3, true, true,
"> superstring\nAcgTaacgt\n"
},
{
{KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
3, true, false,
"> superstring\nACgTaACgt\n"
},
};

for (auto &t : tests) {
std::stringstream of;
std::ifstream in(path);
if (!in.is_open()) {
throw std::invalid_argument("couldn't open file " + path);
}
auto kMersDict = kh_init_S64();
int ret;
for (auto &kMer : t.kMers) kh_put_S64(kMersDict, kMer, &ret);

OptimizeOnes(in, of, kMersDict, t.k, t.complements, t.minimize);
in.close();

EXPECT_EQ(t.wantResult, of.str());
}
}
#endif
}
16 changes: 9 additions & 7 deletions tests/parser_unittest.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace {
std::vector<FastaRecord> wantResult = {
FastaRecord{"1", "ACCCGAAC"},
FastaRecord{"2", "CGTANATGC"},
FastaRecord{"3", "ACCCGTTTAACG"},
FastaRecord{"3", "AcCCGTTTAACG"},
FastaRecord{"4", "A"},
};

Expand All @@ -38,20 +38,22 @@ namespace {
struct TestCase {
int k;
bool complements;
bool case_sensitive;
size_t wantResultSize;
};
std::vector<TestCase> tests = {
{10, true, 3},
{5, true, 11},
{5, false, 11},
{2, true, 9},
{2, false, 11},
{10, true, false,3},
{10, true, true,2},
{5, true, false, 11},
{5, false, false, 11},
{2, true, false, 9},
{2, false, false, 11},
};

for (auto &t: tests) {
auto kMers = kh_init_S64();

ReadKMers(kMers, path, t.k, t.complements);
ReadKMers(kMers, path, t.k, t.complements, t.case_sensitive);

EXPECT_EQ(t.wantResultSize, kh_size(kMers));
}
Expand Down
2 changes: 2 additions & 0 deletions tests/testdata/masktest.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
> superstring
ACGTAACGT
2 changes: 1 addition & 1 deletion tests/testdata/test.fa
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ AC
>2
CGTANATGC
>3 with description
ACC
AcC
CGT
TTA
ACG
Expand Down
1 change: 1 addition & 0 deletions tests/unittest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "local_ac_unittest.h"
#include "streaming_unittest.h"
#include "ac_automaton_unittest.h"
#include "masks_unittest.h"

#include "gtest/gtest.h"

Expand Down

0 comments on commit bc641bb

Please sign in to comment.