From e1271c99ae30ed2ae37ae92bce1d4ae9309f24e0 Mon Sep 17 00:00:00 2001 From: guilhermesena1 Date: Tue, 26 Nov 2019 14:58:06 -0800 Subject: [PATCH] text output shows only num_bases - adapter_size bases --- src/FalcoConfig.cpp | 8 +++++ src/FalcoConfig.hpp | 1 + src/FastqStats.hpp | 7 ----- src/Module.cpp | 74 +++++++------------------------------------- src/Module.hpp | 3 ++ src/StreamReader.cpp | 12 ++++--- src/StreamReader.hpp | 4 ++- 7 files changed, 34 insertions(+), 75 deletions(-) diff --git a/src/FalcoConfig.cpp b/src/FalcoConfig.cpp index 71e7a07..b3c6589 100644 --- a/src/FalcoConfig.cpp +++ b/src/FalcoConfig.cpp @@ -248,6 +248,8 @@ FalcoConfig::read_adapters() { // The adapters file has a space separated name, and the last instance is // the biological sequence + + adapter_size = 0; while (getline(in, line)) { if (is_content_line(line)) { if (adapter_names.size() > FastqStats::max_adapters) @@ -284,6 +286,12 @@ FalcoConfig::read_adapters() { adapter_names.push_back(adapter_name); adapter_seqs.push_back(adapter_seq); adapter_hashes.push_back(adapter_hash); + + if (adapter_size == 0) + adapter_size = adapter_seq.size(); + else if (adapter_seq.size() != adapter_size) + throw runtime_error("adapters in config are not all of same size"); + line_by_space.clear(); } } diff --git a/src/FalcoConfig.hpp b/src/FalcoConfig.hpp index d967664..e8b8a69 100644 --- a/src/FalcoConfig.hpp +++ b/src/FalcoConfig.hpp @@ -93,6 +93,7 @@ struct FalcoConfig { // two-bit hash of the sequence above std::vector adapter_hashes; + size_t adapter_size; /************************************************************ ******* ADDITIONAL INFORMATION ABOUT THE SAMPLE ************ ************************************************************/ diff --git a/src/FastqStats.hpp b/src/FastqStats.hpp index a179da2..0fc23d9 100644 --- a/src/FastqStats.hpp +++ b/src/FastqStats.hpp @@ -156,19 +156,12 @@ struct FastqStats { static const size_t kmer_mask = (1ll << (2*kmer_size)) - 1; /************ ADAPTER CONSTANTS **********/ - // Kmer size given as input - static const size_t adapter_size = 12; - // Maximum number of adapters static const size_t max_adapters = 128; // bit shift for adapters, log(100) = 7 static const size_t kBitShiftAdapter = log2exact(max_adapters); - // mask to get only the first 2*k bits of the sliding window - static const size_t adapter_mask = (1ll << (2*adapter_size)) - 1; - - public: /*********** SINGLE NUMBERS FROM THE ENTIRE FASTQ ****************/ diff --git a/src/Module.cpp b/src/Module.cpp index ea11908..f1a328a 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -1680,60 +1680,8 @@ Module(ModuleAdapterContent::module_name) { auto grade_adapter = config.limits.find("adapter")->second; grade_warn = grade_adapter.find("warn")->second; grade_error = grade_adapter.find("error")->second; -} - -// Function to estimate number of adapter counts in a position, basically takes -// the minimum of all adapter kmers in adjacent positions to the "pos" paramter -// and return the minimum of them. -double -ModuleAdapterContent::count_adapter( - const vector &kmer_count, - const size_t pos, - const size_t adapter_hash, - const size_t adapter_size, - const size_t kmer_size) { - - // mask to just get the k bases from the adapter - size_t kmer_mask = (1ll << (2*kmer_size)) - 1; - - size_t bit_shift_kmer = 2*kmer_size; - - // minimum of all adapter kmers counted - size_t adapter_count = 0; - // temp variable to get the position in the flattened matrix of each kmer - size_t kmer_count_index; - - // number of sliding window slides we need to do - size_t num_slides = adapter_size - kmer_size + 1; - - // temp variable to get the sliding kmers in the adapter - size_t adapter_kmer; - - size_t adapter_slide = adapter_hash; - - // temp variable - size_t cur_pos; - - // temp variable to get the adapter count - size_t cnt; - for (size_t i = 0; i < num_slides; ++i) { - cur_pos = pos + (kmer_size - 1) + (num_slides - i - 1); - adapter_kmer = adapter_slide & kmer_mask; - - // debug - kmer_count_index = (cur_pos << bit_shift_kmer) | adapter_kmer; - cnt = kmer_count[kmer_count_index]; - if (adapter_count == 0) - adapter_count = cnt; - else if (cnt < adapter_count) - adapter_count = cnt; - - // slide the adapter hash by 2 bits - adapter_slide >>= 2; - } - - return static_cast(adapter_count); + adapter_size = config.adapter_size; } void @@ -1744,11 +1692,13 @@ ModuleAdapterContent::summarize_module(const FastqStats &stats) { num_bases = FastqStats::kNumBases; for (size_t i = 0; i < num_adapters; ++i) - adapter_pos_pct.push_back(vector(num_bases, 0.0)); + adapter_pos_pct.push_back( + vector(num_bases - adapter_size + 1, 0.0) + ); size_t cnt; - for (size_t i = 0; i < num_adapters; ++i) { - for (size_t j = 0; j < num_bases; ++j) { + for (size_t i = 0; i < adapter_pos_pct.size(); ++i) { + for (size_t j = 0; j < adapter_pos_pct[0].size(); ++j) { cnt = 0; // check if position even makes sense if (j + adapter_seqs[i].size() - 1 < num_bases) @@ -1764,8 +1714,8 @@ ModuleAdapterContent::summarize_module(const FastqStats &stats) { } // now convert the counts we got before to percentage - for (size_t i = 0; i < num_adapters; ++i) { - for (size_t j = 0; j < num_bases; ++j) { + for (size_t i = 0; i < adapter_pos_pct.size(); ++i) { + for (size_t j = 0; j < adapter_pos_pct[0].size(); ++j) { adapter_pos_pct[i][j] *= 100.0; adapter_pos_pct[i][j] /= static_cast(stats.num_reads); } @@ -1774,8 +1724,8 @@ ModuleAdapterContent::summarize_module(const FastqStats &stats) { void ModuleAdapterContent::make_grade() { - for (size_t i = 0; i < num_adapters; ++i) { - for (size_t j = 0; j < num_bases; ++j) { + for (size_t i = 0; i < adapter_pos_pct.size(); ++i) { + for (size_t j = 0; j < adapter_pos_pct[0].size(); ++j) { if (grade != "fail") { if (adapter_pos_pct[i][j] > grade_error) { grade = "fail"; @@ -1797,9 +1747,9 @@ ModuleAdapterContent::write_module(ostream &os) { os << "\n"; // matrix of percentages - for (size_t i = 0; i < num_bases; ++i) { + for (size_t i = 0; i < adapter_pos_pct[0].size(); ++i) { os << i + 1; - for (size_t j = 0; j < num_adapters; ++j) + for (size_t j = 0; j < adapter_pos_pct.size(); ++j) os << "\t" << adapter_pos_pct[j][i]; os << "\n"; } diff --git a/src/Module.hpp b/src/Module.hpp index 4ec9de2..f077027 100644 --- a/src/Module.hpp +++ b/src/Module.hpp @@ -316,6 +316,9 @@ class ModuleAdapterContent : public Module { // number of bases to report size_t num_bases; + // adapter size to know how many bases to report + size_t adapter_size; + // Information from config std::vector adapter_names; std::vector adapter_seqs; diff --git a/src/StreamReader.cpp b/src/StreamReader.cpp index 1a95f61..7d53ff3 100644 --- a/src/StreamReader.cpp +++ b/src/StreamReader.cpp @@ -64,7 +64,9 @@ StreamReader::StreamReader(FalcoConfig &config, // Here are the const adapters num_adapters(config.adapter_hashes.size()), - adapters(make_adapters(config.adapter_hashes)) + adapters(make_adapters(config.adapter_hashes)), + adapter_size(config.adapter_size), + adapter_mask((1ll << (2*adapter_size)) - 1) { // Allocates buffer to temporarily store reads @@ -261,16 +263,16 @@ StreamReader::process_sequence_base_from_buffer(FastqStats &stats) { } // GS: slow, need to use fsm - if (do_adapter && (num_bases_after_n == stats.adapter_size)) { - cur_kmer &= stats.adapter_mask; - for (i = 0; i < num_adapters; ++i) { + if (do_adapter && (num_bases_after_n == adapter_size)) { + cur_kmer &= adapter_mask; + for (i = 0; i != num_adapters; ++i) { if (cur_kmer == adapters[i]) { stats.pos_adapter_count[(read_pos << stats.kBitShiftAdapter) | i]++; } } } - num_bases_after_n += (num_bases_after_n != stats.adapter_size); + num_bases_after_n += (num_bases_after_n != adapter_size); } } } diff --git a/src/StreamReader.hpp b/src/StreamReader.hpp index e87d977..66e30f1 100644 --- a/src/StreamReader.hpp +++ b/src/StreamReader.hpp @@ -125,7 +125,9 @@ class StreamReader{ /************ ADAPTER SEARCH ***********/ const size_t num_adapters; - std::array adapters; + const size_t adapter_size; + const size_t adapter_mask; + const std::array adapters; /************ FUNCTIONS TO PROCESS READS AND BASES ***********/ // gets and puts bases from and to buffer