From 7cd0cd31aa0178dee7899fb10e2a5b83d793b963 Mon Sep 17 00:00:00 2001 From: guilhermesena1 Date: Mon, 25 Nov 2019 18:02:51 -0800 Subject: [PATCH] linear base groups and exact adapter counting, consistent with grep but not with fastqc, investigating why. --- src/FalcoConfig.cpp | 4 ++ src/FastqStats.cpp | 1 + src/FastqStats.hpp | 29 ++++++++++-- src/Module.cpp | 110 ++++++++++++++++++++++++++++++++++++------- src/StreamReader.cpp | 53 ++++++++++++++++----- src/StreamReader.hpp | 6 +++ 6 files changed, 171 insertions(+), 32 deletions(-) diff --git a/src/FalcoConfig.cpp b/src/FalcoConfig.cpp index 9d7d8cb..71e7a07 100644 --- a/src/FalcoConfig.cpp +++ b/src/FalcoConfig.cpp @@ -14,6 +14,7 @@ */ #include "FalcoConfig.hpp" +#include "FastqStats.hpp" #include #include @@ -249,6 +250,9 @@ FalcoConfig::read_adapters() { // the biological sequence while (getline(in, line)) { if (is_content_line(line)) { + if (adapter_names.size() > FastqStats::max_adapters) + throw runtime_error("You are testing too many adapters. The maximum " + "number is 128!"); adapter_name = ""; adapter_seq = ""; istringstream iss(line); diff --git a/src/FastqStats.cpp b/src/FastqStats.cpp index c9198db..2c29fd6 100644 --- a/src/FastqStats.cpp +++ b/src/FastqStats.cpp @@ -65,6 +65,7 @@ FastqStats::FastqStats() { gc_count.fill(0); position_quality_count.fill(0); pos_kmer_count.fill(0); + pos_adapter_count.fill(0); kmer_count = vector(kNumBases*(kmer_mask + 1), 0); } diff --git a/src/FastqStats.hpp b/src/FastqStats.hpp index 524a4a0..a179da2 100644 --- a/src/FastqStats.hpp +++ b/src/FastqStats.hpp @@ -141,17 +141,35 @@ struct FastqStats { // Prefix size to cut if read length exceeds the value above static const size_t kDupReadTruncateSize = 50; + // Bit shifts as instructions for the std::arrays + static const size_t kBitShiftNucleotide = log2exact(kNumNucleotides); + static const size_t kBitShiftQuality = log2exact(kNumQualityValues); + + /************ KMER CONSTANTS **********/ // Kmer size given as input static const size_t kmer_size = 7; - // Bit shifts as instructions for the std::arrays - static const size_t kBitShiftNucleotide = log2exact(kNumNucleotides); - static const size_t kBitShiftQuality = log2exact(kNumQualityValues); - static const size_t kBitShiftKmer = 2 * kmer_size; // two bits per base + // we shift 14 bits when reading a kmer, two bits per base + static const size_t kBitShiftKmer = 2 * kmer_size; // mask to get only the first 2*k bits of the sliding window static const size_t kmer_mask = (1ll << (2*kmer_size)) - 1; + /************ ADAPTER CONSTANTS **********/ + // Kmer size given as input + static const size_t adapter_size = 12; + + // Maximum number of adapters + static const size_t max_adapters = 128; + + // bit shift for adapters, log(100) = 7 + static const size_t kBitShiftAdapter = log2exact(max_adapters); + + // mask to get only the first 2*k bits of the sliding window + static const size_t adapter_mask = (1ll << (2*adapter_size)) - 1; + + + public: /*********** SINGLE NUMBERS FROM THE ENTIRE FASTQ ****************/ // Number of unique sequences seen thus far @@ -215,6 +233,9 @@ struct FastqStats { // How many kmers were counted in each position std::array pos_kmer_count; + // How many adapters were counted in each position + std::array pos_adapter_count; + /*********** DUPLICATION ******************/ // First 100k unique sequences and how often they were seen std::unordered_map sequence_count; diff --git a/src/Module.cpp b/src/Module.cpp index 0b39988..ea11908 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -51,10 +51,22 @@ toupper(const string &s) { /*****************************************************************************/ -/******************* IMPLEMENTATION OF FASTQC FUNCTIONS **********************/ +/******************* BASEGROUPS COPIED FROM FASTQC ***************************/ /*****************************************************************************/ + +/************* NO BASE GROUP ****************/ void -make_base_groups(vector &base_groups, +make_default_base_groups(vector &base_groups, + const size_t &num_bases) { + base_groups.clear(); + for (size_t i = 0; i < num_bases; ++i) + base_groups.push_back(BaseGroup(i,i)); +} + + +/************* EXP BASE GROUP **************/ +void +make_exponential_base_groups(vector &base_groups, const size_t &num_bases) { size_t starting_base = 0, end_base, @@ -81,12 +93,73 @@ make_base_groups(vector &base_groups, } } + +/************* LINEAR BASE GROUP *************/ +// aux function to get linear interval +size_t +get_linear_interval(const size_t &num_bases) { + // The the first 9bp as individual residues since odd stuff + // can happen there, then we find a grouping value which gives + // us a total set of groups below 75. We limit the intervals + // we try to sensible whole numbers. + vector baseValues = {2,5,10}; + size_t multiplier = 1; + while (true) { + for (size_t b = 0; b #include using std::string; using std::vector; using std::runtime_error; - +using std::array; /****************************************************/ /***************** STREAMREADER *********************/ /****************************************************/ + +// function to turn a vector into array for adapter hashes and fast lookup +array +make_adapters(const vector &adapter_hashes) { + if (adapter_hashes.size() > FastqStats::max_adapters) + throw runtime_error("Number of adapters is larger than 128, which hinders " + "visualziation and speed of falco. Please keep it to " + "under 128"); + + array ans; + for (size_t i = 0; i < adapter_hashes.size(); ++i) + ans[i] = adapter_hashes[i]; + + return ans; +} + StreamReader::StreamReader(FalcoConfig &config, const size_t _buffer_size, const char _field_separator, const char _line_separator) : // I have to pass the config skips as const to read them fast do_sequence_hash(config.do_duplication || config.do_overrepresented), - do_kmer(config.do_adapter || config.do_kmer), + do_kmer(config.do_kmer), + do_adapter(config.do_adapter), + do_sliding_window(do_adapter || do_kmer), do_n_content(config.do_n_content), do_quality_base(config.do_quality_base), do_sequence(config.do_sequence), @@ -43,7 +60,12 @@ StreamReader::StreamReader(FalcoConfig &config, // Here are the usual stream reader configs buffer_size(_buffer_size), field_separator(_field_separator), - line_separator(_line_separator) { + line_separator(_line_separator), + + // Here are the const adapters + num_adapters(config.adapter_hashes.size()), + adapters(make_adapters(config.adapter_hashes)) + { // Allocates buffer to temporarily store reads buffer = new char[buffer_size + 1]; @@ -226,22 +248,29 @@ StreamReader::process_sequence_base_from_buffer(FastqStats &stats) { stats.base_count[ (read_pos << stats.kBitShiftNucleotide) | base_ind]++; - if (do_kmer) { + if (do_sliding_window) { // Update k-mer sequence cur_kmer = ((cur_kmer << stats.kBitShiftNucleotide) | base_ind); // registers k-mer if seen at least k nucleotides since the last n - if (num_bases_after_n == stats.kmer_size) { + if (do_kmer && (num_bases_after_n == stats.kmer_size)) { - stats.kmer_count[(read_pos << stats.kBitShiftKmer) - | (cur_kmer & stats.kmer_mask)]++; - stats.pos_kmer_count[read_pos]++; + stats.kmer_count[(read_pos << stats.kBitShiftKmer) + | (cur_kmer & stats.kmer_mask)]++; + stats.pos_kmer_count[read_pos]++; } - // Otherwise register a new non-n base - else { - num_bases_after_n++; + // GS: slow, need to use fsm + if (do_adapter && (num_bases_after_n == stats.adapter_size)) { + cur_kmer &= stats.adapter_mask; + for (i = 0; i < num_adapters; ++i) { + if (cur_kmer == adapters[i]) { + stats.pos_adapter_count[(read_pos << stats.kBitShiftAdapter) | i]++; + } + } } + + num_bases_after_n += (num_bases_after_n != stats.adapter_size); } } } diff --git a/src/StreamReader.hpp b/src/StreamReader.hpp index 1e55406..e87d977 100644 --- a/src/StreamReader.hpp +++ b/src/StreamReader.hpp @@ -45,6 +45,7 @@ class StreamReader{ // config on how to handle reads const bool do_sequence_hash, do_kmer, + do_adapter, do_n_content, do_quality_base, do_sequence, @@ -53,6 +54,7 @@ class StreamReader{ do_tile, do_sequence_length; + const bool do_sliding_window; bool continue_storing_sequences; bool do_kmer_read; bool do_tile_read; @@ -107,6 +109,7 @@ class StreamReader{ size_t cur_quality; // Sum of quality values in read size_t num_bases_after_n; // count of k-mers that reset at every N size_t cur_kmer; // 32-mer hash as you pass through the sequence line + size_t i; // general iterator // variables for gc model GCModelValue value; @@ -120,6 +123,9 @@ class StreamReader{ std::string sequence_to_hash; // sequence marked for duplication std::string filename; + /************ ADAPTER SEARCH ***********/ + const size_t num_adapters; + std::array adapters; /************ FUNCTIONS TO PROCESS READS AND BASES ***********/ // gets and puts bases from and to buffer