Skip to content

Commit

Permalink
linear base groups and exact adapter counting, consistent with grep b…
Browse files Browse the repository at this point in the history
…ut not with fastqc, investigating why.
  • Loading branch information
guilhermesena1 committed Nov 26, 2019
1 parent d19d3ee commit 7cd0cd3
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 32 deletions.
4 changes: 4 additions & 0 deletions src/FalcoConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/

#include "FalcoConfig.hpp"
#include "FastqStats.hpp"

#include <fstream>
#include <sstream>
Expand Down Expand Up @@ -249,6 +250,9 @@ FalcoConfig::read_adapters() {
// the biological sequence
while (getline(in, line)) {
if (is_content_line(line)) {
if (adapter_names.size() > FastqStats::max_adapters)
throw runtime_error("You are testing too many adapters. The maximum "
"number is 128!");
adapter_name = "";
adapter_seq = "";
istringstream iss(line);
Expand Down
1 change: 1 addition & 0 deletions src/FastqStats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ FastqStats::FastqStats() {
gc_count.fill(0);
position_quality_count.fill(0);
pos_kmer_count.fill(0);
pos_adapter_count.fill(0);
kmer_count = vector<size_t>(kNumBases*(kmer_mask + 1), 0);
}

Expand Down
29 changes: 25 additions & 4 deletions src/FastqStats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,17 +141,35 @@ struct FastqStats {
// Prefix size to cut if read length exceeds the value above
static const size_t kDupReadTruncateSize = 50;

// Bit shifts as instructions for the std::arrays
static const size_t kBitShiftNucleotide = log2exact(kNumNucleotides);
static const size_t kBitShiftQuality = log2exact(kNumQualityValues);

/************ KMER CONSTANTS **********/
// Kmer size given as input
static const size_t kmer_size = 7;

// Bit shifts as instructions for the std::arrays
static const size_t kBitShiftNucleotide = log2exact(kNumNucleotides);
static const size_t kBitShiftQuality = log2exact(kNumQualityValues);
static const size_t kBitShiftKmer = 2 * kmer_size; // two bits per base
// we shift 14 bits when reading a kmer, two bits per base
static const size_t kBitShiftKmer = 2 * kmer_size;

// mask to get only the first 2*k bits of the sliding window
static const size_t kmer_mask = (1ll << (2*kmer_size)) - 1;

/************ ADAPTER CONSTANTS **********/
// Kmer size given as input
static const size_t adapter_size = 12;

// Maximum number of adapters
static const size_t max_adapters = 128;

// bit shift for adapters, log(100) = 7
static const size_t kBitShiftAdapter = log2exact(max_adapters);

// mask to get only the first 2*k bits of the sliding window
static const size_t adapter_mask = (1ll << (2*adapter_size)) - 1;



public:
/*********** SINGLE NUMBERS FROM THE ENTIRE FASTQ ****************/
// Number of unique sequences seen thus far
Expand Down Expand Up @@ -215,6 +233,9 @@ struct FastqStats {
// How many kmers were counted in each position
std::array<size_t, kNumBases> pos_kmer_count;

// How many adapters were counted in each position
std::array<size_t, kNumBases> pos_adapter_count;

/*********** DUPLICATION ******************/
// First 100k unique sequences and how often they were seen
std::unordered_map <std::string, size_t> sequence_count;
Expand Down
110 changes: 94 additions & 16 deletions src/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,22 @@ toupper(const string &s) {


/*****************************************************************************/
/******************* IMPLEMENTATION OF FASTQC FUNCTIONS **********************/
/******************* BASEGROUPS COPIED FROM FASTQC ***************************/
/*****************************************************************************/

/************* NO BASE GROUP ****************/
void
make_base_groups(vector<BaseGroup> &base_groups,
make_default_base_groups(vector<BaseGroup> &base_groups,
const size_t &num_bases) {
base_groups.clear();
for (size_t i = 0; i < num_bases; ++i)
base_groups.push_back(BaseGroup(i,i));
}


/************* EXP BASE GROUP **************/
void
make_exponential_base_groups(vector<BaseGroup> &base_groups,
const size_t &num_bases) {
size_t starting_base = 0,
end_base,
Expand All @@ -81,12 +93,73 @@ make_base_groups(vector<BaseGroup> &base_groups,
}
}


/************* LINEAR BASE GROUP *************/
// aux function to get linear interval
size_t
get_linear_interval(const size_t &num_bases) {
// The the first 9bp as individual residues since odd stuff
// can happen there, then we find a grouping value which gives
// us a total set of groups below 75. We limit the intervals
// we try to sensible whole numbers.
vector<size_t> baseValues = {2,5,10};
size_t multiplier = 1;
while (true) {
for (size_t b = 0; b<baseValues.size(); b++) {
size_t interval = baseValues[b] * multiplier;
size_t group_count = 9 + ((num_bases-9)/interval);

if ((num_bases-9) % interval != 0)
group_count++;

if (group_count < 75)
return interval;
}
multiplier *= 10;
if (multiplier == 10000000)
throw runtime_error("Couldn't find a sensible interval grouping for len ="
+ std::to_string(num_bases));
}
}


void
make_default_base_groups(vector<BaseGroup> &base_groups,
const size_t &num_bases) {
base_groups.clear();
for (size_t i = 0; i < num_bases; ++i)
base_groups.push_back(BaseGroup(i,i));
make_linear_base_groups(vector<BaseGroup> &base_groups,
const size_t num_bases) {

// For lengths below 75bp we just return everything.
if (num_bases <= 75) {
make_default_base_groups(base_groups, num_bases);
return;
}

// We need to work out what interval we're going to use.
const size_t interval = get_linear_interval(num_bases);
size_t starting_base = 1;

while (starting_base <= num_bases) {
size_t end_base = starting_base + interval - 1;

if (starting_base < 10)
end_base = starting_base;

if (starting_base == 10 && interval > 10)
end_base = interval - 1;

if (end_base > num_bases)
end_base = num_bases;

BaseGroup bg = BaseGroup(starting_base - 1, end_base - 1);
base_groups.push_back(bg);

if (starting_base < 10)
starting_base++;
else if (starting_base == 10 && interval > 10)
starting_base = interval;
else
starting_base += interval;

}
}

// FastQC extrapolation of counts to the full file size
Expand Down Expand Up @@ -473,7 +546,10 @@ ModulePerBaseSequenceQuality::summarize_module(const FastqStats &stats) {
num_bases = stats.max_read_length;

// first, do the groups
if (do_group) make_base_groups(base_groups, num_bases);
if (do_group) {
make_linear_base_groups(base_groups, num_bases);
}

else make_default_base_groups(base_groups, num_bases);
num_groups = base_groups.size();

Expand Down Expand Up @@ -1644,15 +1720,15 @@ ModuleAdapterContent::count_adapter(
for (size_t i = 0; i < num_slides; ++i) {
cur_pos = pos + (kmer_size - 1) + (num_slides - i - 1);
adapter_kmer = adapter_slide & kmer_mask;

// debug
kmer_count_index = (cur_pos << bit_shift_kmer) | adapter_kmer;
cnt = kmer_count[kmer_count_index];
if (adapter_count == 0)
adapter_count = cnt;
else if (cnt < adapter_count)
adapter_count = cnt;

// slide the adapter hash by 2 bits
adapter_slide >>= 2;
}
Expand All @@ -1675,12 +1751,11 @@ ModuleAdapterContent::summarize_module(const FastqStats &stats) {
for (size_t j = 0; j < num_bases; ++j) {
cnt = 0;
// check if position even makes sense
if (j + adapter_seqs[i].size() < num_bases)
cnt = count_adapter(stats.kmer_count,
j,
adapter_hashes[i],
adapter_seqs[i].size(),
stats.kmer_size);
if (j + adapter_seqs[i].size() - 1 < num_bases)
cnt = stats.pos_adapter_count[
((j + adapter_seqs[i].size() - 1) << stats.kBitShiftAdapter) | i
];

if (j == 0)
adapter_pos_pct[i][j] = cnt;
else
Expand Down Expand Up @@ -1862,3 +1937,6 @@ string
ModuleKmerContent::make_html_data() {
return "";
}



53 changes: 41 additions & 12 deletions src/StreamReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,41 @@
*/

#include "StreamReader.hpp"

#include <vector>
#include <cstring>

using std::string;
using std::vector;
using std::runtime_error;

using std::array;
/****************************************************/
/***************** STREAMREADER *********************/
/****************************************************/

// function to turn a vector into array for adapter hashes and fast lookup
array<size_t, FastqStats::max_adapters>
make_adapters(const vector<size_t> &adapter_hashes) {
if (adapter_hashes.size() > FastqStats::max_adapters)
throw runtime_error("Number of adapters is larger than 128, which hinders "
"visualziation and speed of falco. Please keep it to "
"under 128");

array<size_t, FastqStats::max_adapters> ans;
for (size_t i = 0; i < adapter_hashes.size(); ++i)
ans[i] = adapter_hashes[i];

return ans;
}

StreamReader::StreamReader(FalcoConfig &config,
const size_t _buffer_size,
const char _field_separator,
const char _line_separator) :
// I have to pass the config skips as const to read them fast
do_sequence_hash(config.do_duplication || config.do_overrepresented),
do_kmer(config.do_adapter || config.do_kmer),
do_kmer(config.do_kmer),
do_adapter(config.do_adapter),
do_sliding_window(do_adapter || do_kmer),
do_n_content(config.do_n_content),
do_quality_base(config.do_quality_base),
do_sequence(config.do_sequence),
Expand All @@ -43,7 +60,12 @@ StreamReader::StreamReader(FalcoConfig &config,
// Here are the usual stream reader configs
buffer_size(_buffer_size),
field_separator(_field_separator),
line_separator(_line_separator) {
line_separator(_line_separator),

// Here are the const adapters
num_adapters(config.adapter_hashes.size()),
adapters(make_adapters(config.adapter_hashes))
{

// Allocates buffer to temporarily store reads
buffer = new char[buffer_size + 1];
Expand Down Expand Up @@ -226,22 +248,29 @@ StreamReader::process_sequence_base_from_buffer(FastqStats &stats) {
stats.base_count[
(read_pos << stats.kBitShiftNucleotide) | base_ind]++;

if (do_kmer) {
if (do_sliding_window) {
// Update k-mer sequence
cur_kmer = ((cur_kmer << stats.kBitShiftNucleotide) | base_ind);

// registers k-mer if seen at least k nucleotides since the last n
if (num_bases_after_n == stats.kmer_size) {
if (do_kmer && (num_bases_after_n == stats.kmer_size)) {

stats.kmer_count[(read_pos << stats.kBitShiftKmer)
| (cur_kmer & stats.kmer_mask)]++;
stats.pos_kmer_count[read_pos]++;
stats.kmer_count[(read_pos << stats.kBitShiftKmer)
| (cur_kmer & stats.kmer_mask)]++;
stats.pos_kmer_count[read_pos]++;
}

// Otherwise register a new non-n base
else {
num_bases_after_n++;
// GS: slow, need to use fsm
if (do_adapter && (num_bases_after_n == stats.adapter_size)) {
cur_kmer &= stats.adapter_mask;
for (i = 0; i < num_adapters; ++i) {
if (cur_kmer == adapters[i]) {
stats.pos_adapter_count[(read_pos << stats.kBitShiftAdapter) | i]++;
}
}
}

num_bases_after_n += (num_bases_after_n != stats.adapter_size);
}
}
}
Expand Down
6 changes: 6 additions & 0 deletions src/StreamReader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class StreamReader{
// config on how to handle reads
const bool do_sequence_hash,
do_kmer,
do_adapter,
do_n_content,
do_quality_base,
do_sequence,
Expand All @@ -53,6 +54,7 @@ class StreamReader{
do_tile,
do_sequence_length;

const bool do_sliding_window;
bool continue_storing_sequences;
bool do_kmer_read;
bool do_tile_read;
Expand Down Expand Up @@ -107,6 +109,7 @@ class StreamReader{
size_t cur_quality; // Sum of quality values in read
size_t num_bases_after_n; // count of k-mers that reset at every N
size_t cur_kmer; // 32-mer hash as you pass through the sequence line
size_t i; // general iterator

// variables for gc model
GCModelValue value;
Expand All @@ -120,6 +123,9 @@ class StreamReader{
std::string sequence_to_hash; // sequence marked for duplication
std::string filename;

/************ ADAPTER SEARCH ***********/
const size_t num_adapters;
std::array<size_t, FastqStats::max_adapters> adapters;

/************ FUNCTIONS TO PROCESS READS AND BASES ***********/
// gets and puts bases from and to buffer
Expand Down

0 comments on commit 7cd0cd3

Please sign in to comment.