From 7fc3556ed639c33508218495fa13d969f8ed75f2 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Wed, 18 Sep 2024 12:51:17 -0700 Subject: [PATCH 1/2] src/StreamReader.hpp and src/StreamReader.cpp: added a bitset with one entry per adapter sequence to act as an indicator of whether that adapter has already been found in the current read. This is needed because some adapters are self-overlapping and we want to avoid counting the same one multiple times in a read. The adapters_found bitset is reset with each new read in the StreamReader::read_sequence_line base class function. --- src/StreamReader.cpp | 4 +++- src/StreamReader.hpp | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/StreamReader.cpp b/src/StreamReader.cpp index 92387fb..d3a65c2 100644 --- a/src/StreamReader.cpp +++ b/src/StreamReader.cpp @@ -344,9 +344,10 @@ StreamReader::process_sequence_base_from_buffer(FastqStats &stats) { if (do_adapter_optimized && (num_bases_after_n == adapter_size)) { cur_kmer &= adapter_mask; for (size_t i = 0; i != num_adapters; ++i) { - if (cur_kmer == adapters[i]) { + if (cur_kmer == adapters[i] && !adapters_found[i]) { ++stats.pos_adapter_count[ (read_pos << Constants::bit_shift_adapter) | i]; + adapters_found[i] = true; } } } @@ -436,6 +437,7 @@ StreamReader::read_sequence_line(FastqStats &stats) { still_in_buffer = true; next_truncation = 100; do_kmer_read = (stats.num_reads == next_kmer_read); + adapters_found.reset(); if (do_adapters_slow) { const string seq_line_str = cur_char; diff --git a/src/StreamReader.hpp b/src/StreamReader.hpp index c7c47ca..5282ef4 100644 --- a/src/StreamReader.hpp +++ b/src/StreamReader.hpp @@ -18,6 +18,7 @@ #include #include +#include // Optional zlib usage #include @@ -81,6 +82,7 @@ class StreamReader{ const size_t adapter_size; const size_t adapter_mask; const std::array adapters; + std::bitset adapters_found{}; const std::string filename; @@ -250,7 +252,7 @@ class BamReader : public StreamReader { bool read_entry(FastqStats &stats, size_t 
&num_bytes_read); // Specially made for BamReader to work directly with bam1_t - inline void read_sequence_line(FastqStats &stats); + inline void read_sequence_line(FastqStats &stats); inline void read_quality_line(FastqStats &stats); // parse quality inline void put_base_in_buffer(const size_t pos); // puts base in buffer or leftover ~BamReader(); From 165486d5a72ddc306d0455eb22915e44e669256f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Wed, 18 Sep 2024 14:09:43 -0700 Subject: [PATCH 2/2] test/md5sum.txt: updating the md5sum.txt file with hashes for files after fixing the multiple matching adapters --- test/md5sum.txt | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/test/md5sum.txt b/test/md5sum.txt index 15b319f..f9d5b13 100644 --- a/test/md5sum.txt +++ b/test/md5sum.txt @@ -1,30 +1,30 @@ -9641ae08f984bde102d4292bcad56484 test_output/SRR1853178_1/fastqc_data.txt +5c4e6118d438b1f01cf620120f26c622 test_output/SRR1853178_1/fastqc_data.txt 36df1dcab539ba4ef885239fc8524636 test_output/SRR1853178_1/summary.txt -c5b1d892705ee40353a5b88df5fc3a74 test_output/SRR3897196_2/fastqc_data.txt +8dfecfc49e5b450152be12c63177054f test_output/SRR3897196_2/fastqc_data.txt 80cd130958bcb2219f1e5a07d06a3b6e test_output/SRR3897196_2/summary.txt -d60663e26511968865b3ed92a864fda8 test_output/SRR9624732_1/fastqc_data.txt +5df2ae98f4389616df1fa90fe46ce463 test_output/SRR9624732_1/fastqc_data.txt a525d455b17eb6ba84cd2a60d281a2b0 test_output/SRR9624732_1/summary.txt -053bb7a28541ac110116086ac6ea0cac test_output/SRR10143153_2/fastqc_data.txt +6e5eaadf209c3f77ab61ed61a034f49e test_output/SRR10143153_2/fastqc_data.txt 19f1811f324e4c44154f659bb6e22806 test_output/SRR10143153_2/summary.txt -71e3964a9a610ebcb3207ea2a391b4be test_output/SRR1772703.lite.1/fastqc_data.txt -6403659cb0295ec05db9891e3e5e4f76 test_output/SRR1772703.lite.1/summary.txt -8d28c215efc4f2930a002c240fb14448 test_output/SRR9624732_2/fastqc_data.txt 
+427099afa91d877f078f1e7989fe4c39 test_output/SRR1772703.lite.1/fastqc_data.txt +ad5727295e7c8de6eb6874837bf1518f test_output/SRR1772703.lite.1/summary.txt +04f7bb98120971c0ba1d648fda893a7c test_output/SRR9624732_2/fastqc_data.txt fefc5d746f853c14b5e00421ad1ec260 test_output/SRR9624732_2/summary.txt -3942d27ab09f2db7f58a2ed4c904ac8a test_output/SRR10124060/fastqc_data.txt -1e228b1bb498eec2ca81ec2cc657a02d test_output/SRR10124060/summary.txt -7c5f40be6a37ac4d99c39400b1f7ddb6 test_output/SRR891268_2/fastqc_data.txt +a7827fe2115ef6ac48cd61f2065e58e5 test_output/SRR10124060/fastqc_data.txt +776f7d1b53bbed8683de9ca1d2529f1e test_output/SRR10124060/summary.txt +7c05da833c8806ea8d5093ddab337f1c test_output/SRR891268_2/fastqc_data.txt 20a8e50baace4c672622793874a3d7de test_output/SRR891268_2/summary.txt -e02e6043667560aacb39b4e956edc146 test_output/SRR9878537.lite.1/fastqc_data.txt +9bae57f4fa64d9fca4b11f0e0c107327 test_output/SRR9878537.lite.1/fastqc_data.txt e5c40997d4993c07e164ee5598c39cf9 test_output/SRR9878537.lite.1/summary.txt -7807c0aeca3bf4fdb18309ce993f6e35 test_output/SRR891268_1/fastqc_data.txt +6778518c16860c4300ed575d1dfdd43e test_output/SRR891268_1/fastqc_data.txt 69e7d0c53cd2e67117637c408b65333a test_output/SRR891268_1/summary.txt -54a383a9dd615f4130cbdab202829173 test_output/SRR6059706.lite.1/fastqc_data.txt -a5545633e81fc57fd03ff6ff7ba4fb8f test_output/SRR6059706.lite.1/summary.txt -21b11f43971f424267f6e32a880aaea9 test_output/SRR6387347/fastqc_data.txt +f05047b2a21949570ca2ae45b73b8da0 test_output/SRR6059706.lite.1/fastqc_data.txt +e348e4bcc7fc6f05e989ac7858d2b287 test_output/SRR6059706.lite.1/summary.txt +db82cb634e7e9b2f30472a069ba082b0 test_output/SRR6387347/fastqc_data.txt a61f65047e76f93300967cf399d044de test_output/SRR6387347/summary.txt -65519f6bcfd726b3219d7f646ad1d3b5 test_output/SRR3897196_1/fastqc_data.txt +905dcc1d5d135e18df379cc1edf778b6 test_output/SRR3897196_1/fastqc_data.txt b736ee95d5c450ef5c0dda31957b6818 
test_output/SRR3897196_1/summary.txt -109cf880b376fcfba44b9637dcc045a6 test_output/SRR10143153_1/fastqc_data.txt +f7eb7940ad9836764b695d2abb0426da test_output/SRR10143153_1/fastqc_data.txt 9ad191925d47a57d4f8b12f21ba0a7c3 test_output/SRR10143153_1/summary.txt -dfebd8cc85b17f954fb14287a2dc8169 test_output/SRR1853178_2/fastqc_data.txt -7645f17be12d149347dab6033aabdeb3 test_output/SRR1853178_2/summary.txt +508cdda67a1ea7ddc756aa3656d1079b test_output/SRR1853178_2/fastqc_data.txt +c331d0f7a6aa9d72be41ac531f9ba269 test_output/SRR1853178_2/summary.txt