From 24fdef8c3fe2d3da305ee960cdc7c1ea6a128d59 Mon Sep 17 00:00:00 2001 From: Delaney Sullivan Date: Sun, 29 Sep 2024 21:04:19 -0700 Subject: [PATCH 1/4] added a check for bustools count when multiple ECs are listed for a UMI (b/c flag column) --- src/bustools_count.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bustools_count.cpp b/src/bustools_count.cpp index 85ce268..9cabe56 100644 --- a/src/bustools_count.cpp +++ b/src/bustools_count.cpp @@ -170,7 +170,7 @@ void bustools_count(Bustools_opt &opt) { // v[i..j-1] share the same UMI ecs.resize(0); for (size_t k = i; k < j; k++) { - ecs.push_back(v[k].ec); + if (k == i || v[k].ec != v[k-1].ec) ecs.push_back(v[k].ec); } if (opt.umi_gene_collapse) { @@ -295,7 +295,7 @@ void bustools_count(Bustools_opt &opt) { ecs.resize(0); uint32_t counts = 0; for (size_t k = i; k < j; k++) { - ecs.push_back(v[k].ec); + if (k == i || v[k].ec != v[k-1].ec) ecs.push_back(v[k].ec); counts += v[k].count; } From c3bf68644237724eb0fe9101095d6f1d80ab60eb Mon Sep 17 00:00:00 2001 From: Pall Melsted Date: Tue, 1 Oct 2024 16:17:11 -0700 Subject: [PATCH 2/4] adds --exclude to bustools extract --- src/Common.hpp | 4 ++ src/bustools_extract.cpp | 124 ++++++++++++++++++++++++++------------- src/bustools_main.cpp | 17 ++++++ 3 files changed, 104 insertions(+), 41 deletions(-) diff --git a/src/Common.hpp b/src/Common.hpp index 32d0060..d3430c5 100644 --- a/src/Common.hpp +++ b/src/Common.hpp @@ -113,6 +113,10 @@ struct Bustools_opt bool text_dumppad = false; bool text_showall = false; + /* extract */ + bool extract_exclude = false; + bool extract_include = true; + /* linker */ int start, end; diff --git a/src/bustools_extract.cpp b/src/bustools_extract.cpp index 12c6f3c..2f01be4 100644 --- a/src/bustools_extract.cpp +++ b/src/bustools_extract.cpp @@ -59,46 +59,11 @@ void bustools_extract(const Bustools_opt &opt) { std::vector seq(opt.nFastqs, nullptr); uint32_t iRead = 0; size_t iFastq = 0; - if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) { - std::cerr << "Error reading FASTQ " << opt.fastq[iFastq] << std::endl; - goto end_extract; - } + uint32_t lastFlag = 0; - while (true) { - in.read((char *) p, N * sizeof(BUSData)); - size_t rc = in.gcount() / sizeof(BUSData); - if (rc == 0) { - break; - } - nr += rc; - for (size_t i = 0; i < rc; ++i) { - while (iRead < p[i].flags) { - for (const auto &s : seq) { - int err_kseq_read = kseq_read(s); - if (err_kseq_read == -1) { // Reached EOF - if (iFastq == opt.fastq.size()) { // Done with all files - std::cerr << "Warning: number of reads in FASTQs was less than number of reads in BUS file" << std::endl; - goto end_extract; - } else { - if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) { - std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl; - goto end_extract; - } - } - } else if (err_kseq_read == -2) { - std::cerr << "Error: truncated FASTQ" << std::endl; - goto end_extract; - } - } - ++iRead; - } - if (iRead > p[i].flags) { - std::cerr << "BUS file not sorted by flag" << std::endl; - goto end_extract; - } - - for (int i = 0; i < opt.nFastqs; ++i) { + auto write_seq_to_file = [&opt, &buf, &outFastq] (std::vector &seq) { + for (int i = 0; i < opt.nFastqs; ++i) { int bufLen = 1; // Already have @ character in buffer memcpy(buf + bufLen, seq[i]->name.s, seq[i]->name.l); @@ -127,14 +92,91 @@ void bustools_extract(const Bustools_opt &opt) { buf[bufLen++] = '\n'; - if (gzwrite(outFastq[i], buf, bufLen) != bufLen) { - std::cerr << "Error writing to FASTQ" << std::endl; - goto end_extract; + if (gzwrite(outFastq[i], buf, bufLen) != bufLen) { + return false; + } + } + return true; + }; + + bool tail = false; + bool finished = false; + size_t iFlag = 0; + size_t rc = 0; + + if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) { + std::cerr << "Error reading FASTQ " << opt.fastq[iFastq] << std::endl; + goto end_extract; + } + + // fill in the first N BUS records + in.read((char *) p, N * sizeof(BUSData)); + rc = in.gcount() / sizeof(BUSData); + nr += rc; + tail = rc==0; + + while (true) { + // read in the sequence + if (opt.extract_include && iRead == p[iFlag].flags) { + if (!write_seq_to_file(seq)) { + std::cerr << "Error writing to FASTQ" << std::endl; + goto end_extract; + } + } + + + if (opt.extract_exclude && (iRead < p[iFlag].flags || tail)) { + if (!write_seq_to_file(seq)) { + std::cerr << "Error writing to FASTQ" << std::endl; + goto end_extract; + } + } + + if (!tail && iRead == p[iFlag].flags) { + // read the next flag from the next bus record + iFlag++; + + if (iFlag == rc) { + // read the next batch of bus + in.read((char *) p, N * sizeof(BUSData)); + rc = in.gcount() / sizeof(BUSData); + iFlag = 0; + tail = rc==0; + } + } + // fill the next read + for (const auto &s : seq) { + int err_kseq_read = kseq_read(s); + if (err_kseq_read == -1) { // Reached EOF + // check if we are done with all files + if (iFastq == opt.fastq.size()) { // Done with all files + finished = true; + } else { + if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) { + std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl; + goto end_extract; + } } + } else if (err_kseq_read == -2) { + std::cerr << "Error: truncated FASTQ" << std::endl; + goto end_extract; + } + } + ++iRead; + + if (finished) { + if (iFlag < rc) { + std::cerr << "Warning: number of reads in FASTQs was less than number of reads in BUS file" << std::endl; + goto end_extract; } + break; } + + } + + std::cerr << "Read in " << nr << " BUS records" << std::endl; end_extract: diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp index 2ef8ac4..1b0eb50 100644 --- a/src/bustools_main.cpp +++ b/src/bustools_main.cpp @@ -989,6 +989,8 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt) {"fastq", required_argument, 0, 'f'}, {"nFastqs", required_argument, 0, 'N'}, {"pipe", no_argument, 0, 'p'}, + {"exclude", no_argument, 0, 'x'}, + {"include", no_argument, 0, 'i'}, {0, 0, 0, 0}}; int option_index = 0, c; @@ -1012,6 +1014,13 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt) case '?': opt.parse_error = true; break; + case 'x': + opt.extract_exclude = true; + opt.extract_include = false; + break; + case 'i': + opt.extract_include = true; + break; default: break; } @@ -2551,6 +2560,12 @@ bool check_ProgramOptions_extract(Bustools_opt &opt) ret = false; } } + + if (opt.extract_exclude && opt.extract_include) + { + std::cerr << "Error: cannot specify both --exclude and --include" << std::endl; + ret = false; + } return ret; } @@ -2904,6 +2919,8 @@ void Bustools_extract_Usage() << "-o, --output Output directory for FASTQ files" << std::endl << "-f, --fastq FASTQ file(s) from which to extract reads (comma-separated list)" << std::endl << "-N, --nFastqs Number of FASTQ file(s) per run" << std::endl + << "-x, --exclude Exclude reads in the BUS file from the specified FASTQ file(s)" << std::endl + << "-i, --include Include reads in the BUS file from the specified FASTQ file(s)" << std::endl << std::endl; } From 1ff5a207d8b825f2686468de2c012aa9aec29db2 Mon Sep 17 00:00:00 2001 From: Pall Melsted Date: Wed, 2 Oct 2024 18:43:20 -0700 Subject: [PATCH 3/4] fixes bug when several sets of fastq pairs are used --- src/bustools_extract.cpp | 80 +++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 25 deletions(-) diff --git a/src/bustools_extract.cpp b/src/bustools_extract.cpp index 2f01be4..045a283 100644 --- a/src/bustools_extract.cpp +++ b/src/bustools_extract.cpp @@ -18,7 +18,7 @@ inline bool open_fastqs( for (int i = 0; i < opt.nFastqs; ++i) { gzclose(outFastq[i]); - outFastq[i] = gzopen(std::string(opt.output + "/" + std::to_string(iFastq + 1) + ".fastq.gz").c_str(), "w"); + outFastq[i] = gzopen(std::string(opt.output + "/" + std::to_string(iFastq + 1) + ".fastq.gz").c_str(), "w1"); gzclose(inFastq[i]); inFastq[i] = gzopen(opt.fastq[iFastq].c_str(), "r"); @@ -26,10 +26,7 @@ inline bool open_fastqs( kseq_destroy(seq[i]); } seq[i] = kseq_init(inFastq[i]); - if (kseq_read(seq[i]) < 0) { - return false; - } - + ++iFastq; } return true; @@ -116,7 +113,55 @@ void bustools_extract(const Bustools_opt &opt) { tail = rc==0; while (true) { - // read in the sequence + // fill the next read + + for (int si = 0; si < seq.size(); ++si) { + const auto &s = seq[si]; + int err_kseq_read = kseq_read(s); + if (err_kseq_read == -1) { // Reached EOF + if (si != 0) { + std::cerr << "Error: truncated FASTQ" << std::endl; + goto end_extract; + } else { + // let's make sure that all the files are also EOF + for (int sii = 1; sii < seq.size(); ++sii) { + int err_kseq_read2 = kseq_read(seq[sii]); + if (err_kseq_read2 != -1) { + std::cerr << "Error: truncated FASTQ" << std::endl; + goto end_extract; + } + } + } + // check if we are done with all files + if (iFastq == opt.fastq.size()) { // Done with all files + finished = true; + break; + } else { + if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) { + std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl; + goto end_extract; + } + + // read the first read + err_kseq_read = kseq_read(seq[si]); + if (err_kseq_read == -1) { + finished = true; + break; + } + } + } + + if (err_kseq_read == -2) { + std::cerr << "Error: truncated FASTQ" << std::endl; + goto end_extract; + } + } + + if (finished) { + break; + } + + // inclusion, check if the current read matches the next unproccessed flag if (opt.extract_include && iRead == p[iFlag].flags) { if (!write_seq_to_file(seq)) { std::cerr << "Error writing to FASTQ" << std::endl; @@ -124,7 +169,7 @@ void bustools_extract(const Bustools_opt &opt) { } } - + // exclusion, make sure the current read does not match the next unproccessed flag or that we are in tail mode if (opt.extract_exclude && (iRead < p[iFlag].flags || tail)) { if (!write_seq_to_file(seq)) { std::cerr << "Error writing to FASTQ" << std::endl; @@ -132,6 +177,7 @@ void bustools_extract(const Bustools_opt &opt) { } } + // if we have not exhausted the if (!tail && iRead == p[iFlag].flags) { // read the next flag from the next bus record iFlag++; @@ -140,28 +186,12 @@ void bustools_extract(const Bustools_opt &opt) { // read the next batch of bus in.read((char *) p, N * sizeof(BUSData)); rc = in.gcount() / sizeof(BUSData); + nr += rc; iFlag = 0; tail = rc==0; } } - // fill the next read - for (const auto &s : seq) { - int err_kseq_read = kseq_read(s); - if (err_kseq_read == -1) { // Reached EOF - // check if we are done with all files - if (iFastq == opt.fastq.size()) { // Done with all files - finished = true; - } else { - if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) { - std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl; - goto end_extract; - } - } - } else if (err_kseq_read == -2) { - std::cerr << "Error: truncated FASTQ" << std::endl; - goto end_extract; - } - } + ++iRead; if (finished) { From ead6061cf6a4d6d3c50ed22f101a487df95dcc4e Mon Sep 17 00:00:00 2001 From: Delaney Sullivan Date: Sun, 20 Oct 2024 22:17:32 -0700 Subject: [PATCH 4/4] version bump to 0.44.1 --- src/Common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common.hpp b/src/Common.hpp index d3430c5..28c0dee 100644 --- a/src/Common.hpp +++ b/src/Common.hpp @@ -12,7 +12,7 @@ #include "roaring.hh" #include "hash.hpp" -#define BUSTOOLS_VERSION "0.44.0" +#define BUSTOOLS_VERSION "0.44.1" #define u_map_ std::unordered_map enum CAPTURE_TYPE : char