From 49610061a4f587cea432597c3721d0d7724e0f20 Mon Sep 17 00:00:00 2001 From: Yenaled Date: Fri, 29 Dec 2023 10:22:46 -0800 Subject: [PATCH 1/2] improve bustools count --split for overlapping regions Also cleaned up some of the commands-line help menu --- src/Common.hpp | 2 +- src/bustools_count.cpp | 6 +++--- src/bustools_main.cpp | 25 +++++++++++++------------ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/Common.hpp b/src/Common.hpp index 14bdb87..51451b1 100644 --- a/src/Common.hpp +++ b/src/Common.hpp @@ -12,7 +12,7 @@ #include "roaring.hh" #include "hash.hpp" -#define BUSTOOLS_VERSION "0.43.1" +#define BUSTOOLS_VERSION "0.43.2" #define u_map_ std::unordered_map enum CAPTURE_TYPE : char diff --git a/src/bustools_count.cpp b/src/bustools_count.cpp index 26826aa..801503e 100644 --- a/src/bustools_count.cpp +++ b/src/bustools_count.cpp @@ -26,7 +26,7 @@ void bustools_count(Bustools_opt &opt) { std::vector tx_split; // Store transcript names for split std::vector tx_split_lookup; // Map transcript IDs to mtx status bool count_split = !opt.count_split.empty(); - int count_mtx_priority = !opt.count_gene_multimapping && count_split ? 1 : 0; // 1 = when something in tx_split overlaps something not in tx_split, prioritize the latter (useful for dealing in cases when introns of one gene overlap exons of another gene [we prioritize the exons] + int count_mtx_priority = (!opt.count_gene_multimapping || opt.count_collapse) && count_split ? 1 : 0; // 1 = when something in tx_split overlaps something not in tx_split, prioritize the latter (useful for dealing in cases when introns of one gene overlap exons of another gene [we prioritize the exons] parseTranscripts(opt.count_txp, txnames); std::vector genemap(txnames.size(), -1); u_map_ genenames; @@ -89,9 +89,9 @@ void bustools_count(Bustools_opt &opt) { // Essentially, we are removing all tx's that belong to (or not belong to) tx_split in the equivalence classes // This handles instances in which a read maps to exon of one gene but intron of another (likely overlapping) gene to avoid discarding the record // This is done at read-level (not UMI-level) so if one UMI maps to one gene's exon but another UMI maps to the other gene's intron, we still discard it - if (count_mtx_priority == 1 && !found) + if (count_mtx_priority == 1 && found) new_ec.erase(std::remove(new_ec.begin(), new_ec.end(), tx), new_ec.end()); - else if (count_mtx_priority == 2 && found) + else if (count_mtx_priority == 2 && !found) new_ec.erase(std::remove(new_ec.begin(), new_ec.end(), tx), new_ec.end()); } } diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp index 7ba4b30..2ef8ac4 100644 --- a/src/bustools_main.cpp +++ b/src/bustools_main.cpp @@ -2661,17 +2661,18 @@ void Bustools_Usage() << "count Generate count matrices from a BUS file" << std::endl << "inspect Produce a report summarizing a BUS file" << std::endl << "allowlist Generate an on-list from a BUS file" << std::endl - << "project Project a BUS file to gene sets" << std::endl + //<< "project Project a BUS file to gene sets" << std::endl << "capture Capture records from a BUS file" << std::endl - << "merge Merge bus files from same experiment" << std::endl + //<< "merge Merge bus files from same experiment" << std::endl << "text Convert a binary BUS file to a tab-delimited text file" << std::endl + << "fromtext Convert a tab-delimited text file to a binary BUS file" << std::endl << "extract Extract FASTQ reads correspnding to reads in BUS file" << std::endl - << "predict Correct the count matrix using prediction of unseen species" << std::endl - << "collapse Turn BUS files into a BUG file" << std::endl - << "clusterhist Create UMI histograms per cluster" << std::endl - << "linker Remove section of barcodes in BUS files" << std::endl + //<< "predict Correct the count matrix using prediction of unseen species" << std::endl + //<< "collapse Turn BUS files into a BUG file" << std::endl + //<< "clusterhist Create UMI histograms per cluster" << std::endl + //<< "linker Remove section of barcodes in BUS files" << std::endl << "compress Compress a BUS file" << std::endl - << "inflate Decompress a BUSZ (compressed BUS) file" << std::endl + << "decompress Decompress a BUSZ (compressed BUS) file" << std::endl << "version Prints version number" << std::endl << "cite Prints citation information" << std::endl << std::endl @@ -2787,13 +2788,13 @@ void Bustools_count_Usage() << "-t, --txnames File with names of transcripts" << std::endl << " --genecounts Aggregate counts to genes only" << std::endl << " --umi-gene Perform gene-level collapsing of UMIs" << std::endl - << " --em Estimate gene abundances using EM algorithm" << std::endl + //<< " --em Estimate gene abundances using EM algorithm" << std::endl << " --cm Count multiplicities instead of UMIs" << std::endl << "-s, --split Split output matrix in two (plus ambiguous) based on transcripts supplied in this file" << std::endl << "-m, --multimapping Include bus records that pseudoalign to multiple genes" << std::endl - << " --hist Output copy per UMI histograms for all genes" << std::endl - << "-d --downsample Specify a factor between 0 and 1 specifying how much to downsample" << std::endl - << " --rawcounts The count matrix will contain raw counts instead of UMI counts" << std::endl + //<< " --hist Output copy per UMI histograms for all genes" << std::endl + //<< "-d --downsample Specify a factor between 0 and 1 specifying how much to downsample" << std::endl + //<< " --rawcounts The count matrix will contain raw counts instead of UMI counts" << std::endl << std::endl; } @@ -3164,7 +3165,7 @@ int main(int argc, char **argv) exit(1); } } - else if (cmd == "whitelist" || cmd == "allowlist") + else if (cmd == "whitelist" || cmd == "allowlist" || cmd == "onlist") { if (disp_help) { From cd6bf09f39ac55a8738caee6a349ed0e13655b69 Mon Sep 17 00:00:00 2001 From: Yenaled Date: Wed, 3 Jan 2024 14:19:18 -0800 Subject: [PATCH 2/2] cleaned up code for clarity for release 0.43.2 --- src/bustools_count.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bustools_count.cpp b/src/bustools_count.cpp index 801503e..fc38743 100644 --- a/src/bustools_count.cpp +++ b/src/bustools_count.cpp @@ -91,8 +91,6 @@ void bustools_count(Bustools_opt &opt) { // This is done at read-level (not UMI-level) so if one UMI maps to one gene's exon but another UMI maps to the other gene's intron, we still discard it if (count_mtx_priority == 1 && found) new_ec.erase(std::remove(new_ec.begin(), new_ec.end(), tx), new_ec.end()); - else if (count_mtx_priority == 2 && !found) - new_ec.erase(std::remove(new_ec.begin(), new_ec.end(), tx), new_ec.end()); } } if (count_mtx_priority != 0)