From 188a585826c65c58f39c7b21b90565380a8b5055 Mon Sep 17 00:00:00 2001 From: Kevin Dolan Date: Wed, 22 May 2024 10:09:07 +0100 Subject: [PATCH 1/3] Partial changes needed to support bedfiles in basecaller. --- dorado/alignment/BedFile.h | 1 + dorado/cli/basecaller.cpp | 16 ++++++++++++++-- dorado/read_pipeline/AlignerNode.cpp | 25 ++++++++++++++++++------- dorado/read_pipeline/AlignerNode.h | 5 +++-- 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/dorado/alignment/BedFile.h b/dorado/alignment/BedFile.h index 8d3c0aef..172194e2 100644 --- a/dorado/alignment/BedFile.h +++ b/dorado/alignment/BedFile.h @@ -30,6 +30,7 @@ class BedFile { ~BedFile() = default; bool load(const std::string& index_filename); + bool empty() const { return m_file_name.empty(); } const Entries& entries(const std::string& genome) const; diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index d22894ce..4052f0ce 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -122,6 +122,7 @@ void setup(const std::vector& args, const std::vector& remora_models, const std::string& device, const std::string& ref, + const std::string& bed, size_t num_runners, size_t remora_batch_size, size_t num_remora_threads, @@ -228,7 +229,7 @@ void setup(const std::vector& args, if (enable_aligner) { auto index_file_access = std::make_shared(); aligner = pipeline_desc.add_node({current_sink_node}, index_file_access, ref, - "", aligner_options, + bed, aligner_options, thread_allocations.aligner_threads); current_sink_node = aligner; } @@ -480,6 +481,10 @@ int basecaller(int argc, char* argv[]) { parser.visible.add_argument("--reference") .help("Path to reference for alignment.") .default_value(std::string("")); + parser.visible.add_argument("--bed-file") + .help("Optional bed-file. If specified, overlaps between the alignments and bed-file " + "entries will be counted, and recorded in BAM output using the 'bh' read tag.") + .default_value(std::string("")); parser.visible.add_argument("--kit-name") .help("Enable barcoding with the provided kit name. Choose from: " + @@ -569,6 +574,12 @@ int basecaller(int argc, char* argv[]) { return EXIT_FAILURE; } + if (parser.visible.get("--reference").empty() && + !parser.visible.get("--bed-file").empty()) { + spdlog::error("--bed-file cannot be used without --reference."); + return EXIT_FAILURE; + } + if (emit_fastq) { if (model_selection.has_mods_variant() || !mod_bases.empty() || !mod_bases_models.empty()) { spdlog::error( @@ -689,7 +700,8 @@ int basecaller(int argc, char* argv[]) { try { setup(args, model_config, data, mods_model_paths, device, - parser.visible.get("--reference"), default_parameters.num_runners, + parser.visible.get("--reference"), + parser.visible.get("--bed-file"), default_parameters.num_runners, default_parameters.remora_batchsize, default_parameters.remora_threads, methylation_threshold, output_mode, parser.visible.get("--emit-moves"), parser.visible.get("--max-reads"), parser.visible.get("--min-qscore"), diff --git a/dorado/read_pipeline/AlignerNode.cpp b/dorado/read_pipeline/AlignerNode.cpp index 62719fd2..1ffa0ae4 100644 --- a/dorado/read_pipeline/AlignerNode.cpp +++ b/dorado/read_pipeline/AlignerNode.cpp @@ -41,6 +41,8 @@ std::shared_ptr load_and_get_index( return index_file_access.get_index(index_file, options); } +std::string extract_genome_from_alignment_string(const std::string& alignment_string) {} + } // namespace namespace dorado { @@ -56,9 +58,9 @@ AlignerNode::AlignerNode(std::shared_ptr index_file_ m_index_file_access(std::move(index_file_access)) { auto header_sequence_records = m_index_for_bam_messages->get_sequence_records_for_header(); if (!bed_file.empty()) { - m_bed_file_for_bam_messages.load(bed_file); + m_bed_file.load(bed_file); for (const auto& entry : header_sequence_records) { - m_header_sequences_for_bam_messages.emplace_back(entry.first); + m_header_sequence_names.emplace_back(entry.first); } } start_input_processing(&AlignerNode::input_thread_fn, this); @@ -108,6 +110,15 @@ void AlignerNode::align_read_common(ReadCommon& read_common, mm_tbuf_t* tbuf) { } alignment::Minimap2Aligner(index).align(read_common, tbuf); + if (!m_bed_file.empty()) { + // Note that this is only used by dorado basecaller and dorado aligner. For the + // basecall server m_bed_file is always empty, and bed-file hits are checked in + // the core-cpp code. + auto genome = extract_genome_from_alignment_string(read_common.alignment_string); + if (!genome.empty()) { + add_bed_hits_to_read(genome, read_common); + } + } } void AlignerNode::input_thread_fn() { @@ -123,11 +134,9 @@ void AlignerNode::input_thread_fn() { auto records = alignment::Minimap2Aligner(m_index_for_bam_messages) .align(bam_message.bam_ptr.get(), tbuf); for (auto& record : records) { - if (!m_bed_file_for_bam_messages.filename().empty() && - !(record->core.flag & BAM_FUNMAP)) { + if (!m_bed_file.filename().empty() && !(record->core.flag & BAM_FUNMAP)) { auto ref_id = record->core.tid; - add_bed_hits_to_record(m_header_sequences_for_bam_messages.at(ref_id), - record.get()); + add_bed_hits_to_record(m_header_sequence_names.at(ref_id), record.get()); } send_message_to_sink(BamMessage{std::move(record), bam_message.client_info}); } @@ -150,7 +159,7 @@ void AlignerNode::add_bed_hits_to_record(const std::string& genome, bam1_t* reco size_t genome_end = bam_endpos(record); char direction = (bam_is_rev(record)) ? '-' : '+'; int bed_hits = 0; - for (const auto& interval : m_bed_file_for_bam_messages.entries(genome)) { + for (const auto& interval : m_bed_file.entries(genome)) { if (!(interval.start >= genome_end || interval.end <= genome_start) && (interval.strand == direction || interval.strand == '.')) { bed_hits++; @@ -160,4 +169,6 @@ void AlignerNode::add_bed_hits_to_record(const std::string& genome, bam1_t* reco bam_aux_append(record, "bh", 'i', sizeof(bed_hits), (uint8_t*)&bed_hits); } +void AlignerNode::add_bed_hits_to_read(const std::string& genome, ReadCommon& read_common) {} + } // namespace dorado diff --git a/dorado/read_pipeline/AlignerNode.h b/dorado/read_pipeline/AlignerNode.h index e87fa4cd..88661d31 100644 --- a/dorado/read_pipeline/AlignerNode.h +++ b/dorado/read_pipeline/AlignerNode.h @@ -42,11 +42,12 @@ class AlignerNode : public MessageSink { std::shared_ptr get_index(const ClientInfo& client_info); void align_read_common(ReadCommon& read_common, mm_tbuf_t* tbuf); void add_bed_hits_to_record(const std::string& genome, bam1_t* record); + void add_bed_hits_to_read(const std::string& genome, ReadCommon& read_common); std::shared_ptr m_index_for_bam_messages{}; - std::vector m_header_sequences_for_bam_messages{}; + std::vector m_header_sequence_names{}; std::shared_ptr m_index_file_access{}; - alignment::BedFile m_bed_file_for_bam_messages{}; + alignment::BedFile m_bed_file{}; }; } // namespace dorado From fc5ec7a6b8cb1a624ff4cdb60b5396bb52bbd275 Mon Sep 17 00:00:00 2001 From: Kevin Dolan Date: Thu, 30 May 2024 14:03:46 +0100 Subject: [PATCH 2/3] Removed unused function. --- dorado/read_pipeline/AlignerNode.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/dorado/read_pipeline/AlignerNode.cpp b/dorado/read_pipeline/AlignerNode.cpp index 1851b028..b2a2c4b1 100644 --- a/dorado/read_pipeline/AlignerNode.cpp +++ b/dorado/read_pipeline/AlignerNode.cpp @@ -43,8 +43,6 @@ std::shared_ptr load_and_get_index( return index_file_access.get_index(index_file, options); } -std::string extract_genome_from_alignment_string(const std::string& alignment_string) {} - } // namespace namespace dorado { From bf0eaed86ab3bca31ccb1b7d1156f443c131906d Mon Sep 17 00:00:00 2001 From: Kevin Dolan Date: Thu, 30 May 2024 14:14:07 +0100 Subject: [PATCH 3/3] Removed unneeded function. --- dorado/alignment/BedFile.h | 1 - 1 file changed, 1 deletion(-) diff --git a/dorado/alignment/BedFile.h b/dorado/alignment/BedFile.h index 172194e2..8d3c0aef 100644 --- a/dorado/alignment/BedFile.h +++ b/dorado/alignment/BedFile.h @@ -30,7 +30,6 @@ class BedFile { ~BedFile() = default; bool load(const std::string& index_filename); - bool empty() const { return m_file_name.empty(); } const Entries& entries(const std::string& genome) const;