Skip to content

Commit

Permalink
Merge branch 'DOR-608_bed_file_support_in_basecaller' into 'master'
Browse files Browse the repository at this point in the history
DOR-608 Bed-file support in basecaller

Closes DOR-608

See merge request machine-learning/dorado!1045
  • Loading branch information
kdolan1973 committed May 31, 2024
2 parents a56f7f3 + bf0eaed commit f9beb39
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 11 deletions.
16 changes: 14 additions & 2 deletions dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ void setup(const std::vector<std::string>& args,
const std::vector<fs::path>& remora_models,
const std::string& device,
const std::string& ref,
const std::string& bed,
size_t num_runners,
size_t remora_batch_size,
size_t num_remora_threads,
Expand Down Expand Up @@ -285,7 +286,7 @@ void setup(const std::vector<std::string>& args,
if (enable_aligner) {
auto index_file_access = std::make_shared<alignment::IndexFileAccess>();
aligner = pipeline_desc.add_node<AlignerNode>({current_sink_node}, index_file_access, ref,
"", aligner_options,
bed, aligner_options,
thread_allocations.aligner_threads);
current_sink_node = aligner;
}
Expand Down Expand Up @@ -538,6 +539,10 @@ int basecaller(int argc, char* argv[]) {
parser.visible.add_argument("--reference")
.help("Path to reference for alignment.")
.default_value(std::string(""));
parser.visible.add_argument("--bed-file")
.help("Optional bed-file. If specified, overlaps between the alignments and bed-file "
"entries will be counted, and recorded in BAM output using the 'bh' read tag.")
.default_value(std::string(""));

parser.visible.add_argument("--kit-name")
.help("Enable barcoding with the provided kit name. Choose from: " +
Expand Down Expand Up @@ -632,6 +637,12 @@ int basecaller(int argc, char* argv[]) {
return EXIT_FAILURE;
}

if (parser.visible.get<std::string>("--reference").empty() &&
!parser.visible.get<std::string>("--bed-file").empty()) {
spdlog::error("--bed-file cannot be used without --reference.");
return EXIT_FAILURE;
}

if (emit_fastq) {
if (model_selection.has_mods_variant() || !mod_bases.empty() || !mod_bases_models.empty()) {
spdlog::error(
Expand Down Expand Up @@ -759,7 +770,8 @@ int basecaller(int argc, char* argv[]) {

try {
setup(args, model_config, data, mods_model_paths, device,
parser.visible.get<std::string>("--reference"), default_parameters.num_runners,
parser.visible.get<std::string>("--reference"),
parser.visible.get<std::string>("--bed-file"), default_parameters.num_runners,
default_parameters.remora_batchsize, default_parameters.remora_threads,
methylation_threshold, output_mode, parser.visible.get<bool>("--emit-moves"),
parser.visible.get<int>("--max-reads"), parser.visible.get<int>("--min-qscore"),
Expand Down
12 changes: 5 additions & 7 deletions dorado/read_pipeline/AlignerNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ AlignerNode::AlignerNode(std::shared_ptr<alignment::IndexFileAccess> index_file_
m_index_file_access(std::move(index_file_access)) {
auto header_sequence_records = m_index_for_bam_messages->get_sequence_records_for_header();
if (!bed_file.empty()) {
m_bed_file_for_bam_messages.load(bed_file);
m_bed_file.load(bed_file);
for (const auto& entry : header_sequence_records) {
m_header_sequences_for_bam_messages.emplace_back(entry.first);
m_header_sequence_names.emplace_back(entry.first);
}
}
start_input_processing(&AlignerNode::input_thread_fn, this);
Expand Down Expand Up @@ -130,11 +130,9 @@ void AlignerNode::input_thread_fn() {
auto records = alignment::Minimap2Aligner(m_index_for_bam_messages)
.align(bam_message.bam_ptr.get(), tbuf);
for (auto& record : records) {
if (!m_bed_file_for_bam_messages.filename().empty() &&
!(record->core.flag & BAM_FUNMAP)) {
if (!m_bed_file.filename().empty() && !(record->core.flag & BAM_FUNMAP)) {
auto ref_id = record->core.tid;
add_bed_hits_to_record(m_header_sequences_for_bam_messages.at(ref_id),
record.get());
add_bed_hits_to_record(m_header_sequence_names.at(ref_id), record.get());
}
send_message_to_sink(BamMessage{std::move(record), bam_message.client_info});
}
Expand All @@ -157,7 +155,7 @@ void AlignerNode::add_bed_hits_to_record(const std::string& genome, bam1_t* reco
size_t genome_end = bam_endpos(record);
char direction = (bam_is_rev(record)) ? '-' : '+';
int bed_hits = 0;
for (const auto& interval : m_bed_file_for_bam_messages.entries(genome)) {
for (const auto& interval : m_bed_file.entries(genome)) {
if (!(interval.start >= genome_end || interval.end <= genome_start) &&
(interval.strand == direction || interval.strand == '.')) {
bed_hits++;
Expand Down
4 changes: 2 additions & 2 deletions dorado/read_pipeline/AlignerNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ class AlignerNode : public MessageSink {
void add_bed_hits_to_record(const std::string& genome, bam1_t* record);

std::shared_ptr<const alignment::Minimap2Index> m_index_for_bam_messages{};
std::vector<std::string> m_header_sequences_for_bam_messages{};
std::vector<std::string> m_header_sequence_names{};
std::shared_ptr<alignment::IndexFileAccess> m_index_file_access{};
alignment::BedFile m_bed_file_for_bam_messages{};
alignment::BedFile m_bed_file{};
};

} // namespace dorado

0 comments on commit f9beb39

Please sign in to comment.