diff --git a/src/derive.rs b/src/derive.rs index 104bfe3..6a28e5f 100644 --- a/src/derive.rs +++ b/src/derive.rs @@ -6,3 +6,4 @@ pub mod endedness; pub mod instrument; pub mod junction_annotation; pub mod readlen; +pub mod strandedness; diff --git a/src/derive/command.rs b/src/derive/command.rs index 5d4a593..8ded0d3 100644 --- a/src/derive/command.rs +++ b/src/derive/command.rs @@ -5,6 +5,7 @@ pub mod endedness; pub mod instrument; pub mod junction_annotation; pub mod readlen; +pub mod strandedness; use clap::Args; use clap::Subcommand; @@ -36,6 +37,10 @@ pub enum DeriveSubcommand { /// Derives the read length of the file. Readlen(self::readlen::DeriveReadlenArgs), + /// Derives the strandedness of the RNA-Seq file. + /// This subcommand requires a GFF file. + Strandedness(self::strandedness::DeriveStrandednessArgs), + /// Annotates junctions in the file. /// This subcommand requires a GFF file with features to annotate. /// This subcommand does not "derive" anything, but is included here for diff --git a/src/derive/command/endedness.rs b/src/derive/command/endedness.rs index 7a87136..bc65456 100644 --- a/src/derive/command/endedness.rs +++ b/src/derive/command/endedness.rs @@ -14,14 +14,13 @@ use tracing::info; use tracing::trace; use crate::derive::endedness::compute; -use crate::derive::endedness::compute::{ - validate_read_group_info, OrderingFlagsCounts, OVERALL, UNKNOWN_READ_GROUP, -}; +use crate::derive::endedness::compute::OrderingFlagsCounts; use crate::utils::args::arg_in_range as deviance_in_range; use crate::utils::args::NumberOfRecords; use crate::utils::display::RecordCounter; use crate::utils::formats::bam::ParsedBAMFile; use crate::utils::formats::utils::IndexCheck; +use crate::utils::read_groups::{validate_read_group_info, OVERALL, UNKNOWN_READ_GROUP}; /// Clap arguments for the `ngs derive endedness` subcommand. 
#[derive(Args)] diff --git a/src/derive/command/junction_annotation.rs b/src/derive/command/junction_annotation.rs index fa1d41b..ac50cb4 100644 --- a/src/derive/command/junction_annotation.rs +++ b/src/derive/command/junction_annotation.rs @@ -31,34 +31,34 @@ pub struct JunctionAnnotationArgs { /// Name of the exon region feature for the gene model used. #[arg(long, value_name = "STRING", default_value = "exon")] - pub exon_feature_name: String, + exon_feature_name: String, /// Minimum intron length to consider. /// An intron is defined as an `N` CIGAR operation of any length. #[arg(short = 'i', long, value_name = "USIZE", default_value = "50")] - pub min_intron_length: usize, + min_intron_length: usize, /// Minimum number of reads supporting a junction to be considered. - #[arg(short = 'r', long, value_name = "usize", default_value = "2")] - pub min_read_support: usize, + #[arg(short = 'r', long, value_name = "USIZE", default_value = "2")] + min_read_support: usize, /// Minumum mapping quality for a record to be considered. /// Set to 0 to disable this filter and allow reads _without_ /// a mapping quality to be considered. #[arg(short, long, value_name = "U8", default_value = "30")] - pub min_mapq: u8, + min_mapq: u8, /// Do not count supplementary alignments. #[arg(long)] - pub no_supplementary: bool, + no_supplementary: bool, /// Do count secondary alignments. #[arg(long)] - pub count_secondary: bool, + count_secondary: bool, /// Do count duplicates. #[arg(long)] - pub count_duplicates: bool, + count_duplicates: bool, } /// Main function for the `ngs derive junction_annotation` subcommand. @@ -131,7 +131,7 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> { // (3) Summarize found junctions. 
compute::summarize(&mut results, &params); - // (3) Print the output to stdout as JSON (more support for different output + // (4) Print the output to stdout as JSON (more support for different output // types may be added in the future, but for now, only JSON). let output = serde_json::to_string_pretty(&results).unwrap(); print!("{}", output); diff --git a/src/derive/command/strandedness.rs b/src/derive/command/strandedness.rs new file mode 100644 index 0000000..6d4ab7b --- /dev/null +++ b/src/derive/command/strandedness.rs @@ -0,0 +1,213 @@ +//! Functionality relating to the `ngs derive strandedness` subcommand itself. + +use std::collections::HashMap; +use std::fs::File; +use std::path::PathBuf; + +use anyhow::bail; +use anyhow::Context; +use clap::Args; +use noodles::bam; +use noodles::gff; +use noodles::sam; +use rust_lapper::{Interval, Lapper}; +use tracing::debug; +use tracing::info; + +use crate::derive::strandedness::compute; +use crate::derive::strandedness::compute::ParsedBAMFile; +use crate::derive::strandedness::compute::StrandednessFilters; +use crate::utils::formats; + +/// Clap arguments for the `ngs derive strandedness` subcommand. +#[derive(Args)] +pub struct DeriveStrandednessArgs { + /// Source BAM. + #[arg(value_name = "BAM")] + src: PathBuf, + + /// Features GFF file. + #[arg(short = 'f', long, required = true, value_name = "PATH")] + features_gff: PathBuf, + + /// When inconclusive, the test will repeat until this many tries have been reached. + /// Evidence of previous attempts is saved and reused, + /// leading to a larger sample size with multiple attempts. + #[arg(long, value_name = "USIZE", default_value = "3")] + max_tries: usize, + + /// Filter any genes that don't have at least `m` reads. + #[arg(short = 'm', long, value_name = "USIZE", default_value = "10")] + min_reads_per_gene: usize, + + /// How many genes to sample.
+ #[arg(short = 'n', long, value_name = "USIZE", default_value = "1000")] + num_genes: usize, + + /// Minimum mapping quality for a record to be considered. + /// Set to 0 to disable this filter and allow reads _without_ + /// a mapping quality to be considered. + #[arg(short = 'q', long, value_name = "U8", default_value = "30")] + min_mapq: u8, + + /// Consider all genes, not just protein coding genes. + #[arg(long)] + all_genes: bool, + + /// Name of the gene region feature for the gene model used. + #[arg(long, value_name = "STRING", default_value = "gene")] + gene_feature_name: String, + + /// Name of the exon region feature for the gene model used. + #[arg(long, value_name = "STRING", default_value = "exon")] + exon_feature_name: String, + + /// Do not count supplementary alignments. + #[arg(long)] + no_supplementary: bool, + + /// Do count secondary alignments. + #[arg(long)] + count_secondary: bool, + + /// Do count duplicates. + #[arg(long)] + count_duplicates: bool, + + /// Do count QC failed reads. + #[arg(long)] + count_qc_failed: bool, + + /// At most, search this many times for genes that satisfy our search criteria. + /// Default is 10 * --num-genes. + #[arg(long, value_name = "USIZE")] + max_iterations_per_try: Option<usize>, +} + +/// Main function for the `ngs derive strandedness` subcommand. +pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> { + info!("Starting derive strandedness subcommand."); + + // (1) Parse the GFF file and collect all gene features.
+ debug!("Reading all records in GFF."); + let mut gff = formats::gff::open(&args.features_gff) + .with_context(|| format!("opening GFF file: {}", args.features_gff.display()))?; + + let mut gene_records = Vec::new(); + let mut exon_records = Vec::new(); + let mut gene_metrics = compute::GeneRecordMetrics::default(); + let mut exon_metrics = compute::ExonRecordMetrics::default(); + for result in gff.records() { + let record = result.unwrap(); + if record.ty() == args.gene_feature_name { + // If --all-genes is set, keep the record. + // Otherwise, check the gene type or biotype and keep the record if it's protein coding. + // If the record does not have a gene type or biotype, discard it. + let mut keep_record = false; + if !args.all_genes { + let mut gene_type_value = None; + for entry in record.attributes().as_ref() { + gene_type_value = match entry.key() { + "gene_type" => Some(entry.value()), // Gencode + "gene_biotype" => Some(entry.value()), // ENSEMBL + "biotype" => Some(entry.value()), // also ENSEMBL + _ => gene_type_value, + }; + } + if let Some(gene_type_value) = gene_type_value { + if gene_type_value.to_lowercase().contains("protein") { + keep_record = true; + gene_metrics.protein_coding += 1; + } + } + } + gene_metrics.total += 1; + if keep_record { + gene_records.push(record); + } + } else if record.ty() == args.exon_feature_name { + exon_metrics.total += 1; + exon_records.push(record); + } + } + + debug!("Tabulating GFF gene and exon features."); + + if gene_records.is_empty() { + bail!("No gene records matched criteria. Check your GFF file and `--gene-feature-name` and `--all-genes` options."); + } + if exon_records.is_empty() { + bail!("No exon records matched criteria. 
Check your GFF file and `--exon-feature-name` option."); + } + + let mut exon_intervals: HashMap<&str, Vec<Interval<usize, gff::record::Strand>>> = + HashMap::new(); + for record in &exon_records { + let seq_name = record.reference_sequence_name(); + let start: usize = record.start().into(); + let stop: usize = record.end().into(); + let strand = record.strand(); + + exon_intervals.entry(seq_name).or_default().push(Interval { + start, + stop, + val: strand, + }); + } + + let mut exons: HashMap<&str, Lapper<usize, gff::record::Strand>> = HashMap::new(); + for (seq_name, intervals) in exon_intervals { + exons.insert(seq_name, Lapper::new(intervals)); + } + + debug!("Done reading GFF."); + + let mut reader = File::open(&args.src) + .map(bam::Reader::new) + .with_context(|| format!("opening BAM file: {}", args.src.display()))?; + let header = reader.read_header()?.parse()?; + let index = bam::bai::read(&args.src.with_extension("bam.bai")).with_context(|| { + format!( + "reading BAM index: {}", + args.src.with_extension("bam.bai").display() + ) + })?; + + let parsed_bam = ParsedBAMFile { + reader, + header, + index, + }; + + let filters = StrandednessFilters { + min_reads_per_gene: args.min_reads_per_gene, + min_mapq: args.min_mapq, + count_qc_failed: args.count_qc_failed, + no_supplementary: args.no_supplementary, + count_secondary: args.count_secondary, + count_duplicates: args.count_duplicates, + }; + + let max_iterations_per_try = args.max_iterations_per_try.unwrap_or(args.num_genes * 10); + let max_iterations_per_try = match max_iterations_per_try > gene_records.len() { + true => gene_records.len(), + false => max_iterations_per_try, + }; + + for try_num in 1..=args.max_tries { + info!("Starting try {} of {}", try_num, args.max_tries); + + compute::predict( + &parsed_bam, + &mut gene_records, + &exons, + max_iterations_per_try, + args.num_genes, + &filters, + &mut gene_metrics, + &mut exon_metrics, + )?; + } + + anyhow::Ok(()) +} diff --git a/src/derive/encoding/compute.rs b/src/derive/encoding/compute.rs index 40d9366..eb60962
100644 --- a/src/derive/encoding/compute.rs +++ b/src/derive/encoding/compute.rs @@ -72,7 +72,7 @@ pub fn predict(score_set: HashSet) -> Result bail!("This shouldn't be possible!"), + _ => unreachable!(), } anyhow::Ok(result) diff --git a/src/derive/endedness/compute.rs b/src/derive/endedness/compute.rs index 5990683..6ef50c9 100644 --- a/src/derive/endedness/compute.rs +++ b/src/derive/endedness/compute.rs @@ -1,22 +1,12 @@ //! Module holding the logic for computing the endedness of a BAM. -use lazy_static::lazy_static; -use noodles::sam::header; use serde::Serialize; use std::collections::HashMap; use std::collections::HashSet; use std::sync::Arc; use tracing::warn; -// Strings used to index into the HashMaps used to store the Read Group ordering flags. -// Lazy statics are used to save memory. -lazy_static! { - /// String used to index into the HashMaps used to store the "overall" ordering flags. - pub static ref OVERALL: Arc = Arc::new(String::from("overall")); - - /// String used to index into th HashMaps used to store the "unknown_read_group" ordering flags. - pub static ref UNKNOWN_READ_GROUP: Arc = Arc::new(String::from("unknown_read_group")); -} +use crate::utils::read_groups::{OVERALL, UNKNOWN_READ_GROUP}; /// Struct holding the ordering flags for a single read group. #[derive(Debug, Clone)] @@ -157,44 +147,6 @@ impl DerivedEndednessResult { } } -/// Compares the read group tags found in the records -/// and the read groups found in the header. -/// Returns a vector of read group names that were found in the header -/// but not in the records. 
-pub fn validate_read_group_info( - found_rgs: &HashSet>, - header: &header::Header, -) -> Vec { - let mut rgs_in_header_not_records = Vec::new(); - let mut rgs_in_records_not_header = Vec::new(); - - for (rg_id, _) in header.read_groups() { - if !found_rgs.contains(rg_id) { - rgs_in_header_not_records.push(rg_id.to_string()); - } - } - if !rgs_in_header_not_records.is_empty() { - warn!( - "The following read groups were not found in the file: {:?}", - rgs_in_header_not_records - ); - } - - for rg_id in found_rgs { - if !header.read_groups().contains_key(rg_id.as_str()) { - rgs_in_records_not_header.push(rg_id.to_string()); - } - } - if !rgs_in_records_not_header.is_empty() { - warn!( - "The following read groups were not found in the header: {:?}", - rgs_in_records_not_header - ); - } - - rgs_in_header_not_records -} - fn calculate_reads_per_template( read_names: HashMap>>, ) -> HashMap, f64> { diff --git a/src/derive/strandedness.rs b/src/derive/strandedness.rs new file mode 100644 index 0000000..0551408 --- /dev/null +++ b/src/derive/strandedness.rs @@ -0,0 +1,3 @@ +//! Supporting functionality for the `ngs derive strandedness` subcommand. + +pub mod compute; diff --git a/src/derive/strandedness/compute.rs b/src/derive/strandedness/compute.rs new file mode 100644 index 0000000..52e7dbc --- /dev/null +++ b/src/derive/strandedness/compute.rs @@ -0,0 +1,344 @@ +//! Module holding the logic for computing the strandedness. + +use anyhow::bail; +use noodles::bam; +use noodles::core::{Position, Region}; +use noodles::gff; +use noodles::sam; +use rand::Rng; +use rust_lapper::{Interval, Lapper}; +use serde::Serialize; +use std::collections::HashMap; + +/// General gene metrics that are tallied as a part of the +/// strandedness subcommand. +#[derive(Clone, Default, Serialize)] +pub struct GeneRecordMetrics { + /// The total number of genes found in the GFF. + pub total: usize, + + /// The number of genes that were found to be protein coding. 
+ /// If --all-genes is set this will not be tallied. + pub protein_coding: usize, + + /// The number of genes which were discarded due to having + /// exons on both strands. + pub exons_on_both_strands: usize, + + /// The number of genes which were discarded due to not having + /// enough reads. + pub not_enough_reads: usize, +} + +/// General exon metrics that are tallied as a part of the +/// strandedness subcommand. +#[derive(Clone, Default, Serialize)] +pub struct ExonRecordMetrics { + /// The total number of exons found in the GFF. + pub total: usize, +} + +/// General read record metrics that are tallied as a part of the +/// strandedness subcommand. +#[derive(Clone, Default, Serialize)] +pub struct ReadRecordMetrics { + /// The number of records that have been filtered because of their flags. + /// (i.e. they were qc_fail, duplicates, secondary, or supplementary) + /// These conditions can be toggled on/off with CL flags + pub ignored_flags: usize, + + /// The number of records that have been filtered because + /// they failed the MAPQ filter. + pub low_mapq: usize, + + /// The number of records whose MAPQ couldn't be parsed and were thus discarded. + pub missing_mapq: usize, + + /// The number of records determined to be Paired-End. + pub paired_end_reads: usize, + + /// The number of records determined to be Single-End. + pub single_end_reads: usize, +} + +/// Struct for tracking count results. +#[derive(Clone, Default)] +struct Counts { + /// The number of reads determined to be Paired-End. + paired_end_reads: usize, + + /// The number of reads determined to be Single-End. + single_end_reads: usize, + + /// The number of reads that are evidence of Forward Strandedness. + forward: usize, + + /// The number of reads that are evidence of Reverse Strandedness. + reverse: usize, +} + +/// Struct holding the per read group results for an `ngs derive strandedness` +/// subcommand call. 
+#[derive(Debug, Serialize)] +pub struct ReadGroupDerivedStrandednessResult { + /// Name of the read group. + pub read_group: String, + + /// Whether or not strandedness was determined for this read group. + pub succeeded: bool, + + /// The strandedness of this read group or "Inconclusive". + pub strandedness: String, + + /// The total number of reads in this read group. + pub total: usize, + + /// The number of reads that are evidence of Forward Strandedness. + pub forward: usize, + + /// The number of reads that are evidence of Reverse Strandedness. + pub reverse: usize, + + /// The percent of evidence for Forward Strandedness. + pub forward_pct: f64, + + /// The percent of evidence for Reverse Strandedness. + pub reverse_pct: f64, +} + +impl ReadGroupDerivedStrandednessResult { + /// Creates a new [`ReadGroupDerivedStrandednessResult`]. + fn new( + read_group: String, + succeeded: bool, + strandedness: String, + forward: usize, + reverse: usize, + ) -> Self { + ReadGroupDerivedStrandednessResult { + read_group, + succeeded, + strandedness, + total: forward + reverse, + forward, + reverse, + forward_pct: (forward as f64 / (forward + reverse) as f64) * 100.0, + reverse_pct: (reverse as f64 / (forward + reverse) as f64) * 100.0, + } + } +} + +/// Struct holding the final results for an `ngs derive strandedness` subcommand +/// call. +#[derive(Debug, Serialize)] +pub struct DerivedStrandednessResult { + /// Whether or not the `ngs derive strandedness` subcommand succeeded. + pub succeeded: bool, + + /// The strandedness of this read group or "Inconclusive". + pub strandedness: String, + + /// The total number of reads. + pub total: usize, + + /// The number of reads that are evidence of Forward Strandedness. + pub forward: usize, + + /// The number of reads that are evidence of Reverse Strandedness. + pub reverse: usize, + + /// The percent of evidence for Forward Strandedness. + pub forward_pct: f64, + + /// The percent of evidence for Reverse Strandedness. 
+ pub reverse_pct: f64, + + /// Vector of [`ReadGroupDerivedStrandednessResult`]s. + /// One for each read group in the BAM, + /// and potentially one for any reads with an unknown read group. + pub read_groups: Vec<ReadGroupDerivedStrandednessResult>, +} + +impl DerivedStrandednessResult { + /// Creates a new [`DerivedStrandednessResult`]. + fn new( + succeeded: bool, + strandedness: String, + forward: usize, + reverse: usize, + read_groups: Vec<ReadGroupDerivedStrandednessResult>, + ) -> Self { + DerivedStrandednessResult { + succeeded, + strandedness, + total: forward + reverse, + forward, + reverse, + forward_pct: (forward as f64 / (forward + reverse) as f64) * 100.0, + reverse_pct: (reverse as f64 / (forward + reverse) as f64) * 100.0, + read_groups, + } + } +} + +/// Struct holding the parsed BAM file and its index. +pub struct ParsedBAMFile { + pub reader: bam::Reader<std::io::BufReader<std::fs::File>>, + pub header: sam::Header, + pub index: bam::bai::Index, +} + +/// Filters defining how to calculate strandedness. +pub struct StrandednessFilters { + /// Minimum number of reads mapped to a gene to be considered + /// for evidence of strandedness. + pub min_reads_per_gene: usize, + + /// Minimum mapping quality for a record to be considered. + /// 0 if MAPQ shouldn't be considered. + pub min_mapq: u8, + + /// Allow qc failed reads to be counted. + pub count_qc_failed: bool, + + /// Do not count supplementary alignments. + pub no_supplementary: bool, + + /// Do count secondary alignments. + pub count_secondary: bool, + + /// Do count duplicates.
+ pub count_duplicates: bool, +} + +fn disqualify_gene( + gene: &gff::Record, + exons: &HashMap<&str, Lapper<usize, gff::record::Strand>>, +) -> bool { + let gene_strand = gene.strand(); + let mut all_on_same_strand = true; + let mut at_least_one_exon = false; + + if let Some(intervals) = exons.get(gene.reference_sequence_name()) { + for exon in intervals.find(gene.start().into(), gene.end().into()) { + at_least_one_exon = true; + if exon.val != gene_strand { + all_on_same_strand = false; + break; + } + } + } + + if all_on_same_strand && at_least_one_exon { + return false; + } + true +} + +fn query_filtered_reads( + parsed_bam: &ParsedBAMFile, + gene: &gff::Record, + filters: &StrandednessFilters, + read_metrics: &mut ReadRecordMetrics, +) -> Vec<sam::alignment::Record> { + let start = Position::from(gene.start()); + let end = Position::from(gene.end()); + let region = Region::new(gene.reference_sequence_name(), start..=end); + + let mut filtered_reads = Vec::new(); + + // NOTE(review): noodles' bam::Reader::query takes &mut self — confirm this compiles with only a shared &ParsedBAMFile in scope. + let query = parsed_bam + .reader + .query(&parsed_bam.header, &parsed_bam.index, &region) + .unwrap(); + for read in query { + let read = read.unwrap(); + + // (1) Parse the flags so we can see if the read should be discarded. + let flags = read.flags(); + if (!filters.count_qc_failed && flags.is_qc_fail()) + || (filters.no_supplementary && flags.is_supplementary()) + || (!filters.count_secondary && flags.is_secondary()) + || (!filters.count_duplicates && flags.is_duplicate()) + { + read_metrics.ignored_flags += 1; + continue; + } + + // (2) If the user is filtering by MAPQ, check if this read passes.
+ if filters.min_mapq > 0 { + match read.mapping_quality() { + Some(mapq) => { + if mapq.get() < filters.min_mapq { + read_metrics.low_mapq += 1; + continue; + } + } + None => { + read_metrics.missing_mapq += 1; + continue; + } + } + } + + filtered_reads.push(read); + } + + if filtered_reads.len() < filters.min_reads_per_gene { + filtered_reads.clear(); + } + + return filtered_reads; +} + +// fn classify_read( +// read: &sam::alignment::Record, +// gene_strand: &gff::record::Strand, +// ) -> { +// // TODO +// } + +/// Main method to evaluate the observed strand state and +/// return a result for the derived strandedness. This may fail, and the +/// resulting [`DerivedStrandednessResult`] should be evaluated accordingly. +pub fn predict( + parsed_bam: &ParsedBAMFile, + gene_records: &mut Vec<gff::Record>, + exons: &HashMap<&str, Lapper<usize, gff::record::Strand>>, + max_iterations_per_try: usize, + num_genes: usize, + filters: &StrandednessFilters, + gene_metrics: &mut GeneRecordMetrics, + exon_metrics: &mut ExonRecordMetrics, +) -> anyhow::Result<DerivedStrandednessResult> { + let mut rng = rand::thread_rng(); + let mut num_tested_genes: usize = 0; + let mut read_metrics = ReadRecordMetrics::default(); + + for _ in 0..max_iterations_per_try { + // >= so that no more than num_genes genes are tested. + if num_tested_genes >= num_genes { + break; + } + + let cur_gene = gene_records.swap_remove(rng.gen_range(0..gene_records.len())); + + if disqualify_gene(&cur_gene, exons) { + gene_metrics.exons_on_both_strands += 1; + continue; + } + + let mut enough_reads = false; + for read in query_filtered_reads(parsed_bam, &cur_gene, filters, &mut read_metrics) { + enough_reads = true; + + // TODO classify_read(&read, &cur_gene.strand()); + } + if enough_reads { + num_tested_genes += 1; + } else { + gene_metrics.not_enough_reads += 1; + } + } + + // FIXME(review): `result` is never constructed anywhere in this function — this cannot compile. + // Build a DerivedStrandednessResult from the tallied counts/metrics before returning. + anyhow::Ok(result) +} diff --git a/src/main.rs b/src/main.rs index b2312e1..6f5ced0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -101,6 +101,9 @@ fn main() -> anyhow::Result<()> { derive::command::DeriveSubcommand::Readlen(args) => {
derive::command::readlen::derive(args)? } + derive::command::DeriveSubcommand::Strandedness(args) => { + derive::command::strandedness::derive(args)? + } derive::command::DeriveSubcommand::JunctionAnnotation(args) => { derive::command::junction_annotation::derive(args)? } diff --git a/src/utils.rs b/src/utils.rs index 9a33a4e..8f0207c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -8,3 +8,4 @@ pub mod formats; pub mod genome; pub mod histogram; pub mod pathbuf; +pub mod read_groups; diff --git a/src/utils/read_groups.rs b/src/utils/read_groups.rs new file mode 100644 index 0000000..bbbdee9 --- /dev/null +++ b/src/utils/read_groups.rs @@ -0,0 +1,54 @@ +use noodles::sam::header; +use std::collections::HashSet; +use std::sync::Arc; +use tracing::warn; + +use lazy_static::lazy_static; + +// Strings used to index into the HashMaps used to store the Read Group ordering flags. +// Lazy statics are used to save memory. +lazy_static! { + /// String used to index into the HashMaps used to store the "overall" ordering flags. + pub static ref OVERALL: Arc<String> = Arc::new(String::from("overall")); + + /// String used to index into the HashMaps used to store the "unknown_read_group" ordering flags. + pub static ref UNKNOWN_READ_GROUP: Arc<String> = Arc::new(String::from("unknown_read_group")); +} + +/// Compares the read group tags found in the records +/// and the read groups found in the header. +/// Returns a vector of read group names that were found in the header +/// but not in the records.
+pub fn validate_read_group_info( + found_rgs: &HashSet<Arc<String>>, + header: &header::Header, +) -> Vec<String> { + let mut rgs_in_header_not_records = Vec::new(); + let mut rgs_in_records_not_header = Vec::new(); + + for (rg_id, _) in header.read_groups() { + if !found_rgs.contains(rg_id) { + rgs_in_header_not_records.push(rg_id.to_string()); + } + } + if !rgs_in_header_not_records.is_empty() { + warn!( + "The following read groups were not found in the file: {:?}", + rgs_in_header_not_records + ); + } + + for rg_id in found_rgs { + if !header.read_groups().contains_key(rg_id.as_str()) { + rgs_in_records_not_header.push(rg_id.to_string()); + } + } + if !rgs_in_records_not_header.is_empty() { + warn!( + "The following read groups were not found in the header: {:?}", + rgs_in_records_not_header + ); + } + + rgs_in_header_not_records +}